Index: head/sys/arm/allwinner/aw_wdog.c
===================================================================
--- head/sys/arm/allwinner/aw_wdog.c (revision 327172)
+++ head/sys/arm/allwinner/aw_wdog.c (revision 327173)
@@ -1,275 +1,272 @@
/*-
* Copyright (c) 2013 Oleksandr Tymoshenko <gonzo@freebsd.org>
* Copyright (c) 2016 Emmanuel Vadot <manu@bidouilliste.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/watchdog.h>
#include <sys/reboot.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/bus.h>
#include <machine/machdep.h>
#include <arm/allwinner/aw_wdog.h>
#define READ(_sc, _r) bus_read_4((_sc)->res, (_r))
#define WRITE(_sc, _r, _v) bus_write_4((_sc)->res, (_r), (_v))
#define A10_WDOG_CTRL 0x00
#define A31_WDOG_CTRL 0x10
#define WDOG_CTRL_RESTART (1 << 0)
#define A31_WDOG_CTRL_KEY (0xa57 << 1)
#define A10_WDOG_MODE 0x04
#define A31_WDOG_MODE 0x18
#define A10_WDOG_MODE_INTVL_SHIFT 3
#define A31_WDOG_MODE_INTVL_SHIFT 4
#define A10_WDOG_MODE_RST_EN (1 << 1)
#define WDOG_MODE_EN (1 << 0)
#define A31_WDOG_CONFIG 0x14
#define A31_WDOG_CONFIG_RST_EN_SYSTEM (1 << 0)
#define A31_WDOG_CONFIG_RST_EN_INT (2 << 0)
struct aw_wdog_interval {
uint64_t milliseconds;
unsigned int value;
};
struct aw_wdog_interval wd_intervals[] = {
{ 500, 0 },
{ 1000, 1 },
{ 2000, 2 },
{ 3000, 3 },
{ 4000, 4 },
{ 5000, 5 },
{ 6000, 6 },
{ 8000, 7 },
{ 10000, 8 },
{ 12000, 9 },
{ 14000, 10 },
{ 16000, 11 },
{ 0, 0 } /* sentinel */
};
static struct aw_wdog_softc *aw_wdog_sc = NULL;
struct aw_wdog_softc {
device_t dev;
struct resource * res;
struct mtx mtx;
uint8_t wdog_ctrl;
uint32_t wdog_ctrl_key;
uint8_t wdog_mode;
uint8_t wdog_mode_intvl_shift;
uint8_t wdog_mode_en;
uint8_t wdog_config;
uint8_t wdog_config_value;
};
#define A10_WATCHDOG 1
#define A31_WATCHDOG 2
static struct ofw_compat_data compat_data[] = {
{"allwinner,sun4i-a10-wdt", A10_WATCHDOG},
{"allwinner,sun6i-a31-wdt", A31_WATCHDOG},
{NULL, 0}
};
static void aw_wdog_watchdog_fn(void *, u_int, int *);
static void aw_wdog_shutdown_fn(void *, int);
static int
aw_wdog_probe(device_t dev)
{
- struct aw_wdog_softc *sc;
-
- sc = device_get_softc(dev);
if (!ofw_bus_status_okay(dev))
return (ENXIO);
switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
case A10_WATCHDOG:
device_set_desc(dev, "Allwinner A10 Watchdog");
return (BUS_PROBE_DEFAULT);
case A31_WATCHDOG:
device_set_desc(dev, "Allwinner A31 Watchdog");
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
aw_wdog_attach(device_t dev)
{
struct aw_wdog_softc *sc;
int rid;
if (aw_wdog_sc != NULL)
return (ENXIO);
sc = device_get_softc(dev);
sc->dev = dev;
rid = 0;
sc->res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (sc->res == NULL) {
device_printf(dev, "could not allocate memory resource\n");
return (ENXIO);
}
aw_wdog_sc = sc;
switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
case A10_WATCHDOG:
sc->wdog_ctrl = A10_WDOG_CTRL;
sc->wdog_mode = A10_WDOG_MODE;
sc->wdog_mode_intvl_shift = A10_WDOG_MODE_INTVL_SHIFT;
sc->wdog_mode_en = A10_WDOG_MODE_RST_EN | WDOG_MODE_EN;
break;
case A31_WATCHDOG:
sc->wdog_ctrl = A31_WDOG_CTRL;
sc->wdog_ctrl_key = A31_WDOG_CTRL_KEY;
sc->wdog_mode = A31_WDOG_MODE;
sc->wdog_mode_intvl_shift = A31_WDOG_MODE_INTVL_SHIFT;
sc->wdog_mode_en = WDOG_MODE_EN;
sc->wdog_config = A31_WDOG_CONFIG;
sc->wdog_config_value = A31_WDOG_CONFIG_RST_EN_SYSTEM;
break;
default:
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
return (ENXIO);
}
mtx_init(&sc->mtx, "AW Watchdog", "aw_wdog", MTX_DEF);
EVENTHANDLER_REGISTER(watchdog_list, aw_wdog_watchdog_fn, sc, 0);
EVENTHANDLER_REGISTER(shutdown_final, aw_wdog_shutdown_fn, sc,
SHUTDOWN_PRI_LAST - 1);
return (0);
}
static void
aw_wdog_watchdog_fn(void *private, u_int cmd, int *error)
{
struct aw_wdog_softc *sc;
uint64_t ms;
int i;
sc = private;
mtx_lock(&sc->mtx);
cmd &= WD_INTERVAL;
if (cmd > 0) {
ms = ((uint64_t)1 << (cmd & WD_INTERVAL)) / 1000000;
i = 0;
while (wd_intervals[i].milliseconds &&
(ms > wd_intervals[i].milliseconds))
i++;
if (wd_intervals[i].milliseconds) {
WRITE(sc, sc->wdog_mode,
(wd_intervals[i].value << sc->wdog_mode_intvl_shift) |
sc->wdog_mode_en);
WRITE(sc, sc->wdog_ctrl,
WDOG_CTRL_RESTART | sc->wdog_ctrl_key);
if (sc->wdog_config)
WRITE(sc, sc->wdog_config,
sc->wdog_config_value);
*error = 0;
}
else {
/*
* Can't arm
* disable watchdog as watchdog(9) requires
*/
device_printf(sc->dev,
"Can't arm, timeout is more than 16 sec\n");
mtx_unlock(&sc->mtx);
WRITE(sc, sc->wdog_mode, 0);
return;
}
}
else
WRITE(sc, sc->wdog_mode, 0);
mtx_unlock(&sc->mtx);
}
static void
aw_wdog_shutdown_fn(void *private, int howto)
{
if ((howto & (RB_POWEROFF|RB_HALT)) == 0)
aw_wdog_watchdog_reset();
}
void
aw_wdog_watchdog_reset(void)
{
if (aw_wdog_sc == NULL) {
printf("Reset: watchdog device has not been initialized\n");
return;
}
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_mode,
(wd_intervals[0].value << aw_wdog_sc->wdog_mode_intvl_shift) |
aw_wdog_sc->wdog_mode_en);
if (aw_wdog_sc->wdog_config)
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_config,
aw_wdog_sc->wdog_config_value);
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_ctrl,
WDOG_CTRL_RESTART | aw_wdog_sc->wdog_ctrl_key);
while(1)
;
}
static device_method_t aw_wdog_methods[] = {
DEVMETHOD(device_probe, aw_wdog_probe),
DEVMETHOD(device_attach, aw_wdog_attach),
DEVMETHOD_END
};
static driver_t aw_wdog_driver = {
"aw_wdog",
aw_wdog_methods,
sizeof(struct aw_wdog_softc),
};
static devclass_t aw_wdog_devclass;
DRIVER_MODULE(aw_wdog, simplebus, aw_wdog_driver, aw_wdog_devclass, 0, 0);
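
For context on the driver above: aw_wdog_watchdog_fn() maps the watchdog(9) command onto one of the hardware interval codes in wd_intervals[] — the low bits of cmd give the timeout as a power of two in nanoseconds, which the driver converts to milliseconds and rounds up to the next table entry. A minimal standalone sketch of that lookup follows (illustration only, not driver code; WD_TO_4SEC == 32 is assumed from sys/watchdog.h):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the driver's interval table: timeout in ms -> register code. */
static const struct { uint64_t ms; unsigned val; } wd_intervals[] = {
	{ 500, 0 }, { 1000, 1 }, { 2000, 2 }, { 3000, 3 }, { 4000, 4 },
	{ 5000, 5 }, { 6000, 6 }, { 8000, 7 }, { 10000, 8 }, { 12000, 9 },
	{ 14000, 10 }, { 16000, 11 }, { 0, 0 } /* sentinel */
};

int
main(void)
{
	unsigned cmd = 32;	/* WD_TO_4SEC: 2^32 ns, per watchdog(9) */
	uint64_t ms = ((uint64_t)1 << cmd) / 1000000;	/* ~4295 ms */
	int i = 0;

	/* Find the first interval at least as long as the request. */
	while (wd_intervals[i].ms && ms > wd_intervals[i].ms)
		i++;
	if (wd_intervals[i].ms)
		printf("%llu ms -> code %u (%llu ms)\n",
		    (unsigned long long)ms, wd_intervals[i].val,
		    (unsigned long long)wd_intervals[i].ms);
	else
		printf("timeout over 16 s: watchdog left disabled\n");
	return (0);
}

Anything beyond the 16-second table maximum leaves the watchdog disabled, matching the "Can't arm" branch in the driver.
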
Index: head/sys/arm/allwinner/axp81x.c
===================================================================
--- head/sys/arm/allwinner/axp81x.c (revision 327172)
+++ head/sys/arm/allwinner/axp81x.c (revision 327173)
@@ -1,785 +1,783 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* X-Powers AXP813/818 PMU for Allwinner SoCs
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/reboot.h>
#include <sys/gpio.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/iicbus/iicbus.h>
#include <dev/iicbus/iiconf.h>
#include <dev/gpio/gpiobusvar.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/extres/regulator/regulator.h>
#include "gpio_if.h"
#include "iicbus_if.h"
#include "regdev_if.h"
MALLOC_DEFINE(M_AXP81X_REG, "AXP81x regulator", "AXP81x power regulator");
#define AXP_ICTYPE 0x03
#define AXP_POWERCTL1 0x10
#define AXP_POWERCTL1_DCDC2 (1 << 1)
#define AXP_POWERCTL2 0x12
#define AXP_POWERCTL2_DC1SW (1 << 7)
#define AXP_VOLTCTL_DCDC2 0x21
#define AXP_VOLTCTL_STATUS (1 << 7)
#define AXP_VOLTCTL_MASK 0x7f
#define AXP_POWERBAT 0x32
#define AXP_POWERBAT_SHUTDOWN (1 << 7)
#define AXP_IRQEN1 0x40
#define AXP_IRQEN2 0x41
#define AXP_IRQEN3 0x42
#define AXP_IRQEN4 0x43
#define AXP_IRQEN5 0x44
#define AXP_IRQEN5_POKSIRQ (1 << 4)
#define AXP_IRQEN6 0x45
#define AXP_IRQSTAT5 0x4c
#define AXP_IRQSTAT5_POKSIRQ (1 << 4)
#define AXP_GPIO0_CTRL 0x90
#define AXP_GPIO1_CTRL 0x92
#define AXP_GPIO_FUNC (0x7 << 0)
#define AXP_GPIO_FUNC_SHIFT 0
#define AXP_GPIO_FUNC_DRVLO 0
#define AXP_GPIO_FUNC_DRVHI 1
#define AXP_GPIO_FUNC_INPUT 2
#define AXP_GPIO_SIGBIT 0x94
#define AXP_GPIO_PD 0x97
static const struct {
const char *name;
uint8_t ctrl_reg;
} axp81x_pins[] = {
{ "GPIO0", AXP_GPIO0_CTRL },
{ "GPIO1", AXP_GPIO1_CTRL },
};
static struct ofw_compat_data compat_data[] = {
{ "x-powers,axp813", 1 },
{ "x-powers,axp818", 1 },
{ NULL, 0 }
};
static struct resource_spec axp81x_spec[] = {
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ -1, 0 }
};
struct axp81x_regdef {
intptr_t id;
char *name;
char *supply_name;
uint8_t enable_reg;
uint8_t enable_mask;
uint8_t voltage_reg;
int voltage_min;
int voltage_max;
int voltage_step1;
int voltage_nstep1;
int voltage_step2;
int voltage_nstep2;
};
enum axp81x_reg_id {
AXP81X_REG_ID_DC1SW,
AXP81X_REG_ID_DCDC2,
};
static struct axp81x_regdef axp81x_regdefs[] = {
{
.id = AXP81X_REG_ID_DC1SW,
.name = "dc1sw",
.enable_reg = AXP_POWERCTL2,
.enable_mask = AXP_POWERCTL2_DC1SW,
},
{
.id = AXP81X_REG_ID_DCDC2,
.name = "dcdc2",
.enable_reg = AXP_POWERCTL1,
.enable_mask = AXP_POWERCTL1_DCDC2,
.voltage_reg = AXP_VOLTCTL_DCDC2,
.voltage_min = 500,
.voltage_max = 1300,
.voltage_step1 = 10,
.voltage_nstep1 = 70,
.voltage_step2 = 20,
.voltage_nstep2 = 5,
},
};
struct axp81x_softc;
struct axp81x_reg_sc {
struct regnode *regnode;
device_t base_dev;
struct axp81x_regdef *def;
phandle_t xref;
struct regnode_std_param *param;
};
struct axp81x_softc {
struct resource *res;
uint16_t addr;
void *ih;
device_t gpiodev;
struct mtx mtx;
int busy;
/* Regulators */
struct axp81x_reg_sc **regs;
int nregs;
};
#define AXP_LOCK(sc) mtx_lock(&(sc)->mtx)
#define AXP_UNLOCK(sc) mtx_unlock(&(sc)->mtx)
static int
axp81x_read(device_t dev, uint8_t reg, uint8_t *data, uint8_t size)
{
struct axp81x_softc *sc;
struct iic_msg msg[2];
sc = device_get_softc(dev);
msg[0].slave = sc->addr;
msg[0].flags = IIC_M_WR;
msg[0].len = 1;
msg[0].buf = &reg;
msg[1].slave = sc->addr;
msg[1].flags = IIC_M_RD;
msg[1].len = size;
msg[1].buf = data;
return (iicbus_transfer(dev, msg, 2));
}
static int
axp81x_write(device_t dev, uint8_t reg, uint8_t val)
{
struct axp81x_softc *sc;
struct iic_msg msg[2];
sc = device_get_softc(dev);
msg[0].slave = sc->addr;
msg[0].flags = IIC_M_WR;
msg[0].len = 1;
msg[0].buf = &reg;
msg[1].slave = sc->addr;
msg[1].flags = IIC_M_WR;
msg[1].len = 1;
msg[1].buf = &val;
return (iicbus_transfer(dev, msg, 2));
}
static int
axp81x_regnode_init(struct regnode *regnode)
{
return (0);
}
static int
axp81x_regnode_enable(struct regnode *regnode, bool enable, int *udelay)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
axp81x_read(sc->base_dev, sc->def->enable_reg, &val, 1);
if (enable)
val |= sc->def->enable_mask;
else
val &= ~sc->def->enable_mask;
axp81x_write(sc->base_dev, sc->def->enable_reg, val);
*udelay = 0;
return (0);
}
static void
axp81x_regnode_reg_to_voltage(struct axp81x_reg_sc *sc, uint8_t val, int *uv)
{
if (val < sc->def->voltage_nstep1)
*uv = sc->def->voltage_min + val * sc->def->voltage_step1;
else
*uv = sc->def->voltage_min +
(sc->def->voltage_nstep1 * sc->def->voltage_step1) +
((val - sc->def->voltage_nstep1) * sc->def->voltage_step2);
*uv *= 1000;
}
static int
axp81x_regnode_voltage_to_reg(struct axp81x_reg_sc *sc, int min_uvolt,
int max_uvolt, uint8_t *val)
{
uint8_t nval;
int nstep, uvolt;
nval = 0;
uvolt = sc->def->voltage_min * 1000;
for (nstep = 0; nstep < sc->def->voltage_nstep1 && uvolt < min_uvolt;
nstep++) {
++nval;
uvolt += (sc->def->voltage_step1 * 1000);
}
for (nstep = 0; nstep < sc->def->voltage_nstep2 && uvolt < min_uvolt;
nstep++) {
++nval;
uvolt += (sc->def->voltage_step2 * 1000);
}
if (uvolt > max_uvolt)
return (EINVAL);
*val = nval;
return (0);
}
static int
axp81x_regnode_set_voltage(struct regnode *regnode, int min_uvolt,
int max_uvolt, int *udelay)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
if (!sc->def->voltage_step1 || !sc->def->voltage_step2)
return (ENXIO);
if (axp81x_regnode_voltage_to_reg(sc, min_uvolt, max_uvolt, &val) != 0)
return (ERANGE);
axp81x_write(sc->base_dev, sc->def->voltage_reg, val);
*udelay = 0;
return (0);
}
static int
axp81x_regnode_get_voltage(struct regnode *regnode, int *uvolt)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
if (!sc->def->voltage_step1 || !sc->def->voltage_step2)
return (ENXIO);
axp81x_read(sc->base_dev, sc->def->voltage_reg, &val, 1);
axp81x_regnode_reg_to_voltage(sc, val & AXP_VOLTCTL_MASK, uvolt);
return (0);
}
static regnode_method_t axp81x_regnode_methods[] = {
/* Regulator interface */
REGNODEMETHOD(regnode_init, axp81x_regnode_init),
REGNODEMETHOD(regnode_enable, axp81x_regnode_enable),
REGNODEMETHOD(regnode_set_voltage, axp81x_regnode_set_voltage),
REGNODEMETHOD(regnode_get_voltage, axp81x_regnode_get_voltage),
REGNODEMETHOD_END
};
DEFINE_CLASS_1(axp81x_regnode, axp81x_regnode_class, axp81x_regnode_methods,
sizeof(struct axp81x_reg_sc), regnode_class);
static void
axp81x_shutdown(void *devp, int howto)
{
device_t dev;
if ((howto & RB_POWEROFF) == 0)
return;
dev = devp;
if (bootverbose)
device_printf(dev, "Shutdown AXP81x\n");
axp81x_write(dev, AXP_POWERBAT, AXP_POWERBAT_SHUTDOWN);
}
static void
axp81x_intr(void *arg)
{
- struct axp81x_softc *sc;
device_t dev;
uint8_t val;
int error;
dev = arg;
- sc = device_get_softc(dev);
error = axp81x_read(dev, AXP_IRQSTAT5, &val, 1);
if (error != 0)
return;
if (val != 0) {
if ((val & AXP_IRQSTAT5_POKSIRQ) != 0) {
if (bootverbose)
device_printf(dev, "Power button pressed\n");
shutdown_nice(RB_POWEROFF);
}
/* Acknowledge */
axp81x_write(dev, AXP_IRQSTAT5, val);
}
}
static device_t
axp81x_gpio_get_bus(device_t dev)
{
struct axp81x_softc *sc;
sc = device_get_softc(dev);
return (sc->gpiodev);
}
static int
axp81x_gpio_pin_max(device_t dev, int *maxpin)
{
*maxpin = nitems(axp81x_pins) - 1;
return (0);
}
static int
axp81x_gpio_pin_getname(device_t dev, uint32_t pin, char *name)
{
if (pin >= nitems(axp81x_pins))
return (EINVAL);
snprintf(name, GPIOMAXNAME, "%s", axp81x_pins[pin].name);
return (0);
}
static int
axp81x_gpio_pin_getcaps(device_t dev, uint32_t pin, uint32_t *caps)
{
if (pin >= nitems(axp81x_pins))
return (EINVAL);
*caps = GPIO_PIN_INPUT | GPIO_PIN_OUTPUT;
return (0);
}
static int
axp81x_gpio_pin_getflags(device_t dev, uint32_t pin, uint32_t *flags)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
if (func == AXP_GPIO_FUNC_INPUT)
*flags = GPIO_PIN_INPUT;
else if (func == AXP_GPIO_FUNC_DRVLO ||
func == AXP_GPIO_FUNC_DRVHI)
*flags = GPIO_PIN_OUTPUT;
else
*flags = 0;
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_setflags(device_t dev, uint32_t pin, uint32_t flags)
{
struct axp81x_softc *sc;
uint8_t data;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
data &= ~AXP_GPIO_FUNC;
if ((flags & (GPIO_PIN_INPUT|GPIO_PIN_OUTPUT)) != 0) {
if ((flags & GPIO_PIN_OUTPUT) == 0)
data |= AXP_GPIO_FUNC_INPUT;
}
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_get(device_t dev, uint32_t pin, unsigned int *val)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
*val = 0;
break;
case AXP_GPIO_FUNC_DRVHI:
*val = 1;
break;
case AXP_GPIO_FUNC_INPUT:
error = axp81x_read(dev, AXP_GPIO_SIGBIT, &data, 1);
if (error == 0)
*val = (data & (1 << pin)) ? 1 : 0;
break;
default:
error = EIO;
break;
}
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_set(device_t dev, uint32_t pin, unsigned int val)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
case AXP_GPIO_FUNC_DRVHI:
data &= ~AXP_GPIO_FUNC;
data |= (val << AXP_GPIO_FUNC_SHIFT);
break;
default:
error = EIO;
break;
}
}
if (error == 0)
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_toggle(device_t dev, uint32_t pin)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
data &= ~AXP_GPIO_FUNC;
data |= (AXP_GPIO_FUNC_DRVHI << AXP_GPIO_FUNC_SHIFT);
break;
case AXP_GPIO_FUNC_DRVHI:
data &= ~AXP_GPIO_FUNC;
data |= (AXP_GPIO_FUNC_DRVLO << AXP_GPIO_FUNC_SHIFT);
break;
default:
error = EIO;
break;
}
}
if (error == 0)
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_map_gpios(device_t bus, phandle_t dev, phandle_t gparent,
int gcells, pcell_t *gpios, uint32_t *pin, uint32_t *flags)
{
if (gpios[0] >= nitems(axp81x_pins))
return (EINVAL);
*pin = gpios[0];
*flags = gpios[1];
return (0);
}
static phandle_t
axp81x_get_node(device_t dev, device_t bus)
{
return (ofw_bus_get_node(dev));
}
static struct axp81x_reg_sc *
axp81x_reg_attach(device_t dev, phandle_t node,
struct axp81x_regdef *def)
{
struct axp81x_reg_sc *reg_sc;
struct regnode_init_def initdef;
struct regnode *regnode;
memset(&initdef, 0, sizeof(initdef));
regulator_parse_ofw_stdparam(dev, node, &initdef);
if (initdef.std_param.min_uvolt == 0)
initdef.std_param.min_uvolt = def->voltage_min * 1000;
if (initdef.std_param.max_uvolt == 0)
initdef.std_param.max_uvolt = def->voltage_max * 1000;
initdef.id = def->id;
initdef.ofw_node = node;
regnode = regnode_create(dev, &axp81x_regnode_class, &initdef);
if (regnode == NULL) {
device_printf(dev, "cannot create regulator\n");
return (NULL);
}
reg_sc = regnode_get_softc(regnode);
reg_sc->regnode = regnode;
reg_sc->base_dev = dev;
reg_sc->def = def;
reg_sc->xref = OF_xref_from_node(node);
reg_sc->param = regnode_get_stdparam(regnode);
regnode_register(regnode);
return (reg_sc);
}
static int
axp81x_regdev_map(device_t dev, phandle_t xref, int ncells, pcell_t *cells,
intptr_t *num)
{
struct axp81x_softc *sc;
int i;
sc = device_get_softc(dev);
for (i = 0; i < sc->nregs; i++) {
if (sc->regs[i] == NULL)
continue;
if (sc->regs[i]->xref == xref) {
*num = sc->regs[i]->def->id;
return (0);
}
}
return (ENXIO);
}
static int
axp81x_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "X-Powers AXP81x Power Management Unit");
return (BUS_PROBE_DEFAULT);
}
static int
axp81x_attach(device_t dev)
{
struct axp81x_softc *sc;
struct axp81x_reg_sc *reg;
uint8_t chip_id;
phandle_t rnode, child;
int error, i;
sc = device_get_softc(dev);
sc->addr = iicbus_get_addr(dev);
mtx_init(&sc->mtx, device_get_nameunit(dev), NULL, MTX_DEF);
error = bus_alloc_resources(dev, axp81x_spec, &sc->res);
if (error != 0) {
device_printf(dev, "cannot allocate resources for device\n");
return (error);
}
if (bootverbose) {
axp81x_read(dev, AXP_ICTYPE, &chip_id, 1);
device_printf(dev, "chip ID 0x%02x\n", chip_id);
}
sc->nregs = nitems(axp81x_regdefs);
sc->regs = malloc(sizeof(struct axp81x_reg_sc *) * sc->nregs,
M_AXP81X_REG, M_WAITOK | M_ZERO);
/* Attach known regulators that exist in the DT */
rnode = ofw_bus_find_child(ofw_bus_get_node(dev), "regulators");
if (rnode > 0) {
for (i = 0; i < sc->nregs; i++) {
child = ofw_bus_find_child(rnode,
axp81x_regdefs[i].name);
if (child == 0)
continue;
reg = axp81x_reg_attach(dev, child, &axp81x_regdefs[i]);
if (reg == NULL) {
device_printf(dev,
"cannot attach regulator %s\n",
axp81x_regdefs[i].name);
return (ENXIO);
}
sc->regs[i] = reg;
}
}
/* Enable IRQ on short power key press */
axp81x_write(dev, AXP_IRQEN1, 0);
axp81x_write(dev, AXP_IRQEN2, 0);
axp81x_write(dev, AXP_IRQEN3, 0);
axp81x_write(dev, AXP_IRQEN4, 0);
axp81x_write(dev, AXP_IRQEN5, AXP_IRQEN5_POKSIRQ);
axp81x_write(dev, AXP_IRQEN6, 0);
/* Install interrupt handler */
error = bus_setup_intr(dev, sc->res, INTR_TYPE_MISC | INTR_MPSAFE,
NULL, axp81x_intr, dev, &sc->ih);
if (error != 0) {
device_printf(dev, "cannot setup interrupt handler\n");
return (error);
}
EVENTHANDLER_REGISTER(shutdown_final, axp81x_shutdown, dev,
SHUTDOWN_PRI_LAST);
sc->gpiodev = gpiobus_attach_bus(dev);
return (0);
}
static device_method_t axp81x_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, axp81x_probe),
DEVMETHOD(device_attach, axp81x_attach),
/* GPIO interface */
DEVMETHOD(gpio_get_bus, axp81x_gpio_get_bus),
DEVMETHOD(gpio_pin_max, axp81x_gpio_pin_max),
DEVMETHOD(gpio_pin_getname, axp81x_gpio_pin_getname),
DEVMETHOD(gpio_pin_getcaps, axp81x_gpio_pin_getcaps),
DEVMETHOD(gpio_pin_getflags, axp81x_gpio_pin_getflags),
DEVMETHOD(gpio_pin_setflags, axp81x_gpio_pin_setflags),
DEVMETHOD(gpio_pin_get, axp81x_gpio_pin_get),
DEVMETHOD(gpio_pin_set, axp81x_gpio_pin_set),
DEVMETHOD(gpio_pin_toggle, axp81x_gpio_pin_toggle),
DEVMETHOD(gpio_map_gpios, axp81x_gpio_map_gpios),
/* Regdev interface */
DEVMETHOD(regdev_map, axp81x_regdev_map),
/* OFW bus interface */
DEVMETHOD(ofw_bus_get_node, axp81x_get_node),
DEVMETHOD_END
};
static driver_t axp81x_driver = {
"axp81x_pmu",
axp81x_methods,
sizeof(struct axp81x_softc),
};
static devclass_t axp81x_devclass;
extern devclass_t ofwgpiobus_devclass, gpioc_devclass;
extern driver_t ofw_gpiobus_driver, gpioc_driver;
EARLY_DRIVER_MODULE(axp81x, iicbus, axp81x_driver, axp81x_devclass, 0, 0,
BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LAST);
EARLY_DRIVER_MODULE(ofw_gpiobus, axp81x_pmu, ofw_gpiobus_driver,
ofwgpiobus_devclass, 0, 0, BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LAST);
DRIVER_MODULE(gpioc, axp81x_pmu, gpioc_driver, gpioc_devclass, 0, 0);
MODULE_VERSION(axp81x, 1);
MODULE_DEPEND(axp81x, iicbus, 1, 1, 1);
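
For context on the regulator definitions above: dcdc2 uses a split-step voltage encoding — a 500 mV base, 70 steps of 10 mV, then 5 steps of 20 mV up to the 1300 mV maximum — which is what axp81x_regnode_reg_to_voltage() decodes. A standalone sketch of that decode follows (illustration only, not driver code):

#include <stdio.h>

/* DCDC2 parameters from axp81x_regdefs[]; values in mV and step counts. */
#define DCDC2_MIN	500
#define DCDC2_STEP1	10
#define DCDC2_NSTEP1	70
#define DCDC2_STEP2	20

/* Decode the low 7 bits of AXP_VOLTCTL_DCDC2 to microvolts. */
static int
dcdc2_reg_to_uvolt(unsigned val)
{
	int mv;

	if (val < DCDC2_NSTEP1)
		mv = DCDC2_MIN + val * DCDC2_STEP1;
	else
		mv = DCDC2_MIN + DCDC2_NSTEP1 * DCDC2_STEP1 +
		    (val - DCDC2_NSTEP1) * DCDC2_STEP2;
	return (mv * 1000);
}

int
main(void)
{
	/* 0 -> 500000 uV, 70 -> 1200000 uV, 75 -> 1300000 uV (maximum). */
	printf("%d %d %d\n", dcdc2_reg_to_uvolt(0), dcdc2_reg_to_uvolt(70),
	    dcdc2_reg_to_uvolt(75));
	return (0);
}
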
Index: head/sys/arm/allwinner/clk/aw_pll.c
===================================================================
--- head/sys/arm/allwinner/clk/aw_pll.c (revision 327172)
+++ head/sys/arm/allwinner/clk/aw_pll.c (revision 327173)
@@ -1,1349 +1,1347 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Allwinner PLL clock
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/ofw_subr.h>
#include <dev/extres/clk/clk.h>
#include <arm/allwinner/aw_machdep.h>
#include "clkdev_if.h"
#define SUN4I_A10_PLL2_1X 0
#define SUN4I_A10_PLL2_2X 1
#define SUN4I_A10_PLL2_4X 2
#define SUN4I_A10_PLL2_8X 3
#define AW_PLL_ENABLE (1 << 31)
#define A10_PLL1_OUT_EXT_DIVP (0x3 << 16)
#define A10_PLL1_OUT_EXT_DIVP_SHIFT 16
#define A10_PLL1_FACTOR_N (0x1f << 8)
#define A10_PLL1_FACTOR_N_SHIFT 8
#define A10_PLL1_FACTOR_K (0x3 << 4)
#define A10_PLL1_FACTOR_K_SHIFT 4
#define A10_PLL1_FACTOR_M (0x3 << 0)
#define A10_PLL1_FACTOR_M_SHIFT 0
#define A10_PLL2_POST_DIV (0xf << 26)
#define A10_PLL2_POST_DIV_SHIFT 26
#define A10_PLL2_FACTOR_N (0x7f << 8)
#define A10_PLL2_FACTOR_N_SHIFT 8
#define A10_PLL2_PRE_DIV (0x1f << 0)
#define A10_PLL2_PRE_DIV_SHIFT 0
#define A10_PLL3_MODE_SEL (0x1 << 15)
#define A10_PLL3_MODE_SEL_FRACT (0 << 15)
#define A10_PLL3_MODE_SEL_INT (1 << 15)
#define A10_PLL3_FUNC_SET (0x1 << 14)
#define A10_PLL3_FUNC_SET_270MHZ (0 << 14)
#define A10_PLL3_FUNC_SET_297MHZ (1 << 14)
#define A10_PLL3_FACTOR_M (0x7f << 0)
#define A10_PLL3_FACTOR_M_SHIFT 0
#define A10_PLL3_REF_FREQ 3000000
#define A10_PLL5_OUT_EXT_DIVP (0x3 << 16)
#define A10_PLL5_OUT_EXT_DIVP_SHIFT 16
#define A10_PLL5_FACTOR_N (0x1f << 8)
#define A10_PLL5_FACTOR_N_SHIFT 8
#define A10_PLL5_FACTOR_K (0x3 << 4)
#define A10_PLL5_FACTOR_K_SHIFT 4
#define A10_PLL5_FACTOR_M1 (0x3 << 2)
#define A10_PLL5_FACTOR_M1_SHIFT 2
#define A10_PLL5_FACTOR_M (0x3 << 0)
#define A10_PLL5_FACTOR_M_SHIFT 0
#define A10_PLL6_BYPASS_EN (1 << 30)
#define A10_PLL6_SATA_CLK_EN (1 << 14)
#define A10_PLL6_FACTOR_N (0x1f << 8)
#define A10_PLL6_FACTOR_N_SHIFT 8
#define A10_PLL6_FACTOR_K (0x3 << 4)
#define A10_PLL6_FACTOR_K_SHIFT 4
#define A10_PLL6_FACTOR_M (0x3 << 0)
#define A10_PLL6_FACTOR_M_SHIFT 0
#define A10_PLL2_POST_DIV (0xf << 26)
#define A13_PLL2_POST_DIV (0xf << 26)
#define A13_PLL2_POST_DIV_SHIFT 26
#define A13_PLL2_FACTOR_N (0x7f << 8)
#define A13_PLL2_FACTOR_N_SHIFT 8
#define A13_PLL2_PRE_DIV (0x1f << 0)
#define A13_PLL2_PRE_DIV_SHIFT 0
#define A23_PLL1_FACTOR_P (0x3 << 16)
#define A23_PLL1_FACTOR_P_SHIFT 16
#define A23_PLL1_FACTOR_N (0x1f << 8)
#define A23_PLL1_FACTOR_N_SHIFT 8
#define A23_PLL1_FACTOR_K (0x3 << 4)
#define A23_PLL1_FACTOR_K_SHIFT 4
#define A23_PLL1_FACTOR_M (0x3 << 0)
#define A23_PLL1_FACTOR_M_SHIFT 0
#define A31_PLL1_LOCK (1 << 28)
#define A31_PLL1_CPU_SIGMA_DELTA_EN (1 << 24)
#define A31_PLL1_FACTOR_N (0x1f << 8)
#define A31_PLL1_FACTOR_N_SHIFT 8
#define A31_PLL1_FACTOR_K (0x3 << 4)
#define A31_PLL1_FACTOR_K_SHIFT 4
#define A31_PLL1_FACTOR_M (0x3 << 0)
#define A31_PLL1_FACTOR_M_SHIFT 0
#define A31_PLL6_LOCK (1 << 28)
#define A31_PLL6_BYPASS_EN (1 << 25)
#define A31_PLL6_CLK_OUT_EN (1 << 24)
#define A31_PLL6_24M_OUT_EN (1 << 18)
#define A31_PLL6_24M_POST_DIV (0x3 << 16)
#define A31_PLL6_24M_POST_DIV_SHIFT 16
#define A31_PLL6_FACTOR_N (0x1f << 8)
#define A31_PLL6_FACTOR_N_SHIFT 8
#define A31_PLL6_FACTOR_K (0x3 << 4)
#define A31_PLL6_FACTOR_K_SHIFT 4
#define A31_PLL6_DEFAULT_N 0x18
#define A31_PLL6_DEFAULT_K 0x1
#define A31_PLL6_TIMEOUT 10
#define A64_PLLHSIC_LOCK (1 << 28)
#define A64_PLLHSIC_FRAC_CLK_OUT (1 << 25)
#define A64_PLLHSIC_PLL_MODE_SEL (1 << 24)
#define A64_PLLHSIC_PLL_SDM_EN (1 << 20)
#define A64_PLLHSIC_FACTOR_N (0x7f << 8)
#define A64_PLLHSIC_FACTOR_N_SHIFT 8
#define A64_PLLHSIC_PRE_DIV_M (0xf << 0)
#define A64_PLLHSIC_PRE_DIV_M_SHIFT 0
#define A80_PLL4_CLK_OUT_EN (1 << 20)
#define A80_PLL4_PLL_DIV2 (1 << 18)
#define A80_PLL4_PLL_DIV1 (1 << 16)
#define A80_PLL4_FACTOR_N (0xff << 8)
#define A80_PLL4_FACTOR_N_SHIFT 8
#define A83T_PLLCPUX_LOCK_TIME (0x7 << 24)
#define A83T_PLLCPUX_LOCK_TIME_SHIFT 24
#define A83T_PLLCPUX_CLOCK_OUTPUT_DIS (1 << 20)
#define A83T_PLLCPUX_OUT_EXT_DIVP (1 << 16)
#define A83T_PLLCPUX_FACTOR_N (0xff << 8)
#define A83T_PLLCPUX_FACTOR_N_SHIFT 8
#define A83T_PLLCPUX_FACTOR_N_MIN 12
#define A83T_PLLCPUX_FACTOR_N_MAX 125
#define A83T_PLLCPUX_POSTDIV_M (0x3 << 0)
#define A83T_PLLCPUX_POSTDIV_M_SHIFT 0
#define H3_PLL2_LOCK (1 << 28)
#define H3_PLL2_SDM_EN (1 << 24)
#define H3_PLL2_POST_DIV (0xf << 16)
#define H3_PLL2_POST_DIV_SHIFT 16
#define H3_PLL2_FACTOR_N (0x7f << 8)
#define H3_PLL2_FACTOR_N_SHIFT 8
#define H3_PLL2_PRE_DIV (0x1f << 0)
#define H3_PLL2_PRE_DIV_SHIFT 0
#define CLKID_A10_PLL5_DDR 0
#define CLKID_A10_PLL5_OTHER 1
#define CLKID_A10_PLL6_SATA 0
#define CLKID_A10_PLL6_OTHER 1
#define CLKID_A10_PLL6 2
#define CLKID_A10_PLL6_DIV_4 3
#define CLKID_A31_PLL6 0
#define CLKID_A31_PLL6_X2 1
struct aw_pll_factor {
unsigned int n;
unsigned int k;
unsigned int m;
unsigned int p;
uint64_t freq;
};
#define PLLFACTOR(_n, _k, _m, _p, _freq) \
{ .n = (_n), .k = (_k), .m = (_m), .p = (_p), .freq = (_freq) }
static struct aw_pll_factor aw_a10_pll1_factors[] = {
PLLFACTOR(6, 0, 0, 0, 144000000),
PLLFACTOR(12, 0, 0, 0, 312000000),
PLLFACTOR(21, 0, 0, 0, 528000000),
PLLFACTOR(29, 0, 0, 0, 720000000),
PLLFACTOR(18, 1, 0, 0, 864000000),
PLLFACTOR(19, 1, 0, 0, 912000000),
PLLFACTOR(20, 1, 0, 0, 960000000),
};
static struct aw_pll_factor aw_a23_pll1_factors[] = {
PLLFACTOR(9, 0, 0, 2, 60000000),
PLLFACTOR(10, 0, 0, 2, 66000000),
PLLFACTOR(11, 0, 0, 2, 72000000),
PLLFACTOR(12, 0, 0, 2, 78000000),
PLLFACTOR(13, 0, 0, 2, 84000000),
PLLFACTOR(14, 0, 0, 2, 90000000),
PLLFACTOR(15, 0, 0, 2, 96000000),
PLLFACTOR(16, 0, 0, 2, 102000000),
PLLFACTOR(17, 0, 0, 2, 108000000),
PLLFACTOR(18, 0, 0, 2, 114000000),
PLLFACTOR(9, 0, 0, 1, 120000000),
PLLFACTOR(10, 0, 0, 1, 132000000),
PLLFACTOR(11, 0, 0, 1, 144000000),
PLLFACTOR(12, 0, 0, 1, 156000000),
PLLFACTOR(13, 0, 0, 1, 168000000),
PLLFACTOR(14, 0, 0, 1, 180000000),
PLLFACTOR(15, 0, 0, 1, 192000000),
PLLFACTOR(16, 0, 0, 1, 204000000),
PLLFACTOR(17, 0, 0, 1, 216000000),
PLLFACTOR(18, 0, 0, 1, 228000000),
PLLFACTOR(9, 0, 0, 0, 240000000),
PLLFACTOR(10, 0, 0, 0, 264000000),
PLLFACTOR(11, 0, 0, 0, 288000000),
PLLFACTOR(12, 0, 0, 0, 312000000),
PLLFACTOR(13, 0, 0, 0, 336000000),
PLLFACTOR(14, 0, 0, 0, 360000000),
PLLFACTOR(15, 0, 0, 0, 384000000),
PLLFACTOR(16, 0, 0, 0, 408000000),
PLLFACTOR(17, 0, 0, 0, 432000000),
PLLFACTOR(18, 0, 0, 0, 456000000),
PLLFACTOR(19, 0, 0, 0, 480000000),
PLLFACTOR(20, 0, 0, 0, 504000000),
PLLFACTOR(21, 0, 0, 0, 528000000),
PLLFACTOR(22, 0, 0, 0, 552000000),
PLLFACTOR(23, 0, 0, 0, 576000000),
PLLFACTOR(24, 0, 0, 0, 600000000),
PLLFACTOR(25, 0, 0, 0, 624000000),
PLLFACTOR(26, 0, 0, 0, 648000000),
PLLFACTOR(27, 0, 0, 0, 672000000),
PLLFACTOR(28, 0, 0, 0, 696000000),
PLLFACTOR(29, 0, 0, 0, 720000000),
PLLFACTOR(15, 1, 0, 0, 768000000),
PLLFACTOR(10, 2, 0, 0, 792000000),
PLLFACTOR(16, 1, 0, 0, 816000000),
PLLFACTOR(17, 1, 0, 0, 864000000),
PLLFACTOR(18, 1, 0, 0, 912000000),
PLLFACTOR(12, 2, 0, 0, 936000000),
PLLFACTOR(19, 1, 0, 0, 960000000),
PLLFACTOR(20, 1, 0, 0, 1008000000),
PLLFACTOR(21, 1, 0, 0, 1056000000),
PLLFACTOR(14, 2, 0, 0, 1080000000),
PLLFACTOR(22, 1, 0, 0, 1104000000),
PLLFACTOR(23, 1, 0, 0, 1152000000),
PLLFACTOR(24, 1, 0, 0, 1200000000),
PLLFACTOR(16, 2, 0, 0, 1224000000),
PLLFACTOR(25, 1, 0, 0, 1248000000),
PLLFACTOR(26, 1, 0, 0, 1296000000),
PLLFACTOR(27, 1, 0, 0, 1344000000),
PLLFACTOR(18, 2, 0, 0, 1368000000),
PLLFACTOR(28, 1, 0, 0, 1392000000),
PLLFACTOR(29, 1, 0, 0, 1440000000),
PLLFACTOR(20, 2, 0, 0, 1512000000),
PLLFACTOR(15, 3, 0, 0, 1536000000),
PLLFACTOR(21, 2, 0, 0, 1584000000),
PLLFACTOR(16, 3, 0, 0, 1632000000),
PLLFACTOR(22, 2, 0, 0, 1656000000),
PLLFACTOR(23, 2, 0, 0, 1728000000),
PLLFACTOR(24, 2, 0, 0, 1800000000),
PLLFACTOR(18, 3, 0, 0, 1824000000),
PLLFACTOR(25, 2, 0, 0, 1872000000),
};
static struct aw_pll_factor aw_h3_pll2_factors[] = {
PLLFACTOR(13, 0, 0, 13, 24576000),
PLLFACTOR(6, 0, 0, 7, 22579200),
};
enum aw_pll_type {
AWPLL_A10_PLL1 = 1,
AWPLL_A10_PLL2,
AWPLL_A10_PLL3,
AWPLL_A10_PLL5,
AWPLL_A10_PLL6,
AWPLL_A13_PLL2,
AWPLL_A23_PLL1,
AWPLL_A31_PLL1,
AWPLL_A31_PLL6,
AWPLL_A64_PLLHSIC,
AWPLL_A80_PLL4,
AWPLL_A83T_PLLCPUX,
AWPLL_H3_PLL1,
AWPLL_H3_PLL2,
};
struct aw_pll_sc {
enum aw_pll_type type;
device_t clkdev;
bus_addr_t reg;
int id;
};
struct aw_pll_funcs {
int (*recalc)(struct aw_pll_sc *, uint64_t *);
int (*set_freq)(struct aw_pll_sc *, uint64_t, uint64_t *, int);
int (*init)(device_t, bus_addr_t, struct clknode_init_def *);
};
#define PLL_READ(sc, val) CLKDEV_READ_4((sc)->clkdev, (sc)->reg, (val))
#define PLL_WRITE(sc, val) CLKDEV_WRITE_4((sc)->clkdev, (sc)->reg, (val))
#define DEVICE_LOCK(sc) CLKDEV_DEVICE_LOCK((sc)->clkdev)
#define DEVICE_UNLOCK(sc) CLKDEV_DEVICE_UNLOCK((sc)->clkdev)
static int
a10_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n;
f = NULL;
for (n = 0; n < nitems(aw_a10_pll1_factors); n++) {
if (aw_a10_pll1_factors[n].freq == *fout) {
f = &aw_a10_pll1_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL1_FACTOR_N|A10_PLL1_FACTOR_K|A10_PLL1_FACTOR_M|
A10_PLL1_OUT_EXT_DIVP);
val |= (f->p << A10_PLL1_OUT_EXT_DIVP_SHIFT);
val |= (f->n << A10_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A10_PLL1_FACTOR_K_SHIFT);
val |= (f->m << A10_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = 1 << ((val & A10_PLL1_OUT_EXT_DIVP) >> A10_PLL1_OUT_EXT_DIVP_SHIFT);
m = ((val & A10_PLL1_FACTOR_M) >> A10_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL1_FACTOR_K) >> A10_PLL1_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL1_FACTOR_N) >> A10_PLL1_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
*freq = (*freq * n * k) / (m * p);
return (0);
}
static int
a10_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, post_div, n, pre_div;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
post_div = (val & A10_PLL2_POST_DIV) >> A10_PLL2_POST_DIV_SHIFT;
if (post_div == 0)
post_div = 1;
n = (val & A10_PLL2_FACTOR_N) >> A10_PLL2_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
pre_div = (val & A10_PLL2_PRE_DIV) >> A10_PLL2_PRE_DIV_SHIFT;
if (pre_div == 0)
pre_div = 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * 2 * n) / pre_div / post_div / 2;
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / pre_div / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / pre_div / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / pre_div;
break;
default:
return (EINVAL);
}
return (0);
}
static int
a10_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, post_div, n, pre_div;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
/*
* Audio Codec needs PLL2-1X to be either 24576000 or 22579200.
*
* PLL2-1X output frequency is (48MHz * n) / pre_div / post_div / 2.
* To get as close as possible to the desired rate, we use a
* pre-divider of 21 and a post-divider of 4. With these values,
* a multiplier of 86 or 79 gets us close to the target rates.
*/
if (*fout != 24576000 && *fout != 22579200)
return (EINVAL);
pre_div = 21;
post_div = 4;
n = (*fout * pre_div * post_div * 2) / (2 * fin);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL2_POST_DIV | A10_PLL2_FACTOR_N | A10_PLL2_PRE_DIV);
val |= (post_div << A10_PLL2_POST_DIV_SHIFT);
val |= (n << A10_PLL2_FACTOR_N_SHIFT);
val |= (pre_div << A10_PLL2_PRE_DIV_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll3_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
if ((val & A10_PLL3_MODE_SEL) == A10_PLL3_MODE_SEL_INT) {
/* In integer mode, output is 3MHz * m */
m = (val & A10_PLL3_FACTOR_M) >> A10_PLL3_FACTOR_M_SHIFT;
*freq = A10_PLL3_REF_FREQ * m;
} else {
/* In fractional mode, output is either 270MHz or 297MHz */
if ((val & A10_PLL3_FUNC_SET) == A10_PLL3_FUNC_SET_270MHZ)
*freq = 270000000;
else
*freq = 297000000;
}
return (0);
}
static int
a10_pll3_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, m, mode, func;
if (*fout == 297000000) {
func = A10_PLL3_FUNC_SET_297MHZ;
mode = A10_PLL3_MODE_SEL_FRACT;
m = 0;
} else if (*fout == 270000000) {
func = A10_PLL3_FUNC_SET_270MHZ;
mode = A10_PLL3_MODE_SEL_FRACT;
m = 0;
} else {
mode = A10_PLL3_MODE_SEL_INT;
func = 0;
m = *fout / A10_PLL3_REF_FREQ;
*fout = m * A10_PLL3_REF_FREQ;
}
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL3_MODE_SEL | A10_PLL3_FUNC_SET | A10_PLL3_FACTOR_M);
val |= mode;
val |= func;
val |= (m << A10_PLL3_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll3_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
/* Allow changing PLL frequency while enabled */
def->flags = CLK_NODE_GLITCH_FREE;
/* Set PLL to 297MHz */
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A10_PLL3_MODE_SEL | A10_PLL3_FUNC_SET | A10_PLL3_FACTOR_M);
val |= A10_PLL3_MODE_SEL_FRACT;
val |= A10_PLL3_FUNC_SET_297MHZ;
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a10_pll5_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = 1 << ((val & A10_PLL5_OUT_EXT_DIVP) >> A10_PLL5_OUT_EXT_DIVP_SHIFT);
m = ((val & A10_PLL5_FACTOR_M) >> A10_PLL5_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL5_FACTOR_K) >> A10_PLL5_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL5_FACTOR_N) >> A10_PLL5_FACTOR_N_SHIFT;
if (n == 0)
return (ENXIO);
switch (sc->id) {
case CLKID_A10_PLL5_DDR:
*freq = (*freq * n * k) / m;
break;
case CLKID_A10_PLL5_OTHER:
*freq = (*freq * n * k) / p;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a10_pll6_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val, m, n, k;
/*
* SATA needs PLL6 to be a 100MHz clock.
*
* The SATA output frequency is (24MHz * n * k) / m / 6.
* To get to 100MHz, k & m must be equal and n must be 25.
*/
m = k = 0;
n = 25;
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A10_PLL6_FACTOR_N | A10_PLL6_FACTOR_K | A10_PLL6_FACTOR_M);
val &= ~A10_PLL6_BYPASS_EN;
val |= A10_PLL6_SATA_CLK_EN;
val |= (n << A10_PLL6_FACTOR_N_SHIFT);
val |= (k << A10_PLL6_FACTOR_K_SHIFT);
val |= (m << A10_PLL6_FACTOR_M_SHIFT);
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a10_pll6_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, k, n;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A10_PLL6_FACTOR_M) >> A10_PLL6_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL6_FACTOR_K) >> A10_PLL6_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL6_FACTOR_N) >> A10_PLL6_FACTOR_N_SHIFT;
if (n == 0)
return (ENXIO);
switch (sc->id) {
case CLKID_A10_PLL6_SATA:
*freq = (*freq * n * k) / m / 6;
break;
case CLKID_A10_PLL6_OTHER:
*freq = (*freq * n * k) / 2;
break;
case CLKID_A10_PLL6:
*freq = (*freq * n * k);
break;
case CLKID_A10_PLL6_DIV_4:
*freq = (*freq * n * k) / 4;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a10_pll6_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
if (sc->id != CLKID_A10_PLL6_SATA)
return (ENXIO);
/* PLL6 SATA output has been set to 100MHz in a10_pll6_init */
if (*fout != 100000000)
return (ERANGE);
return (0);
}
static int
a13_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, post_div, n, pre_div;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
post_div = ((val & A13_PLL2_POST_DIV) >> A13_PLL2_POST_DIV_SHIFT) + 1;
if (post_div == 0)
post_div = 1;
n = (val & A13_PLL2_FACTOR_N) >> A13_PLL2_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
pre_div = ((val & A13_PLL2_PRE_DIV) >> A13_PLL2_PRE_DIV_SHIFT) + 1;
if (pre_div == 0)
pre_div = 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * 2 * n) / pre_div / post_div / 2;
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / pre_div / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / pre_div / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / pre_div;
break;
default:
return (EINVAL);
}
return (0);
}
static int
a13_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, post_div, n, pre_div;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
/*
* Audio Codec needs PLL2-1X to be either 24576000 or 22579200.
*
* PLL2-1X output frequency is (48MHz * n) / pre_div / post_div / 2.
* To get as close as possible to the desired rate, we use a
* pre-divider of 21 and a post-divider of 4. With these values,
* a multiplier of 86 or 79 gets us close to the target rates.
*/
if (*fout != 24576000 && *fout != 22579200)
return (EINVAL);
pre_div = 21;
post_div = 4;
n = (*fout * pre_div * post_div * 2) / (2 * fin);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A13_PLL2_POST_DIV | A13_PLL2_FACTOR_N | A13_PLL2_PRE_DIV);
val |= ((post_div - 1) << A13_PLL2_POST_DIV_SHIFT);
val |= (n << A13_PLL2_FACTOR_N_SHIFT);
val |= ((pre_div - 1) << A13_PLL2_PRE_DIV_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
h3_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, p, n, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = ((val & H3_PLL2_POST_DIV) >> H3_PLL2_POST_DIV_SHIFT) + 1;
n = ((val & H3_PLL2_FACTOR_N) >> H3_PLL2_FACTOR_N_SHIFT) + 1;
m = ((val & H3_PLL2_PRE_DIV) >> H3_PLL2_PRE_DIV_SHIFT) + 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * n) / (m * p);
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / m / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / m / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / m;
break;
default:
return (EINVAL);
}
return (0);
}
static int
h3_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n, error, retry;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
f = NULL;
for (n = 0; n < nitems(aw_h3_pll2_factors); n++) {
if (aw_h3_pll2_factors[n].freq == *fout) {
f = &aw_h3_pll2_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(H3_PLL2_POST_DIV|H3_PLL2_FACTOR_N|H3_PLL2_PRE_DIV);
val |= (f->p << H3_PLL2_POST_DIV_SHIFT);
val |= (f->n << H3_PLL2_FACTOR_N_SHIFT);
val |= (f->m << H3_PLL2_PRE_DIV_SHIFT);
val |= AW_PLL_ENABLE;
PLL_WRITE(sc, val);
/* Wait for lock */
error = 0;
for (retry = 0; retry < 1000; retry++) {
PLL_READ(sc, &val);
if ((val & H3_PLL2_LOCK) != 0)
break;
DELAY(100);
}
if (retry == 0)
error = ETIMEDOUT;
DEVICE_UNLOCK(sc);
return (error);
}
static int
a23_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n;
f = NULL;
for (n = 0; n < nitems(aw_a23_pll1_factors); n++) {
if (aw_a23_pll1_factors[n].freq == *fout) {
f = &aw_a23_pll1_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A23_PLL1_FACTOR_N|A23_PLL1_FACTOR_K|A23_PLL1_FACTOR_M|
A23_PLL1_FACTOR_P);
val |= (f->n << A23_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A23_PLL1_FACTOR_K_SHIFT);
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a23_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A23_PLL1_FACTOR_M) >> A23_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A23_PLL1_FACTOR_K) >> A23_PLL1_FACTOR_K_SHIFT) + 1;
n = ((val & A23_PLL1_FACTOR_N) >> A23_PLL1_FACTOR_N_SHIFT) + 1;
p = ((val & A23_PLL1_FACTOR_P) >> A23_PLL1_FACTOR_P_SHIFT) + 1;
*freq = (*freq * n * k) / (m * p);
return (0);
}
static int
h3_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
- uint32_t val, n, k, m, p;
+ uint32_t val, m, p;
int i;
f = NULL;
for (i = 0; i < nitems(aw_a23_pll1_factors); i++) {
if (aw_a23_pll1_factors[i].freq == *fout) {
f = &aw_a23_pll1_factors[i];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
- n = (val & A23_PLL1_FACTOR_N) >> A23_PLL1_FACTOR_N_SHIFT;
- k = (val & A23_PLL1_FACTOR_K) >> A23_PLL1_FACTOR_K_SHIFT;
m = (val & A23_PLL1_FACTOR_M) >> A23_PLL1_FACTOR_M_SHIFT;
p = (val & A23_PLL1_FACTOR_P) >> A23_PLL1_FACTOR_P_SHIFT;
if (p < f->p) {
val &= ~A23_PLL1_FACTOR_P;
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
if (m < f->m) {
val &= ~A23_PLL1_FACTOR_M;
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
val &= ~(A23_PLL1_FACTOR_N|A23_PLL1_FACTOR_K);
val |= (f->n << A23_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A23_PLL1_FACTOR_K_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
if (m > f->m) {
val &= ~A23_PLL1_FACTOR_M;
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
if (p > f->p) {
val &= ~A23_PLL1_FACTOR_P;
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
DEVICE_UNLOCK(sc);
return (0);
}
static int
a31_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A31_PLL1_FACTOR_M) >> A31_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A31_PLL1_FACTOR_K) >> A31_PLL1_FACTOR_K_SHIFT) + 1;
n = ((val & A31_PLL1_FACTOR_N) >> A31_PLL1_FACTOR_N_SHIFT) + 1;
*freq = (*freq * n * k) / m;
return (0);
}
static int
a31_pll6_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
int retry;
if (def->id != CLKID_A31_PLL6)
return (0);
/*
* The datasheet recommends that PLL6 output should be fixed to
* 600MHz.
*/
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A31_PLL6_FACTOR_N | A31_PLL6_FACTOR_K | A31_PLL6_BYPASS_EN);
val |= (A31_PLL6_DEFAULT_N << A31_PLL6_FACTOR_N_SHIFT);
val |= (A31_PLL6_DEFAULT_K << A31_PLL6_FACTOR_K_SHIFT);
val |= AW_PLL_ENABLE;
CLKDEV_WRITE_4(dev, reg, val);
/* Wait for PLL to become stable */
for (retry = A31_PLL6_TIMEOUT; retry > 0; retry--) {
CLKDEV_READ_4(dev, reg, &val);
if ((val & A31_PLL6_LOCK) == A31_PLL6_LOCK)
break;
DELAY(1);
}
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a31_pll6_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, k, n;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
k = ((val & A10_PLL6_FACTOR_K) >> A10_PLL6_FACTOR_K_SHIFT) + 1;
n = ((val & A10_PLL6_FACTOR_N) >> A10_PLL6_FACTOR_N_SHIFT) + 1;
switch (sc->id) {
case CLKID_A31_PLL6:
*freq = (*freq * n * k) / 2;
break;
case CLKID_A31_PLL6_X2:
*freq = *freq * n * k;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a80_pll4_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, div1, div2;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = (val & A80_PLL4_FACTOR_N) >> A80_PLL4_FACTOR_N_SHIFT;
div1 = (val & A80_PLL4_PLL_DIV1) == 0 ? 1 : 2;
div2 = (val & A80_PLL4_PLL_DIV2) == 0 ? 1 : 2;
*freq = (*freq * n) / div1 / div2;
return (0);
}
static int
a64_pllhsic_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = ((val & A64_PLLHSIC_FACTOR_N) >> A64_PLLHSIC_FACTOR_N_SHIFT) + 1;
m = ((val & A64_PLLHSIC_PRE_DIV_M) >> A64_PLLHSIC_PRE_DIV_M_SHIFT) + 1;
*freq = (*freq * n) / m;
return (0);
}
static int
a64_pllhsic_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
/*
* PLL_HSIC default is 480MHz, just enable it.
*/
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val |= AW_PLL_ENABLE;
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a83t_pllcpux_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = (val & A83T_PLLCPUX_FACTOR_N) >> A83T_PLLCPUX_FACTOR_N_SHIFT;
p = (val & A83T_PLLCPUX_OUT_EXT_DIVP) ? 4 : 1;
*freq = (*freq * n) / p;
return (0);
}
static int
a83t_pllcpux_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val;
u_int n;
n = *fout / fin;
if (n < A83T_PLLCPUX_FACTOR_N_MIN || n > A83T_PLLCPUX_FACTOR_N_MAX)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~A83T_PLLCPUX_FACTOR_N;
val |= (n << A83T_PLLCPUX_FACTOR_N_SHIFT);
val &= ~A83T_PLLCPUX_CLOCK_OUTPUT_DIS;
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
#define PLL(_type, _recalc, _set_freq, _init) \
[(_type)] = { \
.recalc = (_recalc), \
.set_freq = (_set_freq), \
.init = (_init) \
}
static struct aw_pll_funcs aw_pll_func[] = {
PLL(AWPLL_A10_PLL1, a10_pll1_recalc, a10_pll1_set_freq, NULL),
PLL(AWPLL_A10_PLL2, a10_pll2_recalc, a10_pll2_set_freq, NULL),
PLL(AWPLL_A10_PLL3, a10_pll3_recalc, a10_pll3_set_freq, a10_pll3_init),
PLL(AWPLL_A10_PLL5, a10_pll5_recalc, NULL, NULL),
PLL(AWPLL_A10_PLL6, a10_pll6_recalc, a10_pll6_set_freq, a10_pll6_init),
PLL(AWPLL_A13_PLL2, a13_pll2_recalc, a13_pll2_set_freq, NULL),
PLL(AWPLL_A23_PLL1, a23_pll1_recalc, a23_pll1_set_freq, NULL),
PLL(AWPLL_A31_PLL1, a31_pll1_recalc, NULL, NULL),
PLL(AWPLL_A31_PLL6, a31_pll6_recalc, NULL, a31_pll6_init),
PLL(AWPLL_A80_PLL4, a80_pll4_recalc, NULL, NULL),
PLL(AWPLL_A83T_PLLCPUX, a83t_pllcpux_recalc, a83t_pllcpux_set_freq, NULL),
PLL(AWPLL_A64_PLLHSIC, a64_pllhsic_recalc, NULL, a64_pllhsic_init),
PLL(AWPLL_H3_PLL1, a23_pll1_recalc, h3_pll1_set_freq, NULL),
PLL(AWPLL_H3_PLL2, h3_pll2_recalc, h3_pll2_set_freq, NULL),
};
static struct ofw_compat_data compat_data[] = {
{ "allwinner,sun4i-a10-pll1-clk", AWPLL_A10_PLL1 },
{ "allwinner,sun4i-a10-pll2-clk", AWPLL_A10_PLL2 },
{ "allwinner,sun4i-a10-pll3-clk", AWPLL_A10_PLL3 },
{ "allwinner,sun4i-a10-pll5-clk", AWPLL_A10_PLL5 },
{ "allwinner,sun4i-a10-pll6-clk", AWPLL_A10_PLL6 },
{ "allwinner,sun5i-a13-pll2-clk", AWPLL_A13_PLL2 },
{ "allwinner,sun6i-a31-pll1-clk", AWPLL_A31_PLL1 },
{ "allwinner,sun6i-a31-pll6-clk", AWPLL_A31_PLL6 },
{ "allwinner,sun8i-a23-pll1-clk", AWPLL_A23_PLL1 },
{ "allwinner,sun8i-a83t-pllcpux-clk", AWPLL_A83T_PLLCPUX },
{ "allwinner,sun8i-h3-pll1-clk", AWPLL_H3_PLL1 },
{ "allwinner,sun8i-h3-pll2-clk", AWPLL_H3_PLL2 },
{ "allwinner,sun9i-a80-pll4-clk", AWPLL_A80_PLL4 },
{ "allwinner,sun50i-a64-pllhsic-clk", AWPLL_A64_PLLHSIC },
{ NULL, 0 }
};
static int
aw_pll_init(struct clknode *clk, device_t dev)
{
clknode_init_parent_idx(clk, 0);
return (0);
}
static int
aw_pll_set_gate(struct clknode *clk, bool enable)
{
struct aw_pll_sc *sc;
uint32_t val;
sc = clknode_get_softc(clk);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
if (enable)
val |= AW_PLL_ENABLE;
else
val &= ~AW_PLL_ENABLE;
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
aw_pll_recalc(struct clknode *clk, uint64_t *freq)
{
struct aw_pll_sc *sc;
sc = clknode_get_softc(clk);
if (aw_pll_func[sc->type].recalc == NULL)
return (ENXIO);
return (aw_pll_func[sc->type].recalc(sc, freq));
}
static int
aw_pll_set_freq(struct clknode *clk, uint64_t fin, uint64_t *fout,
int flags, int *stop)
{
struct aw_pll_sc *sc;
sc = clknode_get_softc(clk);
*stop = 1;
if (aw_pll_func[sc->type].set_freq == NULL)
return (ENXIO);
return (aw_pll_func[sc->type].set_freq(sc, fin, fout, flags));
}
static clknode_method_t aw_pll_clknode_methods[] = {
/* Device interface */
CLKNODEMETHOD(clknode_init, aw_pll_init),
CLKNODEMETHOD(clknode_set_gate, aw_pll_set_gate),
CLKNODEMETHOD(clknode_recalc_freq, aw_pll_recalc),
CLKNODEMETHOD(clknode_set_freq, aw_pll_set_freq),
CLKNODEMETHOD_END
};
DEFINE_CLASS_1(aw_pll_clknode, aw_pll_clknode_class, aw_pll_clknode_methods,
sizeof(struct aw_pll_sc), clknode_class);
static int
aw_pll_create(device_t dev, bus_addr_t paddr, struct clkdom *clkdom,
const char *pclkname, const char *clkname, int index)
{
enum aw_pll_type type;
struct clknode_init_def clkdef;
struct aw_pll_sc *sc;
struct clknode *clk;
int error;
type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
memset(&clkdef, 0, sizeof(clkdef));
clkdef.id = index;
clkdef.name = clkname;
if (pclkname != NULL) {
clkdef.parent_names = malloc(sizeof(char *), M_OFWPROP,
M_WAITOK);
clkdef.parent_names[0] = pclkname;
clkdef.parent_cnt = 1;
} else
clkdef.parent_cnt = 0;
if (aw_pll_func[type].init != NULL) {
error = aw_pll_func[type].init(device_get_parent(dev),
paddr, &clkdef);
if (error != 0) {
device_printf(dev, "clock %s init failed\n", clkname);
return (error);
}
}
clk = clknode_create(clkdom, &aw_pll_clknode_class, &clkdef);
if (clk == NULL) {
device_printf(dev, "cannot create clock node\n");
return (ENXIO);
}
sc = clknode_get_softc(clk);
sc->clkdev = device_get_parent(dev);
sc->reg = paddr;
sc->type = type;
sc->id = clkdef.id;
clknode_register(clkdom, clk);
OF_prop_free(__DECONST(char *, clkdef.parent_names));
return (0);
}
static int
aw_pll_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "Allwinner PLL Clock");
return (BUS_PROBE_DEFAULT);
}
static int
aw_pll_attach(device_t dev)
{
struct clkdom *clkdom;
const char **names;
int index, nout, error;
clk_t clk_parent;
uint32_t *indices;
bus_addr_t paddr;
bus_size_t psize;
phandle_t node;
node = ofw_bus_get_node(dev);
if (ofw_reg_to_paddr(node, 0, &paddr, &psize, NULL) != 0) {
device_printf(dev, "couldn't parse 'reg' property\n");
return (ENXIO);
}
clkdom = clkdom_create(dev);
nout = clk_parse_ofw_out_names(dev, node, &names, &indices);
if (nout == 0) {
device_printf(dev, "no clock outputs found\n");
error = ENOENT;
goto fail;
}
if (clk_get_by_ofw_index(dev, 0, 0, &clk_parent) != 0)
clk_parent = NULL;
for (index = 0; index < nout; index++) {
error = aw_pll_create(dev, paddr, clkdom,
clk_parent ? clk_get_name(clk_parent) : NULL,
names[index], nout == 1 ? 1 : index);
if (error)
goto fail;
}
if (clkdom_finit(clkdom) != 0) {
device_printf(dev, "cannot finalize clkdom initialization\n");
error = ENXIO;
goto fail;
}
if (bootverbose)
clkdom_dump(clkdom);
return (0);
fail:
return (error);
}
static device_method_t aw_pll_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, aw_pll_probe),
DEVMETHOD(device_attach, aw_pll_attach),
DEVMETHOD_END
};
static driver_t aw_pll_driver = {
"aw_pll",
aw_pll_methods,
0,
};
static devclass_t aw_pll_devclass;
EARLY_DRIVER_MODULE(aw_pll, simplebus, aw_pll_driver,
aw_pll_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
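
For context on the PLL2 code above: the driver fixes pre_div at 21 and post_div at 4 and solves only for the multiplier n, so the audio rates are close approximations rather than exact. A quick standalone check of that arithmetic follows, assuming the usual 24 MHz parent oscillator (the real parent frequency comes from the device tree):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t fin = 24000000;	/* assumed 24 MHz parent clock */
	const uint64_t targets[] = { 24576000, 22579200 };
	const uint32_t pre_div = 21, post_div = 4;
	int i;

	for (i = 0; i < 2; i++) {
		/* n as computed in a10_pll2_set_freq(). */
		uint32_t n = (targets[i] * pre_div * post_div * 2) / (2 * fin);
		/* PLL2-1X output as computed in a10_pll2_recalc(). */
		uint64_t fout = (fin * 2 * n) / pre_div / post_div / 2;

		printf("target %llu Hz: n=%u -> %llu Hz\n",
		    (unsigned long long)targets[i], n,
		    (unsigned long long)fout);
	}
	return (0);
}

With n = 86 the 1X output lands at about 24.571 MHz for the 24.576 MHz target, and n = 79 gives about 22.571 MHz for 22.5792 MHz, matching the "gets us close to the target rates" comment in the driver.
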
Index: head/sys/arm/allwinner/if_awg.c
===================================================================
--- head/sys/arm/allwinner/if_awg.c (revision 327172)
+++ head/sys/arm/allwinner/if_awg.c (revision 327173)
@@ -1,1816 +1,1812 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Allwinner Gigabit Ethernet MAC (EMAC) controller
*/
#include "opt_device_polling.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/module.h>
#include <sys/taskqueue.h>
#include <sys/gpio.h>
#include <net/bpf.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/allwinner/if_awgreg.h>
#include <arm/allwinner/aw_sid.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <dev/extres/clk/clk.h>
#include <dev/extres/hwreset/hwreset.h>
#include <dev/extres/regulator/regulator.h>
#include "miibus_if.h"
#include "gpio_if.h"
#define RD4(sc, reg) bus_read_4((sc)->res[_RES_EMAC], (reg))
#define WR4(sc, reg, val) bus_write_4((sc)->res[_RES_EMAC], (reg), (val))
#define AWG_LOCK(sc) mtx_lock(&(sc)->mtx)
#define AWG_UNLOCK(sc) mtx_unlock(&(sc)->mtx)
#define AWG_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED)
#define AWG_ASSERT_UNLOCKED(sc) mtx_assert(&(sc)->mtx, MA_NOTOWNED)
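/*
* Descriptor ring geometry. The AND-based wrap-around in TX_NEXT,
* TX_SKIP and RX_NEXT below relies on TX_DESC_COUNT and RX_DESC_COUNT
* being powers of two.
*/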
#define DESC_ALIGN 4
#define TX_DESC_COUNT 1024
#define TX_DESC_SIZE (sizeof(struct emac_desc) * TX_DESC_COUNT)
#define RX_DESC_COUNT 256
#define RX_DESC_SIZE (sizeof(struct emac_desc) * RX_DESC_COUNT)
#define DESC_OFF(n) ((n) * sizeof(struct emac_desc))
#define TX_NEXT(n) (((n) + 1) & (TX_DESC_COUNT - 1))
#define TX_SKIP(n, o) (((n) + (o)) & (TX_DESC_COUNT - 1))
#define RX_NEXT(n) (((n) + 1) & (RX_DESC_COUNT - 1))
#define TX_MAX_SEGS 20
#define SOFT_RST_RETRY 1000
#define MII_BUSY_RETRY 1000
#define MDIO_FREQ 2500000
#define BURST_LEN_DEFAULT 8
#define RX_TX_PRI_DEFAULT 0
#define PAUSE_TIME_DEFAULT 0x400
#define TX_INTERVAL_DEFAULT 64
#define RX_BATCH_DEFAULT 64
/* syscon EMAC clock register */
#define EMAC_CLK_EPHY_ADDR (0x1f << 20) /* H3 */
#define EMAC_CLK_EPHY_ADDR_SHIFT 20
#define EMAC_CLK_EPHY_LED_POL (1 << 17) /* H3 */
#define EMAC_CLK_EPHY_SHUTDOWN (1 << 16) /* H3 */
#define EMAC_CLK_EPHY_SELECT (1 << 15) /* H3 */
#define EMAC_CLK_RMII_EN (1 << 13)
#define EMAC_CLK_ETXDC (0x7 << 10)
#define EMAC_CLK_ETXDC_SHIFT 10
#define EMAC_CLK_ERXDC (0x1f << 5)
#define EMAC_CLK_ERXDC_SHIFT 5
#define EMAC_CLK_PIT (0x1 << 2)
#define EMAC_CLK_PIT_MII (0 << 2)
#define EMAC_CLK_PIT_RGMII (1 << 2)
#define EMAC_CLK_SRC (0x3 << 0)
#define EMAC_CLK_SRC_MII (0 << 0)
#define EMAC_CLK_SRC_EXT_RGMII (1 << 0)
#define EMAC_CLK_SRC_RGMII (2 << 0)
/* Burst length of RX and TX DMA transfers */
static int awg_burst_len = BURST_LEN_DEFAULT;
TUNABLE_INT("hw.awg.burst_len", &awg_burst_len);
/* RX / TX DMA priority. If 1, RX DMA has priority over TX DMA. */
static int awg_rx_tx_pri = RX_TX_PRI_DEFAULT;
TUNABLE_INT("hw.awg.rx_tx_pri", &awg_rx_tx_pri);
/* Pause time field in the transmitted control frame */
static int awg_pause_time = PAUSE_TIME_DEFAULT;
TUNABLE_INT("hw.awg.pause_time", &awg_pause_time);
/* Request a TX interrupt every <n> descriptors */
static int awg_tx_interval = TX_INTERVAL_DEFAULT;
TUNABLE_INT("hw.awg.tx_interval", &awg_tx_interval);
/* Maximum number of mbufs to send to if_input */
static int awg_rx_batch = RX_BATCH_DEFAULT;
TUNABLE_INT("hw.awg.rx_batch", &awg_rx_batch);
enum awg_type {
EMAC_A83T = 1,
EMAC_H3,
EMAC_A64,
};
static struct ofw_compat_data compat_data[] = {
{ "allwinner,sun8i-a83t-emac", EMAC_A83T },
{ "allwinner,sun8i-h3-emac", EMAC_H3 },
{ "allwinner,sun50i-a64-emac", EMAC_A64 },
{ NULL, 0 }
};
struct awg_bufmap {
bus_dmamap_t map;
struct mbuf *mbuf;
};
struct awg_txring {
bus_dma_tag_t desc_tag;
bus_dmamap_t desc_map;
struct emac_desc *desc_ring;
bus_addr_t desc_ring_paddr;
bus_dma_tag_t buf_tag;
struct awg_bufmap buf_map[TX_DESC_COUNT];
u_int cur, next, queued;
u_int segs;
};
struct awg_rxring {
bus_dma_tag_t desc_tag;
bus_dmamap_t desc_map;
struct emac_desc *desc_ring;
bus_addr_t desc_ring_paddr;
bus_dma_tag_t buf_tag;
struct awg_bufmap buf_map[RX_DESC_COUNT];
bus_dmamap_t buf_spare_map;
u_int cur;
};
enum {
_RES_EMAC,
_RES_IRQ,
_RES_SYSCON,
_RES_NITEMS
};
struct awg_softc {
struct resource *res[_RES_NITEMS];
struct mtx mtx;
if_t ifp;
device_t dev;
device_t miibus;
struct callout stat_ch;
struct task link_task;
void *ih;
u_int mdc_div_ratio_m;
int link;
int if_flags;
enum awg_type type;
struct awg_txring tx;
struct awg_rxring rx;
};
static struct resource_spec awg_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ SYS_RES_MEMORY, 1, RF_ACTIVE | RF_OPTIONAL },
{ -1, 0 }
};
static void awg_txeof(struct awg_softc *sc);
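/*
* MDIO access: the PHY and register addresses are written to
* EMAC_MII_CMD with MII_BUSY set, then the busy bit is polled until
* the controller clears it. A read that times out returns 0.
*/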
static int
awg_miibus_readreg(device_t dev, int phy, int reg)
{
struct awg_softc *sc;
int retry, val;
sc = device_get_softc(dev);
val = 0;
WR4(sc, EMAC_MII_CMD,
(sc->mdc_div_ratio_m << MDC_DIV_RATIO_M_SHIFT) |
(phy << PHY_ADDR_SHIFT) |
(reg << PHY_REG_ADDR_SHIFT) |
MII_BUSY);
for (retry = MII_BUSY_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_MII_CMD) & MII_BUSY) == 0) {
val = RD4(sc, EMAC_MII_DATA);
break;
}
DELAY(10);
}
if (retry == 0)
device_printf(dev, "phy read timeout, phy=%d reg=%d\n",
phy, reg);
return (val);
}
static int
awg_miibus_writereg(device_t dev, int phy, int reg, int val)
{
struct awg_softc *sc;
int retry;
sc = device_get_softc(dev);
WR4(sc, EMAC_MII_DATA, val);
WR4(sc, EMAC_MII_CMD,
(sc->mdc_div_ratio_m << MDC_DIV_RATIO_M_SHIFT) |
(phy << PHY_ADDR_SHIFT) |
(reg << PHY_REG_ADDR_SHIFT) |
MII_WR | MII_BUSY);
for (retry = MII_BUSY_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_MII_CMD) & MII_BUSY) == 0)
break;
DELAY(10);
}
if (retry == 0)
device_printf(dev, "phy write timeout, phy=%d reg=%d\n",
phy, reg);
return (0);
}
static void
awg_update_link_locked(struct awg_softc *sc)
{
struct mii_data *mii;
uint32_t val;
AWG_ASSERT_LOCKED(sc);
if ((if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING) == 0)
return;
mii = device_get_softc(sc->miibus);
if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) ==
(IFM_ACTIVE | IFM_AVALID)) {
switch (IFM_SUBTYPE(mii->mii_media_active)) {
case IFM_1000_T:
case IFM_1000_SX:
case IFM_100_TX:
case IFM_10_T:
sc->link = 1;
break;
default:
sc->link = 0;
break;
}
} else
sc->link = 0;
if (sc->link == 0)
return;
val = RD4(sc, EMAC_BASIC_CTL_0);
val &= ~(BASIC_CTL_SPEED | BASIC_CTL_DUPLEX);
if (IFM_SUBTYPE(mii->mii_media_active) == IFM_1000_T ||
IFM_SUBTYPE(mii->mii_media_active) == IFM_1000_SX)
val |= BASIC_CTL_SPEED_1000 << BASIC_CTL_SPEED_SHIFT;
else if (IFM_SUBTYPE(mii->mii_media_active) == IFM_100_TX)
val |= BASIC_CTL_SPEED_100 << BASIC_CTL_SPEED_SHIFT;
else
val |= BASIC_CTL_SPEED_10 << BASIC_CTL_SPEED_SHIFT;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0)
val |= BASIC_CTL_DUPLEX;
WR4(sc, EMAC_BASIC_CTL_0, val);
val = RD4(sc, EMAC_RX_CTL_0);
val &= ~RX_FLOW_CTL_EN;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_RXPAUSE) != 0)
val |= RX_FLOW_CTL_EN;
WR4(sc, EMAC_RX_CTL_0, val);
val = RD4(sc, EMAC_TX_FLOW_CTL);
val &= ~(PAUSE_TIME|TX_FLOW_CTL_EN);
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_TXPAUSE) != 0)
val |= TX_FLOW_CTL_EN;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0)
val |= awg_pause_time << PAUSE_TIME_SHIFT;
WR4(sc, EMAC_TX_FLOW_CTL, val);
}
static void
awg_link_task(void *arg, int pending)
{
struct awg_softc *sc;
sc = arg;
AWG_LOCK(sc);
awg_update_link_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_miibus_statchg(device_t dev)
{
struct awg_softc *sc;
sc = device_get_softc(dev);
taskqueue_enqueue(taskqueue_swi, &sc->link_task);
}
static void
awg_media_status(if_t ifp, struct ifmediareq *ifmr)
{
struct awg_softc *sc;
struct mii_data *mii;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
AWG_LOCK(sc);
mii_pollstat(mii);
ifmr->ifm_active = mii->mii_media_active;
ifmr->ifm_status = mii->mii_media_status;
AWG_UNLOCK(sc);
}
static int
awg_media_change(if_t ifp)
{
struct awg_softc *sc;
struct mii_data *mii;
int error;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
AWG_LOCK(sc);
error = mii_mediachg(mii);
AWG_UNLOCK(sc);
return (error);
}
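/*
* Map an mbuf chain and build its TX descriptor chain. The owner bit
* (TX_DESC_CTL) of the first descriptor is written last so the DMA
* engine never sees a partially constructed chain.
*/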
static int
awg_encap(struct awg_softc *sc, struct mbuf **mp)
{
bus_dmamap_t map;
bus_dma_segment_t segs[TX_MAX_SEGS];
int error, nsegs, cur, first, last, i;
u_int csum_flags;
uint32_t flags, status;
struct mbuf *m;
cur = first = sc->tx.cur;
map = sc->tx.buf_map[first].map;
m = *mp;
error = bus_dmamap_load_mbuf_sg(sc->tx.buf_tag, map, m, segs,
&nsegs, BUS_DMA_NOWAIT);
if (error == EFBIG) {
m = m_collapse(m, M_NOWAIT, TX_MAX_SEGS);
if (m == NULL) {
device_printf(sc->dev, "awg_encap: m_collapse failed\n");
m_freem(*mp);
*mp = NULL;
return (ENOMEM);
}
*mp = m;
error = bus_dmamap_load_mbuf_sg(sc->tx.buf_tag, map, m,
segs, &nsegs, BUS_DMA_NOWAIT);
if (error != 0) {
m_freem(*mp);
*mp = NULL;
}
}
if (error != 0) {
device_printf(sc->dev, "awg_encap: bus_dmamap_load_mbuf_sg failed\n");
return (error);
}
if (nsegs == 0) {
m_freem(*mp);
*mp = NULL;
return (EIO);
}
if (sc->tx.queued + nsegs > TX_DESC_COUNT) {
bus_dmamap_unload(sc->tx.buf_tag, map);
return (ENOBUFS);
}
bus_dmamap_sync(sc->tx.buf_tag, map, BUS_DMASYNC_PREWRITE);
flags = TX_FIR_DESC;
status = 0;
if ((m->m_pkthdr.csum_flags & CSUM_IP) != 0) {
if ((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_UDP)) != 0)
csum_flags = TX_CHECKSUM_CTL_FULL;
else
csum_flags = TX_CHECKSUM_CTL_IP;
flags |= (csum_flags << TX_CHECKSUM_CTL_SHIFT);
}
for (i = 0; i < nsegs; i++) {
sc->tx.segs++;
if (i == nsegs - 1) {
flags |= TX_LAST_DESC;
/*
* Can only request TX completion
* interrupt on last descriptor.
*/
if (sc->tx.segs >= awg_tx_interval) {
sc->tx.segs = 0;
flags |= TX_INT_CTL;
}
}
sc->tx.desc_ring[cur].addr = htole32((uint32_t)segs[i].ds_addr);
sc->tx.desc_ring[cur].size = htole32(flags | segs[i].ds_len);
sc->tx.desc_ring[cur].status = htole32(status);
flags &= ~TX_FIR_DESC;
/*
* Setting of the valid bit in the first descriptor is
* deferred until the whole chain is fully set up.
*/
status = TX_DESC_CTL;
++sc->tx.queued;
cur = TX_NEXT(cur);
}
sc->tx.cur = cur;
/*
* Store the DMA map and mbuf in the last descriptor's slot; the first
* slot takes over the last slot's unused map so every slot still owns
* exactly one map.
*/
last = TX_SKIP(cur, TX_DESC_COUNT - 1);
sc->tx.buf_map[first].map = sc->tx.buf_map[last].map;
sc->tx.buf_map[last].map = map;
sc->tx.buf_map[last].mbuf = m;
/*
* The whole mbuf chain has been DMA mapped,
* fix the first descriptor.
*/
sc->tx.desc_ring[first].status = htole32(TX_DESC_CTL);
return (0);
}
static void
awg_clean_txbuf(struct awg_softc *sc, int index)
{
struct awg_bufmap *bmap;
--sc->tx.queued;
bmap = &sc->tx.buf_map[index];
if (bmap->mbuf != NULL) {
bus_dmamap_sync(sc->tx.buf_tag, bmap->map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->tx.buf_tag, bmap->map);
m_freem(bmap->mbuf);
bmap->mbuf = NULL;
}
}
static void
awg_setup_rxdesc(struct awg_softc *sc, int index, bus_addr_t paddr)
{
uint32_t status, size;
status = RX_DESC_CTL;
size = MCLBYTES - 1;
sc->rx.desc_ring[index].addr = htole32((uint32_t)paddr);
sc->rx.desc_ring[index].size = htole32(size);
sc->rx.desc_ring[index].status = htole32(status);
}
static void
awg_reuse_rxdesc(struct awg_softc *sc, int index)
{
sc->rx.desc_ring[index].status = htole32(RX_DESC_CTL);
}
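/*
* Attach a fresh mbuf cluster to an RX slot. The replacement buffer is
* loaded into the spare DMA map first so the old mapping is only torn
* down once the new one has succeeded; the two maps are then swapped.
*/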
static int
awg_newbuf_rx(struct awg_softc *sc, int index)
{
struct mbuf *m;
bus_dma_segment_t seg;
bus_dmamap_t map;
int nsegs;
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
return (ENOBUFS);
m->m_pkthdr.len = m->m_len = m->m_ext.ext_size;
m_adj(m, ETHER_ALIGN);
if (bus_dmamap_load_mbuf_sg(sc->rx.buf_tag, sc->rx.buf_spare_map,
m, &seg, &nsegs, BUS_DMA_NOWAIT) != 0) {
m_freem(m);
return (ENOBUFS);
}
if (sc->rx.buf_map[index].mbuf != NULL) {
bus_dmamap_sync(sc->rx.buf_tag, sc->rx.buf_map[index].map,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->rx.buf_tag, sc->rx.buf_map[index].map);
}
map = sc->rx.buf_map[index].map;
sc->rx.buf_map[index].map = sc->rx.buf_spare_map;
sc->rx.buf_spare_map = map;
bus_dmamap_sync(sc->rx.buf_tag, sc->rx.buf_map[index].map,
BUS_DMASYNC_PREREAD);
sc->rx.buf_map[index].mbuf = m;
awg_setup_rxdesc(sc, index, seg.ds_addr);
return (0);
}
static void
awg_start_locked(struct awg_softc *sc)
{
struct mbuf *m;
uint32_t val;
if_t ifp;
int cnt, err;
AWG_ASSERT_LOCKED(sc);
if (!sc->link)
return;
ifp = sc->ifp;
if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING)
return;
for (cnt = 0; ; cnt++) {
m = if_dequeue(ifp);
if (m == NULL)
break;
err = awg_encap(sc, &m);
if (err != 0) {
if (err == ENOBUFS)
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
if (m != NULL)
if_sendq_prepend(ifp, m);
break;
}
if_bpfmtap(ifp, m);
}
if (cnt != 0) {
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
/* Start and run TX DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val | TX_DMA_START);
}
}
static void
awg_start(if_t ifp)
{
struct awg_softc *sc;
sc = if_getsoftc(ifp);
AWG_LOCK(sc);
awg_start_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_tick(void *softc)
{
struct awg_softc *sc;
struct mii_data *mii;
if_t ifp;
int link;
sc = softc;
ifp = sc->ifp;
mii = device_get_softc(sc->miibus);
AWG_ASSERT_LOCKED(sc);
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
return;
link = sc->link;
mii_tick(mii);
if (sc->link && !link)
awg_start_locked(sc);
callout_reset(&sc->stat_ch, hz, awg_tick, sc);
}
/* Bit Reversal - http://aggregate.org/MAGIC/#Bit%20Reversal */
static uint32_t
bitrev32(uint32_t x)
{
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
return (x >> 16) | (x << 16);
}
static void
awg_setup_rxfilter(struct awg_softc *sc)
{
uint32_t val, crc, hashreg, hashbit, hash[2], machi, maclo;
int mc_count, mcnt, i;
uint8_t *eaddr, *mta;
if_t ifp;
AWG_ASSERT_LOCKED(sc);
ifp = sc->ifp;
val = 0;
hash[0] = hash[1] = 0;
mc_count = if_multiaddr_count(ifp, -1);
if (if_getflags(ifp) & IFF_PROMISC)
val |= DIS_ADDR_FILTER;
else if (if_getflags(ifp) & IFF_ALLMULTI) {
val |= RX_ALL_MULTICAST;
hash[0] = hash[1] = ~0;
} else if (mc_count > 0) {
val |= HASH_MULTICAST;
mta = malloc(sizeof(unsigned char) * ETHER_ADDR_LEN * mc_count,
M_DEVBUF, M_NOWAIT);
if (mta == NULL) {
if_printf(ifp,
"failed to allocate temporary multicast list\n");
return;
}
if_multiaddr_array(ifp, mta, &mcnt, mc_count);
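/*
* Map each multicast address to one bit of the 64-bit hash filter:
* the CRC32 of the address is reduced to a 6-bit index whose upper
* bit selects the hash register and whose lower five bits select the
* bit position within it.
*/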
for (i = 0; i < mcnt; i++) {
crc = ether_crc32_le(mta + (i * ETHER_ADDR_LEN),
ETHER_ADDR_LEN) & 0x7f;
crc = bitrev32(~crc) >> 26;
hashreg = (crc >> 5);
hashbit = (crc & 0x1f);
hash[hashreg] |= (1 << hashbit);
}
free(mta, M_DEVBUF);
}
/* Write our unicast address */
eaddr = IF_LLADDR(ifp);
machi = (eaddr[5] << 8) | eaddr[4];
maclo = (eaddr[3] << 24) | (eaddr[2] << 16) | (eaddr[1] << 8) |
(eaddr[0] << 0);
WR4(sc, EMAC_ADDR_HIGH(0), machi);
WR4(sc, EMAC_ADDR_LOW(0), maclo);
/* Multicast hash filters */
WR4(sc, EMAC_RX_HASH_0, hash[1]);
WR4(sc, EMAC_RX_HASH_1, hash[0]);
/* RX frame filter config */
WR4(sc, EMAC_RX_FRM_FLT, val);
}
static void
awg_enable_intr(struct awg_softc *sc)
{
/* Enable interrupts */
WR4(sc, EMAC_INT_EN, RX_INT_EN | TX_INT_EN | TX_BUF_UA_INT_EN);
}
static void
awg_disable_intr(struct awg_softc *sc)
{
/* Disable interrupts */
WR4(sc, EMAC_INT_EN, 0);
}
static void
awg_init_locked(struct awg_softc *sc)
{
struct mii_data *mii;
uint32_t val;
if_t ifp;
mii = device_get_softc(sc->miibus);
ifp = sc->ifp;
AWG_ASSERT_LOCKED(sc);
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
return;
awg_setup_rxfilter(sc);
/* Configure DMA burst length and priorities */
val = awg_burst_len << BASIC_CTL_BURST_LEN_SHIFT;
if (awg_rx_tx_pri)
val |= BASIC_CTL_RX_TX_PRI;
WR4(sc, EMAC_BASIC_CTL_1, val);
/* Enable interrupts */
#ifdef DEVICE_POLLING
if ((if_getcapenable(ifp) & IFCAP_POLLING) == 0)
awg_enable_intr(sc);
else
awg_disable_intr(sc);
#else
awg_enable_intr(sc);
#endif
/* Enable transmit DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val | TX_DMA_EN | TX_MD | TX_NEXT_FRAME);
/* Enable receive DMA */
val = RD4(sc, EMAC_RX_CTL_1);
WR4(sc, EMAC_RX_CTL_1, val | RX_DMA_EN | RX_MD);
/* Enable transmitter */
val = RD4(sc, EMAC_TX_CTL_0);
WR4(sc, EMAC_TX_CTL_0, val | TX_EN);
/* Enable receiver */
val = RD4(sc, EMAC_RX_CTL_0);
WR4(sc, EMAC_RX_CTL_0, val | RX_EN | CHECK_CRC);
if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
mii_mediachg(mii);
callout_reset(&sc->stat_ch, hz, awg_tick, sc);
}
static void
awg_init(void *softc)
{
struct awg_softc *sc;
sc = softc;
AWG_LOCK(sc);
awg_init_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_stop(struct awg_softc *sc)
{
if_t ifp;
uint32_t val;
int i;
AWG_ASSERT_LOCKED(sc);
ifp = sc->ifp;
callout_stop(&sc->stat_ch);
/* Stop transmit DMA and flush data in the TX FIFO */
val = RD4(sc, EMAC_TX_CTL_1);
val &= ~TX_DMA_EN;
val |= FLUSH_TX_FIFO;
WR4(sc, EMAC_TX_CTL_1, val);
/* Disable transmitter */
val = RD4(sc, EMAC_TX_CTL_0);
WR4(sc, EMAC_TX_CTL_0, val & ~TX_EN);
/* Disable receiver */
val = RD4(sc, EMAC_RX_CTL_0);
WR4(sc, EMAC_RX_CTL_0, val & ~RX_EN);
/* Disable interrupts */
awg_disable_intr(sc);
/* Disable transmit DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val & ~TX_DMA_EN);
/* Disable receive DMA */
val = RD4(sc, EMAC_RX_CTL_1);
WR4(sc, EMAC_RX_CTL_1, val & ~RX_DMA_EN);
sc->link = 0;
/* Finish handling transmitted buffers */
awg_txeof(sc);
/* Release any untransmitted buffers. */
for (i = sc->tx.next; sc->tx.queued > 0; i = TX_NEXT(i)) {
val = le32toh(sc->tx.desc_ring[i].status);
if ((val & TX_DESC_CTL) != 0)
break;
awg_clean_txbuf(sc, i);
}
sc->tx.next = i;
for (; sc->tx.queued > 0; i = TX_NEXT(i)) {
sc->tx.desc_ring[i].status = 0;
awg_clean_txbuf(sc, i);
}
sc->tx.cur = sc->tx.next;
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
/* Setup RX buffers for reuse */
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
for (i = sc->rx.cur; ; i = RX_NEXT(i)) {
val = le32toh(sc->rx.desc_ring[i].status);
if ((val & RX_DESC_CTL) != 0)
break;
awg_reuse_rxdesc(sc, i);
}
sc->rx.cur = i;
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
}
static int
awg_rxintr(struct awg_softc *sc)
{
if_t ifp;
struct mbuf *m, *mh, *mt;
int error, index, len, cnt, npkt;
uint32_t status;
ifp = sc->ifp;
mh = mt = NULL;
cnt = 0;
npkt = 0;
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
for (index = sc->rx.cur; ; index = RX_NEXT(index)) {
status = le32toh(sc->rx.desc_ring[index].status);
if ((status & RX_DESC_CTL) != 0)
break;
len = (status & RX_FRM_LEN) >> RX_FRM_LEN_SHIFT;
if (len == 0) {
if ((status & (RX_NO_ENOUGH_BUF_ERR | RX_OVERFLOW_ERR)) != 0)
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
awg_reuse_rxdesc(sc, index);
continue;
}
m = sc->rx.buf_map[index].mbuf;
error = awg_newbuf_rx(sc, index);
if (error != 0) {
if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
awg_reuse_rxdesc(sc, index);
continue;
}
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.len = len;
m->m_len = len;
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if ((if_getcapenable(ifp) & IFCAP_RXCSUM) != 0 &&
(status & RX_FRM_TYPE) != 0) {
m->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
if ((status & RX_HEADER_ERR) == 0)
m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
if ((status & RX_PAYLOAD_ERR) == 0) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
}
m->m_nextpkt = NULL;
if (mh == NULL)
mh = m;
else
mt->m_nextpkt = m;
mt = m;
++cnt;
++npkt;
if (cnt == awg_rx_batch) {
AWG_UNLOCK(sc);
if_input(ifp, mh);
AWG_LOCK(sc);
mh = mt = NULL;
cnt = 0;
}
}
if (index != sc->rx.cur) {
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
}
if (mh != NULL) {
AWG_UNLOCK(sc);
if_input(ifp, mh);
AWG_LOCK(sc);
}
sc->rx.cur = index;
return (npkt);
}
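/*
* Reclaim completed TX descriptors: walk forward from tx.next until a
* descriptor still owned by the DMA engine (TX_DESC_CTL set) is found,
* unloading and freeing the associated mbufs along the way.
*/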
static void
awg_txeof(struct awg_softc *sc)
{
struct emac_desc *desc;
uint32_t status, size;
if_t ifp;
int i, prog;
AWG_ASSERT_LOCKED(sc);
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
ifp = sc->ifp;
prog = 0;
for (i = sc->tx.next; sc->tx.queued > 0; i = TX_NEXT(i)) {
desc = &sc->tx.desc_ring[i];
status = le32toh(desc->status);
if ((status & TX_DESC_CTL) != 0)
break;
size = le32toh(desc->size);
if (size & TX_LAST_DESC) {
if ((status & (TX_HEADER_ERR | TX_PAYLOAD_ERR)) != 0)
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
else
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
}
prog++;
awg_clean_txbuf(sc, i);
}
if (prog > 0) {
sc->tx.next = i;
if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
}
}
static void
awg_intr(void *arg)
{
struct awg_softc *sc;
uint32_t val;
sc = arg;
AWG_LOCK(sc);
val = RD4(sc, EMAC_INT_STA);
WR4(sc, EMAC_INT_STA, val);
if (val & RX_INT)
awg_rxintr(sc);
if (val & TX_INT)
awg_txeof(sc);
if (val & (TX_INT | TX_BUF_UA_INT)) {
if (!if_sendq_empty(sc->ifp))
awg_start_locked(sc);
}
AWG_UNLOCK(sc);
}
#ifdef DEVICE_POLLING
static int
awg_poll(if_t ifp, enum poll_cmd cmd, int count)
{
struct awg_softc *sc;
uint32_t val;
int rx_npkts;
sc = if_getsoftc(ifp);
rx_npkts = 0;
AWG_LOCK(sc);
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
AWG_UNLOCK(sc);
return (0);
}
rx_npkts = awg_rxintr(sc);
awg_txeof(sc);
if (!if_sendq_empty(ifp))
awg_start_locked(sc);
if (cmd == POLL_AND_CHECK_STATUS) {
val = RD4(sc, EMAC_INT_STA);
if (val != 0)
WR4(sc, EMAC_INT_STA, val);
}
AWG_UNLOCK(sc);
return (rx_npkts);
}
#endif
static int
awg_ioctl(if_t ifp, u_long cmd, caddr_t data)
{
struct awg_softc *sc;
struct mii_data *mii;
struct ifreq *ifr;
int flags, mask, error;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
ifr = (struct ifreq *)data;
error = 0;
switch (cmd) {
case SIOCSIFFLAGS:
AWG_LOCK(sc);
if (if_getflags(ifp) & IFF_UP) {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
flags = if_getflags(ifp) ^ sc->if_flags;
if ((flags & (IFF_PROMISC|IFF_ALLMULTI)) != 0)
awg_setup_rxfilter(sc);
} else
awg_init_locked(sc);
} else {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
awg_stop(sc);
}
sc->if_flags = if_getflags(ifp);
AWG_UNLOCK(sc);
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
AWG_LOCK(sc);
awg_setup_rxfilter(sc);
AWG_UNLOCK(sc);
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, cmd);
break;
case SIOCSIFCAP:
mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
#ifdef DEVICE_POLLING
if (mask & IFCAP_POLLING) {
if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0) {
error = ether_poll_register(awg_poll, ifp);
if (error != 0)
break;
AWG_LOCK(sc);
awg_disable_intr(sc);
if_setcapenablebit(ifp, IFCAP_POLLING, 0);
AWG_UNLOCK(sc);
} else {
error = ether_poll_deregister(ifp);
AWG_LOCK(sc);
awg_enable_intr(sc);
if_setcapenablebit(ifp, 0, IFCAP_POLLING);
AWG_UNLOCK(sc);
}
}
#endif
if (mask & IFCAP_VLAN_MTU)
if_togglecapenable(ifp, IFCAP_VLAN_MTU);
if (mask & IFCAP_RXCSUM)
if_togglecapenable(ifp, IFCAP_RXCSUM);
if (mask & IFCAP_TXCSUM)
if_togglecapenable(ifp, IFCAP_TXCSUM);
if ((if_getcapenable(ifp) & IFCAP_TXCSUM) != 0)
if_sethwassistbits(ifp, CSUM_IP | CSUM_UDP | CSUM_TCP, 0);
else
if_sethwassistbits(ifp, 0, CSUM_IP | CSUM_UDP | CSUM_TCP);
break;
default:
error = ether_ioctl(ifp, cmd, data);
break;
}
return (error);
}
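/*
* Configure the PHY interface mode (MII, RMII or RGMII) from the
* "phy-mode" DT property, either through the syscon EMAC clock
* register when one is provided or by reparenting the EMAC TX clock.
*/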
static int
awg_setup_phy(device_t dev)
{
struct awg_softc *sc;
clk_t clk_tx, clk_tx_parent;
const char *tx_parent_name;
char *phy_type;
phandle_t node;
uint32_t reg, tx_delay, rx_delay;
int error;
sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
if (OF_getprop_alloc(node, "phy-mode", 1, (void **)&phy_type) == 0)
return (0);
if (bootverbose)
device_printf(dev, "PHY type: %s, conf mode: %s\n", phy_type,
sc->res[_RES_SYSCON] != NULL ? "reg" : "clk");
if (sc->res[_RES_SYSCON] != NULL) {
reg = bus_read_4(sc->res[_RES_SYSCON], 0);
reg &= ~(EMAC_CLK_PIT | EMAC_CLK_SRC | EMAC_CLK_RMII_EN);
if (strcmp(phy_type, "rgmii") == 0)
reg |= EMAC_CLK_PIT_RGMII | EMAC_CLK_SRC_RGMII;
else if (strcmp(phy_type, "rmii") == 0)
reg |= EMAC_CLK_RMII_EN;
else
reg |= EMAC_CLK_PIT_MII | EMAC_CLK_SRC_MII;
if (OF_getencprop(node, "tx-delay", &tx_delay,
sizeof(tx_delay)) > 0) {
reg &= ~EMAC_CLK_ETXDC;
reg |= (tx_delay << EMAC_CLK_ETXDC_SHIFT);
}
if (OF_getencprop(node, "rx-delay", &rx_delay,
sizeof(rx_delay)) > 0) {
reg &= ~EMAC_CLK_ERXDC;
reg |= (rx_delay << EMAC_CLK_ERXDC_SHIFT);
}
if (sc->type == EMAC_H3) {
if (OF_hasprop(node, "allwinner,use-internal-phy")) {
reg |= EMAC_CLK_EPHY_SELECT;
reg &= ~EMAC_CLK_EPHY_SHUTDOWN;
if (OF_hasprop(node,
"allwinner,leds-active-low"))
reg |= EMAC_CLK_EPHY_LED_POL;
else
reg &= ~EMAC_CLK_EPHY_LED_POL;
/* Set internal PHY addr to 1 */
reg &= ~EMAC_CLK_EPHY_ADDR;
reg |= (1 << EMAC_CLK_EPHY_ADDR_SHIFT);
} else {
reg &= ~EMAC_CLK_EPHY_SELECT;
}
}
if (bootverbose)
device_printf(dev, "EMAC clock: 0x%08x\n", reg);
bus_write_4(sc->res[_RES_SYSCON], 0, reg);
} else {
if (strcmp(phy_type, "rgmii") == 0)
tx_parent_name = "emac_int_tx";
else
tx_parent_name = "mii_phy_tx";
/* Get the TX clock */
error = clk_get_by_ofw_name(dev, 0, "tx", &clk_tx);
if (error != 0) {
device_printf(dev, "cannot get tx clock\n");
goto fail;
}
/* Find the desired parent clock based on phy-mode property */
error = clk_get_by_name(dev, tx_parent_name, &clk_tx_parent);
if (error != 0) {
device_printf(dev, "cannot get clock '%s'\n",
tx_parent_name);
goto fail;
}
/* Set TX clock parent */
error = clk_set_parent_by_clk(clk_tx, clk_tx_parent);
if (error != 0) {
device_printf(dev, "cannot set tx clock parent\n");
goto fail;
}
/* Enable TX clock */
error = clk_enable(clk_tx);
if (error != 0) {
device_printf(dev, "cannot enable tx clock\n");
goto fail;
}
}
error = 0;
fail:
OF_prop_free(phy_type);
return (error);
}
static int
awg_setup_extres(device_t dev)
{
struct awg_softc *sc;
hwreset_t rst_ahb, rst_ephy;
clk_t clk_ahb, clk_ephy;
regulator_t reg;
- phandle_t node;
uint64_t freq;
int error, div;
sc = device_get_softc(dev);
- node = ofw_bus_get_node(dev);
rst_ahb = rst_ephy = NULL;
clk_ahb = clk_ephy = NULL;
reg = NULL;
/* Get AHB clock and reset resources */
error = hwreset_get_by_ofw_name(dev, 0, "ahb", &rst_ahb);
if (error != 0) {
device_printf(dev, "cannot get ahb reset\n");
goto fail;
}
if (hwreset_get_by_ofw_name(dev, 0, "ephy", &rst_ephy) != 0)
rst_ephy = NULL;
error = clk_get_by_ofw_name(dev, 0, "ahb", &clk_ahb);
if (error != 0) {
device_printf(dev, "cannot get ahb clock\n");
goto fail;
}
if (clk_get_by_ofw_name(dev, 0, "ephy", &clk_ephy) != 0)
clk_ephy = NULL;
/* Configure PHY for MII or RGMII mode */
if (awg_setup_phy(dev) != 0)
goto fail;
/* Enable clocks */
error = clk_enable(clk_ahb);
if (error != 0) {
device_printf(dev, "cannot enable ahb clock\n");
goto fail;
}
if (clk_ephy != NULL) {
error = clk_enable(clk_ephy);
if (error != 0) {
device_printf(dev, "cannot enable ephy clock\n");
goto fail;
}
}
/* De-assert reset */
error = hwreset_deassert(rst_ahb);
if (error != 0) {
device_printf(dev, "cannot de-assert ahb reset\n");
goto fail;
}
if (rst_ephy != NULL) {
error = hwreset_deassert(rst_ephy);
if (error != 0) {
device_printf(dev, "cannot de-assert ephy reset\n");
goto fail;
}
}
/* Enable PHY regulator if applicable */
if (regulator_get_by_ofw_property(dev, 0, "phy-supply", &reg) == 0) {
error = regulator_enable(reg);
if (error != 0) {
device_printf(dev, "cannot enable PHY regulator\n");
goto fail;
}
}
/* Determine MDC clock divide ratio based on AHB clock */
error = clk_get_freq(clk_ahb, &freq);
if (error != 0) {
device_printf(dev, "cannot get AHB clock frequency\n");
goto fail;
}
div = freq / MDIO_FREQ;
if (div <= 16)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_16;
else if (div <= 32)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_32;
else if (div <= 64)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_64;
else if (div <= 128)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_128;
else {
device_printf(dev, "cannot determine MDC clock divide ratio\n");
error = ENXIO;
goto fail;
}
if (bootverbose)
device_printf(dev, "AHB frequency %ju Hz, MDC div: 0x%x\n",
(uintmax_t)freq, sc->mdc_div_ratio_m);
return (0);
fail:
if (reg != NULL)
regulator_release(reg);
if (clk_ephy != NULL)
clk_release(clk_ephy);
if (clk_ahb != NULL)
clk_release(clk_ahb);
if (rst_ephy != NULL)
hwreset_release(rst_ephy);
if (rst_ahb != NULL)
hwreset_release(rst_ahb);
return (error);
}
static void
awg_get_eaddr(device_t dev, uint8_t *eaddr)
{
struct awg_softc *sc;
uint32_t maclo, machi, rnd;
u_char rootkey[16];
sc = device_get_softc(dev);
machi = RD4(sc, EMAC_ADDR_HIGH(0)) & 0xffff;
maclo = RD4(sc, EMAC_ADDR_LOW(0));
if (maclo == 0xffffffff && machi == 0xffff) {
/* MAC address in hardware is invalid, create one */
if (aw_sid_get_rootkey(rootkey) == 0 &&
(rootkey[3] | rootkey[12] | rootkey[13] | rootkey[14] |
rootkey[15]) != 0) {
/* MAC address is derived from the root key in SID */
maclo = (rootkey[13] << 24) | (rootkey[12] << 16) |
(rootkey[3] << 8) | 0x02;
machi = (rootkey[15] << 8) | rootkey[14];
} else {
/* Create one */
rnd = arc4random();
maclo = 0x00f2 | (rnd & 0xffff0000);
machi = rnd & 0xffff;
}
}
eaddr[0] = maclo & 0xff;
eaddr[1] = (maclo >> 8) & 0xff;
eaddr[2] = (maclo >> 16) & 0xff;
eaddr[3] = (maclo >> 24) & 0xff;
eaddr[4] = machi & 0xff;
eaddr[5] = (machi >> 8) & 0xff;
}
#ifdef AWG_DEBUG
static void
awg_dump_regs(device_t dev)
{
static const struct {
const char *name;
u_int reg;
} regs[] = {
{ "BASIC_CTL_0", EMAC_BASIC_CTL_0 },
{ "BASIC_CTL_1", EMAC_BASIC_CTL_1 },
{ "INT_STA", EMAC_INT_STA },
{ "INT_EN", EMAC_INT_EN },
{ "TX_CTL_0", EMAC_TX_CTL_0 },
{ "TX_CTL_1", EMAC_TX_CTL_1 },
{ "TX_FLOW_CTL", EMAC_TX_FLOW_CTL },
{ "TX_DMA_LIST", EMAC_TX_DMA_LIST },
{ "RX_CTL_0", EMAC_RX_CTL_0 },
{ "RX_CTL_1", EMAC_RX_CTL_1 },
{ "RX_DMA_LIST", EMAC_RX_DMA_LIST },
{ "RX_FRM_FLT", EMAC_RX_FRM_FLT },
{ "RX_HASH_0", EMAC_RX_HASH_0 },
{ "RX_HASH_1", EMAC_RX_HASH_1 },
{ "MII_CMD", EMAC_MII_CMD },
{ "ADDR_HIGH0", EMAC_ADDR_HIGH(0) },
{ "ADDR_LOW0", EMAC_ADDR_LOW(0) },
{ "TX_DMA_STA", EMAC_TX_DMA_STA },
{ "TX_DMA_CUR_DESC", EMAC_TX_DMA_CUR_DESC },
{ "TX_DMA_CUR_BUF", EMAC_TX_DMA_CUR_BUF },
{ "RX_DMA_STA", EMAC_RX_DMA_STA },
{ "RX_DMA_CUR_DESC", EMAC_RX_DMA_CUR_DESC },
{ "RX_DMA_CUR_BUF", EMAC_RX_DMA_CUR_BUF },
{ "RGMII_STA", EMAC_RGMII_STA },
};
struct awg_softc *sc;
unsigned int n;
sc = device_get_softc(dev);
for (n = 0; n < nitems(regs); n++)
device_printf(dev, " %-20s %08x\n", regs[n].name,
RD4(sc, regs[n].reg));
}
#endif
#define GPIO_ACTIVE_LOW 1
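/*
* Pulse the PHY reset GPIO named by "allwinner,reset-gpio", honouring
* the three delays (in microseconds) from "allwinner,reset-delays-us".
*/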
static int
awg_phy_reset(device_t dev)
{
pcell_t gpio_prop[4], delay_prop[3];
phandle_t node, gpio_node;
device_t gpio;
uint32_t pin, flags;
uint32_t pin_value;
node = ofw_bus_get_node(dev);
if (OF_getencprop(node, "allwinner,reset-gpio", gpio_prop,
sizeof(gpio_prop)) <= 0)
return (0);
if (OF_getencprop(node, "allwinner,reset-delays-us", delay_prop,
sizeof(delay_prop)) <= 0)
return (ENXIO);
gpio_node = OF_node_from_xref(gpio_prop[0]);
if ((gpio = OF_device_from_xref(gpio_prop[0])) == NULL)
return (ENXIO);
if (GPIO_MAP_GPIOS(gpio, node, gpio_node, nitems(gpio_prop) - 1,
gpio_prop + 1, &pin, &flags) != 0)
return (ENXIO);
pin_value = GPIO_PIN_LOW;
if (OF_hasprop(node, "allwinner,reset-active-low"))
pin_value = GPIO_PIN_HIGH;
if (flags & GPIO_ACTIVE_LOW)
pin_value = !pin_value;
GPIO_PIN_SETFLAGS(gpio, pin, GPIO_PIN_OUTPUT);
GPIO_PIN_SET(gpio, pin, pin_value);
DELAY(delay_prop[0]);
GPIO_PIN_SET(gpio, pin, !pin_value);
DELAY(delay_prop[1]);
GPIO_PIN_SET(gpio, pin, pin_value);
DELAY(delay_prop[2]);
return (0);
}
static int
awg_reset(device_t dev)
{
struct awg_softc *sc;
int retry;
sc = device_get_softc(dev);
/* Reset PHY if necessary */
if (awg_phy_reset(dev) != 0) {
device_printf(dev, "failed to reset PHY\n");
return (ENXIO);
}
/* Soft reset all registers and logic */
WR4(sc, EMAC_BASIC_CTL_1, BASIC_CTL_SOFT_RST);
/* Wait for soft reset bit to self-clear */
for (retry = SOFT_RST_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_BASIC_CTL_1) & BASIC_CTL_SOFT_RST) == 0)
break;
DELAY(10);
}
if (retry == 0) {
device_printf(dev, "soft reset timed out\n");
#ifdef AWG_DEBUG
awg_dump_regs(dev);
#endif
return (ETIMEDOUT);
}
return (0);
}
static void
awg_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
if (error != 0)
return;
*(bus_addr_t *)arg = segs[0].ds_addr;
}
static int
awg_setup_dma(device_t dev)
{
struct awg_softc *sc;
int error, i;
sc = device_get_softc(dev);
/* Setup TX ring */
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
DESC_ALIGN, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
TX_DESC_SIZE, 1, /* maxsize, nsegs */
TX_DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->tx.desc_tag);
if (error != 0) {
device_printf(dev, "cannot create TX descriptor ring tag\n");
return (error);
}
error = bus_dmamem_alloc(sc->tx.desc_tag, (void **)&sc->tx.desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->tx.desc_map);
if (error != 0) {
device_printf(dev, "cannot allocate TX descriptor ring\n");
return (error);
}
error = bus_dmamap_load(sc->tx.desc_tag, sc->tx.desc_map,
sc->tx.desc_ring, TX_DESC_SIZE, awg_dmamap_cb,
&sc->tx.desc_ring_paddr, 0);
if (error != 0) {
device_printf(dev, "cannot load TX descriptor ring\n");
return (error);
}
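/* Link the TX descriptors into a ring through their next pointers. */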
for (i = 0; i < TX_DESC_COUNT; i++)
sc->tx.desc_ring[i].next =
htole32(sc->tx.desc_ring_paddr + DESC_OFF(TX_NEXT(i)));
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
1, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MCLBYTES, TX_MAX_SEGS, /* maxsize, nsegs */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->tx.buf_tag);
if (error != 0) {
device_printf(dev, "cannot create TX buffer tag\n");
return (error);
}
sc->tx.queued = 0;
for (i = 0; i < TX_DESC_COUNT; i++) {
error = bus_dmamap_create(sc->tx.buf_tag, 0,
&sc->tx.buf_map[i].map);
if (error != 0) {
device_printf(dev, "cannot create TX buffer map\n");
return (error);
}
}
/* Setup RX ring */
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
DESC_ALIGN, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
RX_DESC_SIZE, 1, /* maxsize, nsegs */
RX_DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->rx.desc_tag);
if (error != 0) {
device_printf(dev, "cannot create RX descriptor ring tag\n");
return (error);
}
error = bus_dmamem_alloc(sc->rx.desc_tag, (void **)&sc->rx.desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->rx.desc_map);
if (error != 0) {
device_printf(dev, "cannot allocate RX descriptor ring\n");
return (error);
}
error = bus_dmamap_load(sc->rx.desc_tag, sc->rx.desc_map,
sc->rx.desc_ring, RX_DESC_SIZE, awg_dmamap_cb,
&sc->rx.desc_ring_paddr, 0);
if (error != 0) {
device_printf(dev, "cannot load RX descriptor ring\n");
return (error);
}
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
1, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MCLBYTES, 1, /* maxsize, nsegs */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->rx.buf_tag);
if (error != 0) {
device_printf(dev, "cannot create RX buffer tag\n");
return (error);
}
error = bus_dmamap_create(sc->rx.buf_tag, 0, &sc->rx.buf_spare_map);
if (error != 0) {
device_printf(dev,
"cannot create RX buffer spare map\n");
return (error);
}
for (i = 0; i < RX_DESC_COUNT; i++) {
sc->rx.desc_ring[i].next =
htole32(sc->rx.desc_ring_paddr + DESC_OFF(RX_NEXT(i)));
error = bus_dmamap_create(sc->rx.buf_tag, 0,
&sc->rx.buf_map[i].map);
if (error != 0) {
device_printf(dev, "cannot create RX buffer map\n");
return (error);
}
sc->rx.buf_map[i].mbuf = NULL;
error = awg_newbuf_rx(sc, i);
if (error != 0) {
device_printf(dev, "cannot create RX buffer\n");
return (error);
}
}
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREWRITE);
/* Write transmit and receive descriptor base address registers */
WR4(sc, EMAC_TX_DMA_LIST, sc->tx.desc_ring_paddr);
WR4(sc, EMAC_RX_DMA_LIST, sc->rx.desc_ring_paddr);
return (0);
}
static int
awg_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "Allwinner Gigabit Ethernet");
return (BUS_PROBE_DEFAULT);
}
static int
awg_attach(device_t dev)
{
uint8_t eaddr[ETHER_ADDR_LEN];
struct awg_softc *sc;
- phandle_t node;
int error;
sc = device_get_softc(dev);
sc->dev = dev;
sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
- node = ofw_bus_get_node(dev);
if (bus_alloc_resources(dev, awg_spec, sc->res) != 0) {
device_printf(dev, "cannot allocate resources for device\n");
return (ENXIO);
}
mtx_init(&sc->mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF);
callout_init_mtx(&sc->stat_ch, &sc->mtx, 0);
TASK_INIT(&sc->link_task, 0, awg_link_task, sc);
/* Setup clocks and regulators */
error = awg_setup_extres(dev);
if (error != 0)
return (error);
/* Read MAC address before resetting the chip */
awg_get_eaddr(dev, eaddr);
/* Soft reset EMAC core */
error = awg_reset(dev);
if (error != 0)
return (error);
/* Setup DMA descriptors */
error = awg_setup_dma(dev);
if (error != 0)
return (error);
/* Install interrupt handler */
error = bus_setup_intr(dev, sc->res[_RES_IRQ],
INTR_TYPE_NET | INTR_MPSAFE, NULL, awg_intr, sc, &sc->ih);
if (error != 0) {
device_printf(dev, "cannot setup interrupt handler\n");
return (error);
}
/* Setup ethernet interface */
sc->ifp = if_alloc(IFT_ETHER);
if_setsoftc(sc->ifp, sc);
if_initname(sc->ifp, device_get_name(dev), device_get_unit(dev));
if_setflags(sc->ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_setstartfn(sc->ifp, awg_start);
if_setioctlfn(sc->ifp, awg_ioctl);
if_setinitfn(sc->ifp, awg_init);
if_setsendqlen(sc->ifp, TX_DESC_COUNT - 1);
if_setsendqready(sc->ifp);
if_sethwassist(sc->ifp, CSUM_IP | CSUM_UDP | CSUM_TCP);
if_setcapabilities(sc->ifp, IFCAP_VLAN_MTU | IFCAP_HWCSUM);
if_setcapenable(sc->ifp, if_getcapabilities(sc->ifp));
#ifdef DEVICE_POLLING
if_setcapabilitiesbit(sc->ifp, IFCAP_POLLING, 0);
#endif
/* Attach MII driver */
error = mii_attach(dev, &sc->miibus, sc->ifp, awg_media_change,
awg_media_status, BMSR_DEFCAPMASK, MII_PHY_ANY, MII_OFFSET_ANY,
MIIF_DOPAUSE);
if (error != 0) {
device_printf(dev, "cannot attach PHY\n");
return (error);
}
/* Attach ethernet interface */
ether_ifattach(sc->ifp, eaddr);
return (0);
}
static device_method_t awg_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, awg_probe),
DEVMETHOD(device_attach, awg_attach),
/* MII interface */
DEVMETHOD(miibus_readreg, awg_miibus_readreg),
DEVMETHOD(miibus_writereg, awg_miibus_writereg),
DEVMETHOD(miibus_statchg, awg_miibus_statchg),
DEVMETHOD_END
};
static driver_t awg_driver = {
"awg",
awg_methods,
sizeof(struct awg_softc),
};
static devclass_t awg_devclass;
DRIVER_MODULE(awg, simplebus, awg_driver, awg_devclass, 0, 0);
DRIVER_MODULE(miibus, awg, miibus_driver, miibus_devclass, 0, 0);
MODULE_DEPEND(awg, ether, 1, 1, 1);
MODULE_DEPEND(awg, miibus, 1, 1, 1);
Index: head/sys/arm/arm/gic.c
===================================================================
--- head/sys/arm/arm/gic.c (revision 327172)
+++ head/sys/arm/arm/gic.c (revision 327173)
@@ -1,1599 +1,1597 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2011 The FreeBSD Foundation
* All rights reserved.
*
* Developed by Damjan Marion <damjan.marion@gmail.com>
*
* Based on OMAP4 GIC code by Ben Gray
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company nor the name of the author may be used to
* endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/rman.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#ifdef INTRNG
#include <sys/sched.h>
#endif
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <machine/smp.h>
#ifdef FDT
#include <dev/fdt/fdt_intr.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include <arm/arm/gic.h>
#include <arm/arm/gic_common.h>
#ifdef INTRNG
#include "pic_if.h"
#include "msi_if.h"
#endif
/* We are using GICv2 register naming */
/* Distributor Registers */
/* CPU Registers */
#define GICC_CTLR 0x0000 /* v1 ICCICR */
#define GICC_PMR 0x0004 /* v1 ICCPMR */
#define GICC_BPR 0x0008 /* v1 ICCBPR */
#define GICC_IAR 0x000C /* v1 ICCIAR */
#define GICC_EOIR 0x0010 /* v1 ICCEOIR */
#define GICC_RPR 0x0014 /* v1 ICCRPR */
#define GICC_HPPIR 0x0018 /* v1 ICCHPIR */
#define GICC_ABPR 0x001C /* v1 ICCABPR */
#define GICC_IIDR 0x00FC /* v1 ICCIIDR*/
/* TYPER Registers */
#define GICD_TYPER_SECURITYEXT 0x400
#define GIC_SUPPORT_SECEXT(_sc) \
((_sc->typer & GICD_TYPER_SECURITYEXT) == GICD_TYPER_SECURITYEXT)
#ifndef GIC_DEFAULT_ICFGR_INIT
#define GIC_DEFAULT_ICFGR_INIT 0x00000000
#endif
#ifdef INTRNG
struct gic_irqsrc {
struct intr_irqsrc gi_isrc;
uint32_t gi_irq;
enum intr_polarity gi_pol;
enum intr_trigger gi_trig;
#define GI_FLAG_EARLY_EOI (1 << 0)
#define GI_FLAG_MSI (1 << 1) /* This interrupt source should only */
/* be used for MSI/MSI-X interrupts */
#define GI_FLAG_MSI_USED (1 << 2) /* This irq is already allocated */
/* for an MSI/MSI-X interrupt */
u_int gi_flags;
};
static u_int gic_irq_cpu;
static int arm_gic_bind_intr(device_t dev, struct intr_irqsrc *isrc);
#ifdef SMP
static u_int sgi_to_ipi[GIC_LAST_SGI - GIC_FIRST_SGI + 1];
static u_int sgi_first_unused = GIC_FIRST_SGI;
#endif
#define GIC_INTR_ISRC(sc, irq) (&sc->gic_irqs[irq].gi_isrc)
#else /* !INTRNG */
static struct ofw_compat_data compat_data[] = {
{"arm,gic", true}, /* Non-standard, used in FreeBSD dts. */
{"arm,gic-400", true},
{"arm,cortex-a15-gic", true},
{"arm,cortex-a9-gic", true},
{"arm,cortex-a7-gic", true},
{"arm,arm11mp-gic", true},
{"brcm,brahma-b15-gic", true},
{"qcom,msm-qgic2", true},
{NULL, false}
};
#endif
static struct resource_spec arm_gic_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Distributor registers */
{ SYS_RES_MEMORY, 1, RF_ACTIVE }, /* CPU Interrupt Intf. registers */
#ifdef INTRNG
{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* Parent interrupt */
#endif
{ -1, 0 }
};
#if defined(__arm__) && defined(INVARIANTS)
static int gic_debug_spurious = 1;
#else
static int gic_debug_spurious = 0;
#endif
TUNABLE_INT("hw.gic.debug_spurious", &gic_debug_spurious);
static u_int arm_gic_map[MAXCPU];
static struct arm_gic_softc *gic_sc = NULL;
#define gic_c_read_4(_sc, _reg) \
bus_space_read_4((_sc)->gic_c_bst, (_sc)->gic_c_bsh, (_reg))
#define gic_c_write_4(_sc, _reg, _val) \
bus_space_write_4((_sc)->gic_c_bst, (_sc)->gic_c_bsh, (_reg), (_val))
#define gic_d_read_4(_sc, _reg) \
bus_space_read_4((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg))
#define gic_d_write_1(_sc, _reg, _val) \
bus_space_write_1((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg), (_val))
#define gic_d_write_4(_sc, _reg, _val) \
bus_space_write_4((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg), (_val))
#ifndef INTRNG
static int gic_config_irq(int irq, enum intr_trigger trig,
enum intr_polarity pol);
static void gic_post_filter(void *);
#endif
#ifdef INTRNG
static inline void
gic_irq_unmask(struct arm_gic_softc *sc, u_int irq)
{
gic_d_write_4(sc, GICD_ISENABLER(irq), GICD_I_MASK(irq));
}
static inline void
gic_irq_mask(struct arm_gic_softc *sc, u_int irq)
{
gic_d_write_4(sc, GICD_ICENABLER(irq), GICD_I_MASK(irq));
}
#endif
static uint8_t
gic_cpu_mask(struct arm_gic_softc *sc)
{
uint32_t mask;
int i;
/* Read the current cpuid mask by reading ITARGETSR{0..7} */
for (i = 0; i < 8; i++) {
mask = gic_d_read_4(sc, GICD_ITARGETSR(4 * i));
if (mask != 0)
break;
}
/* No mask found, assume we are on CPU interface 0 */
if (mask == 0)
return (1);
/* Collect the mask in the lower byte */
mask |= mask >> 16;
mask |= mask >> 8;
return (mask);
}
#ifdef SMP
#ifdef INTRNG
static void
arm_gic_init_secondary(device_t dev)
{
struct arm_gic_softc *sc = device_get_softc(dev);
u_int irq, cpu;
/* Set the mask so we can find this CPU to send it IPIs */
cpu = PCPU_GET(cpuid);
arm_gic_map[cpu] = gic_cpu_mask(sc);
for (irq = 0; irq < sc->nirqs; irq += 4)
gic_d_write_4(sc, GICD_IPRIORITYR(irq), 0);
/* Set all the interrupts to be in Group 0 (secure) */
for (irq = 0; GIC_SUPPORT_SECEXT(sc) && irq < sc->nirqs; irq += 32) {
gic_d_write_4(sc, GICD_IGROUPR(irq), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
/* Unmask attached SGI interrupts. */
for (irq = GIC_FIRST_SGI; irq <= GIC_LAST_SGI; irq++)
if (intr_isrc_init_on_cpu(GIC_INTR_ISRC(sc, irq), cpu))
gic_irq_unmask(sc, irq);
/* Unmask attached PPI interrupts. */
for (irq = GIC_FIRST_PPI; irq <= GIC_LAST_PPI; irq++)
if (intr_isrc_init_on_cpu(GIC_INTR_ISRC(sc, irq), cpu))
gic_irq_unmask(sc, irq);
}
#else
static void
arm_gic_init_secondary(device_t dev)
{
struct arm_gic_softc *sc = device_get_softc(dev);
int i;
/* Set the mask so we can find this CPU to send it IPIs */
arm_gic_map[PCPU_GET(cpuid)] = gic_cpu_mask(sc);
for (i = 0; i < sc->nirqs; i += 4)
gic_d_write_4(sc, GICD_IPRIORITYR(i), 0);
/* Set all the interrupts to be in Group 0 (secure) */
for (i = 0; GIC_SUPPORT_SECEXT(sc) && i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_IGROUPR(i), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
/*
* Activate the timer interrupts: virtual, secure, and non-secure.
*/
gic_d_write_4(sc, GICD_ISENABLER(27), GICD_I_MASK(27));
gic_d_write_4(sc, GICD_ISENABLER(29), GICD_I_MASK(29));
gic_d_write_4(sc, GICD_ISENABLER(30), GICD_I_MASK(30));
}
#endif /* INTRNG */
#endif /* SMP */
#ifndef INTRNG
int
gic_decode_fdt(phandle_t iparent, pcell_t *intr, int *interrupt,
int *trig, int *pol)
{
static u_int num_intr_cells;
static phandle_t self;
struct ofw_compat_data *ocd;
if (self == 0) {
for (ocd = compat_data; ocd->ocd_str != NULL; ocd++) {
if (ofw_bus_node_is_compatible(iparent, ocd->ocd_str)) {
self = iparent;
break;
}
}
}
if (self != iparent)
return (ENXIO);
if (num_intr_cells == 0) {
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &num_intr_cells,
sizeof(num_intr_cells)) == -1) {
num_intr_cells = 1;
}
}
if (num_intr_cells == 1) {
*interrupt = fdt32_to_cpu(intr[0]);
*trig = INTR_TRIGGER_CONFORM;
*pol = INTR_POLARITY_CONFORM;
} else {
if (fdt32_to_cpu(intr[0]) == 0)
*interrupt = fdt32_to_cpu(intr[1]) + GIC_FIRST_SPI;
else
*interrupt = fdt32_to_cpu(intr[1]) + GIC_FIRST_PPI;
/*
* In intr[2], bits[3:0] are trigger type and level flags.
* 1 = low-to-high edge triggered
* 2 = high-to-low edge triggered
* 4 = active high level-sensitive
* 8 = active low level-sensitive
* The hardware only supports active-high level-sensitive or
* rising-edge triggered configurations for SPIs.
*/
if (*interrupt >= GIC_FIRST_SPI &&
fdt32_to_cpu(intr[2]) & 0x0a) {
printf("unsupported trigger/polarity configuration "
"0x%02x\n", fdt32_to_cpu(intr[2]) & 0x0f);
}
*pol = INTR_POLARITY_CONFORM;
if (fdt32_to_cpu(intr[2]) & 0x03)
*trig = INTR_TRIGGER_EDGE;
else
*trig = INTR_TRIGGER_LEVEL;
}
return (0);
}
#endif
#ifdef INTRNG
static int
arm_gic_register_isrcs(struct arm_gic_softc *sc, uint32_t num)
{
int error;
uint32_t irq;
struct gic_irqsrc *irqs;
struct intr_irqsrc *isrc;
const char *name;
irqs = malloc(num * sizeof(struct gic_irqsrc), M_DEVBUF,
M_WAITOK | M_ZERO);
name = device_get_nameunit(sc->gic_dev);
for (irq = 0; irq < num; irq++) {
irqs[irq].gi_irq = irq;
irqs[irq].gi_pol = INTR_POLARITY_CONFORM;
irqs[irq].gi_trig = INTR_TRIGGER_CONFORM;
isrc = &irqs[irq].gi_isrc;
if (irq <= GIC_LAST_SGI) {
error = intr_isrc_register(isrc, sc->gic_dev,
INTR_ISRCF_IPI, "%s,i%u", name, irq - GIC_FIRST_SGI);
} else if (irq <= GIC_LAST_PPI) {
error = intr_isrc_register(isrc, sc->gic_dev,
INTR_ISRCF_PPI, "%s,p%u", name, irq - GIC_FIRST_PPI);
} else {
error = intr_isrc_register(isrc, sc->gic_dev, 0,
"%s,s%u", name, irq - GIC_FIRST_SPI);
}
if (error != 0) {
/* XXX call intr_isrc_deregister() */
free(irqs, M_DEVBUF);
return (error);
}
}
sc->gic_irqs = irqs;
sc->nirqs = num;
return (0);
}
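/*
* Mark a contiguous block of SPIs as reserved for MSI/MSI-X use; they
* are configured edge-triggered, active-high and flagged GI_FLAG_MSI
* so they are only handed out for MSI/MSI-X interrupts.
*/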
static void
arm_gic_reserve_msi_range(device_t dev, u_int start, u_int count)
{
struct arm_gic_softc *sc;
int i;
sc = device_get_softc(dev);
KASSERT((start + count) < sc->nirqs,
("%s: Trying to allocate too many MSI IRQs: %d + %d > %d", __func__,
start, count, sc->nirqs));
for (i = 0; i < count; i++) {
KASSERT(sc->gic_irqs[start + i].gi_isrc.isrc_handlers == 0,
("%s: MSI interrupt %d already has a handler", __func__,
start + i));
KASSERT(sc->gic_irqs[start + i].gi_pol == INTR_POLARITY_CONFORM,
("%s: MSI interrupt %d already has a polarity", __func__,
start + i));
KASSERT(sc->gic_irqs[start + i].gi_trig == INTR_TRIGGER_CONFORM,
("%s: MSI interrupt %d already has a trigger", __func__,
start + i));
sc->gic_irqs[start + i].gi_pol = INTR_POLARITY_HIGH;
sc->gic_irqs[start + i].gi_trig = INTR_TRIGGER_EDGE;
sc->gic_irqs[start + i].gi_flags |= GI_FLAG_MSI;
}
}
#endif
int
arm_gic_attach(device_t dev)
{
struct arm_gic_softc *sc;
int i;
uint32_t icciidr, mask, nirqs;
if (gic_sc)
return (ENXIO);
sc = device_get_softc(dev);
if (bus_alloc_resources(dev, arm_gic_spec, sc->gic_res)) {
device_printf(dev, "could not allocate resources\n");
return (ENXIO);
}
sc->gic_dev = dev;
gic_sc = sc;
/* Initialize mutex */
mtx_init(&sc->mutex, "GIC lock", "", MTX_SPIN);
/* Distributor Interface */
sc->gic_d_bst = rman_get_bustag(sc->gic_res[0]);
sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[0]);
/* CPU Interface */
sc->gic_c_bst = rman_get_bustag(sc->gic_res[1]);
sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[1]);
/* Disable interrupt forwarding to the CPU interface */
gic_d_write_4(sc, GICD_CTLR, 0x00);
/* Get the number of interrupts */
sc->typer = gic_d_read_4(sc, GICD_TYPER);
nirqs = GICD_TYPER_I_NUM(sc->typer);
#ifdef INTRNG
if (arm_gic_register_isrcs(sc, nirqs)) {
device_printf(dev, "could not register irqs\n");
goto cleanup;
}
#else
sc->nirqs = nirqs;
/* Set up function pointers */
arm_post_filter = gic_post_filter;
arm_config_irq = gic_config_irq;
#endif
icciidr = gic_c_read_4(sc, GICC_IIDR);
device_printf(dev,
"pn 0x%x, arch 0x%x, rev 0x%x, implementer 0x%x irqs %u\n",
GICD_IIDR_PROD(icciidr), GICD_IIDR_VAR(icciidr),
GICD_IIDR_REV(icciidr), GICD_IIDR_IMPL(icciidr), sc->nirqs);
#ifdef INTRNG
sc->gic_iidr = icciidr;
#endif
/* Set all global interrupts to be level triggered, active low. */
for (i = 32; i < sc->nirqs; i += 16) {
gic_d_write_4(sc, GICD_ICFGR(i), GIC_DEFAULT_ICFGR_INIT);
}
/* Disable all interrupts. */
for (i = 32; i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_ICENABLER(i), 0xFFFFFFFF);
}
/* Find the current cpu mask */
mask = gic_cpu_mask(sc);
/* Set the mask so we can find this CPU to send it IPIs */
arm_gic_map[PCPU_GET(cpuid)] = mask;
/* Set all four targets to this cpu */
mask |= mask << 8;
mask |= mask << 16;
for (i = 0; i < sc->nirqs; i += 4) {
gic_d_write_4(sc, GICD_IPRIORITYR(i), 0);
if (i > 32) {
gic_d_write_4(sc, GICD_ITARGETSR(i), mask);
}
}
/* Set all the interrupts to be in Group 0 (secure) */
for (i = 0; GIC_SUPPORT_SECEXT(sc) && i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_IGROUPR(i), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
return (0);
#ifdef INTRNG
cleanup:
arm_gic_detach(dev);
return (ENXIO);
#endif
}
int
arm_gic_detach(device_t dev)
{
#ifdef INTRNG
struct arm_gic_softc *sc;
sc = device_get_softc(dev);
if (sc->gic_irqs != NULL)
free(sc->gic_irqs, M_DEVBUF);
bus_release_resources(dev, arm_gic_spec, sc->gic_res);
#endif
return (0);
}
#ifdef INTRNG
static int
arm_gic_print_child(device_t bus, device_t child)
{
struct resource_list *rl;
int rv;
rv = bus_print_child_header(bus, child);
rl = BUS_GET_RESOURCE_LIST(bus, child);
if (rl != NULL) {
rv += resource_list_print_type(rl, "mem", SYS_RES_MEMORY,
"%#jx");
rv += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
}
rv += bus_print_child_footer(bus, child);
return (rv);
}
static struct resource *
arm_gic_alloc_resource(device_t bus, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct arm_gic_softc *sc;
struct resource_list_entry *rle;
struct resource_list *rl;
int j;
KASSERT(type == SYS_RES_MEMORY, ("Invalid resource type %x", type));
sc = device_get_softc(bus);
/*
* Request for the default allocation with a given rid: use resource
* list stored in the local device info.
*/
if (RMAN_IS_DEFAULT_RANGE(start, end)) {
rl = BUS_GET_RESOURCE_LIST(bus, child);
if (type == SYS_RES_IOPORT)
type = SYS_RES_MEMORY;
rle = resource_list_find(rl, type, *rid);
if (rle == NULL) {
if (bootverbose)
device_printf(bus, "no default resources for "
"rid = %d, type = %d\n", *rid, type);
return (NULL);
}
start = rle->start;
end = rle->end;
count = rle->count;
}
/* Remap through ranges property */
for (j = 0; j < sc->nranges; j++) {
if (start >= sc->ranges[j].bus && end <
sc->ranges[j].bus + sc->ranges[j].size) {
start -= sc->ranges[j].bus;
start += sc->ranges[j].host;
end -= sc->ranges[j].bus;
end += sc->ranges[j].host;
break;
}
}
if (j == sc->nranges && sc->nranges != 0) {
if (bootverbose)
device_printf(bus, "Could not map resource "
"%#jx-%#jx\n", (uintmax_t)start, (uintmax_t)end);
return (NULL);
}
return (bus_generic_alloc_resource(bus, child, type, rid, start, end,
count, flags));
}
static int
arm_gic_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct arm_gic_softc *sc;
sc = device_get_softc(dev);
switch(which) {
case GIC_IVAR_HW_REV:
KASSERT(GICD_IIDR_VAR(sc->gic_iidr) < 3 &&
GICD_IIDR_VAR(sc->gic_iidr) != 0,
("arm_gic_read_ivar: Unknown IIDR revision %u (%.08x)",
GICD_IIDR_VAR(sc->gic_iidr), sc->gic_iidr));
*result = GICD_IIDR_VAR(sc->gic_iidr);
return (0);
case GIC_IVAR_BUS:
KASSERT(sc->gic_bus != GIC_BUS_UNKNOWN,
("arm_gic_read_ivar: Unknown bus type"));
KASSERT(sc->gic_bus <= GIC_BUS_MAX,
("arm_gic_read_ivar: Invalid bus type %u", sc->gic_bus));
*result = sc->gic_bus;
return (0);
}
return (ENOENT);
}
int
arm_gic_intr(void *arg)
{
struct arm_gic_softc *sc = arg;
struct gic_irqsrc *gi;
uint32_t irq_active_reg, irq;
struct trapframe *tf;
irq_active_reg = gic_c_read_4(sc, GICC_IAR);
irq = irq_active_reg & 0x3FF;
/*
* 1. We do the EOI here because the most recently read value from the
* active interrupt register must be used for it. Another approach is
* to save this value in the associated interrupt source.
* 2. EOI must be done on same CPU where interrupt has fired. Thus
* we must ensure that interrupted thread does not migrate to
* another CPU.
* 3. EOI cannot be delayed by any preemption which could happen on
* critical_exit() used in MI intr code, when interrupt thread is
* scheduled. See next point.
* 4. IPI_RENDEZVOUS assumes that no preemption is permitted during
* an action and any use of critical_exit() could break this
* assumption. See comments within smp_rendezvous_action().
* 5. We always return FILTER_HANDLED as this is an interrupt
* controller dispatch function. Otherwise, in cascaded interrupt
* case, the whole interrupt subtree would be masked.
*/
if (irq >= sc->nirqs) {
if (gic_debug_spurious)
device_printf(sc->gic_dev,
"Spurious interrupt detected: last irq: %d on CPU%d\n",
sc->last_irq[PCPU_GET(cpuid)], PCPU_GET(cpuid));
return (FILTER_HANDLED);
}
tf = curthread->td_intr_frame;
dispatch_irq:
gi = sc->gic_irqs + irq;
/*
* Note that GIC_FIRST_SGI is zero and is not used in the 'if' statement
* because the compiler complains that comparing u_int >= 0 is always true.
*/
if (irq <= GIC_LAST_SGI) {
#ifdef SMP
/* Call EOI for all IPI before dispatch. */
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
intr_ipi_dispatch(sgi_to_ipi[gi->gi_irq], tf);
goto next_irq;
#else
device_printf(sc->gic_dev, "SGI %u on UP system detected\n",
irq - GIC_FIRST_SGI);
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
goto next_irq;
#endif
}
if (gic_debug_spurious)
sc->last_irq[PCPU_GET(cpuid)] = irq;
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) == GI_FLAG_EARLY_EOI)
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
if (intr_isrc_dispatch(&gi->gi_isrc, tf) != 0) {
gic_irq_mask(sc, irq);
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) != GI_FLAG_EARLY_EOI)
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
device_printf(sc->gic_dev, "Stray irq %u disabled\n", irq);
}
next_irq:
arm_irq_memory_barrier(irq);
irq_active_reg = gic_c_read_4(sc, GICC_IAR);
irq = irq_active_reg & 0x3FF;
if (irq < sc->nirqs)
goto dispatch_irq;
return (FILTER_HANDLED);
}
static void
gic_config(struct arm_gic_softc *sc, u_int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
uint32_t reg;
uint32_t mask;
if (irq < GIC_FIRST_SPI)
return;
mtx_lock_spin(&sc->mutex);
reg = gic_d_read_4(sc, GICD_ICFGR(irq));
mask = (reg >> 2*(irq % 16)) & 0x3;
if (pol == INTR_POLARITY_LOW) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_LOW;
} else if (pol == INTR_POLARITY_HIGH) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_HIGH;
}
if (trig == INTR_TRIGGER_LEVEL) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_LVL;
} else if (trig == INTR_TRIGGER_EDGE) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_EDGE;
}
/* Set mask */
reg = reg & ~(0x3 << 2*(irq % 16));
reg = reg | (mask << 2*(irq % 16));
gic_d_write_4(sc, GICD_ICFGR(irq), reg);
mtx_unlock_spin(&sc->mutex);
}
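/*
* A worked example of the ICFGR layout assumed by gic_config() above: each
* interrupt has a 2-bit configuration field and sixteen interrupts share one
* 32-bit register, hence the shift by 2*(irq % 16). For a hypothetical
* irq 37, the field occupies bits [11:10] of the word selected by
* GICD_ICFGR(37).
*/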
static int
gic_bind(struct arm_gic_softc *sc, u_int irq, cpuset_t *cpus)
{
uint32_t cpu, end, mask;
end = min(mp_ncpus, 8);
for (cpu = end; cpu < MAXCPU; cpu++)
if (CPU_ISSET(cpu, cpus))
return (EINVAL);
for (mask = 0, cpu = 0; cpu < end; cpu++)
if (CPU_ISSET(cpu, cpus))
mask |= arm_gic_map[cpu];
gic_d_write_1(sc, GICD_ITARGETSR(0) + irq, mask);
return (0);
}
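/*
* A sketch of the target mask built by gic_bind(): arm_gic_map[] holds the
* per-CPU GIC target bit recorded during init, so binding to the cpuset
* {0, 2} with, hypothetically, arm_gic_map[0] == 0x01 and
* arm_gic_map[2] == 0x04 yields mask 0x05, written as the single per-SPI
* byte of ITARGETSR.
*/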
#ifdef FDT
static int
gic_map_fdt(device_t dev, u_int ncells, pcell_t *cells, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
if (ncells == 1) {
*irqp = cells[0];
*polp = INTR_POLARITY_CONFORM;
*trigp = INTR_TRIGGER_CONFORM;
return (0);
}
if (ncells == 3) {
u_int irq, tripol;
/*
* The 1st cell is the interrupt type:
* 0 = SPI
* 1 = PPI
* The 2nd cell contains the interrupt number:
* [0 - 987] for SPI
* [0 - 15] for PPI
* The 3rd cell is the flags, encoded as follows:
* bits[3:0] trigger type and level flags
* 1 = low-to-high edge triggered
* 2 = high-to-low edge triggered
* 4 = active high level-sensitive
* 8 = active low level-sensitive
* bits[15:8] PPI interrupt cpu mask
* Each bit corresponds to each of the 8 possible cpus
* attached to the GIC. A bit set to '1' indicated
* the interrupt is wired to that CPU.
*/
switch (cells[0]) {
case 0:
irq = GIC_FIRST_SPI + cells[1];
/* SPI irq is checked later. */
break;
case 1:
irq = GIC_FIRST_PPI + cells[1];
if (irq > GIC_LAST_PPI) {
device_printf(dev, "unsupported PPI interrupt "
"number %u\n", cells[1]);
return (EINVAL);
}
break;
default:
device_printf(dev, "unsupported interrupt type "
"configuration %u\n", cells[0]);
return (EINVAL);
}
tripol = cells[2] & 0xff;
if (tripol & 0xf0 || (tripol & FDT_INTR_LOW_MASK &&
cells[0] == 0))
device_printf(dev, "unsupported trigger/polarity "
"configuration 0x%02x\n", tripol);
*irqp = irq;
*polp = INTR_POLARITY_CONFORM;
*trigp = tripol & FDT_INTR_EDGE_MASK ?
INTR_TRIGGER_EDGE : INTR_TRIGGER_LEVEL;
return (0);
}
return (EINVAL);
}
#endif
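/*
* An example of the three-cell encoding handled by gic_map_fdt() above,
* assuming a typical device node:
*
* interrupts = <0 61 4>;
*
* i.e. an SPI, interrupt number 61, active-high level-sensitive, which maps
* to GIC irq GIC_FIRST_SPI + 61 with INTR_TRIGGER_LEVEL.
*/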
static int
gic_map_msi(device_t dev, struct intr_map_data_msi *msi_data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_irqsrc *gi;
/* Map a non-GICv2m MSI */
gi = (struct gic_irqsrc *)msi_data->isrc;
if (gi == NULL)
return (ENXIO);
*irqp = gi->gi_irq;
/* MSI/MSI-X interrupts are always edge triggered with high polarity */
*polp = INTR_POLARITY_HIGH;
*trigp = INTR_TRIGGER_EDGE;
return (0);
}
static int
gic_map_intr(device_t dev, struct intr_map_data *data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
u_int irq;
enum intr_polarity pol;
enum intr_trigger trig;
struct arm_gic_softc *sc;
struct intr_map_data_msi *dam;
#ifdef FDT
struct intr_map_data_fdt *daf;
#endif
sc = device_get_softc(dev);
switch (data->type) {
#ifdef FDT
case INTR_MAP_DATA_FDT:
daf = (struct intr_map_data_fdt *)data;
if (gic_map_fdt(dev, daf->ncells, daf->cells, &irq, &pol,
&trig) != 0)
return (EINVAL);
KASSERT(irq >= sc->nirqs ||
(sc->gic_irqs[irq].gi_flags & GI_FLAG_MSI) == 0,
("%s: Attempting to map a MSI interrupt from FDT",
__func__));
break;
#endif
case INTR_MAP_DATA_MSI:
/* Non-GICv2m MSI */
dam = (struct intr_map_data_msi *)data;
if (gic_map_msi(dev, dam, &irq, &pol, &trig) != 0)
return (EINVAL);
break;
default:
return (ENOTSUP);
}
if (irq >= sc->nirqs)
return (EINVAL);
if (pol != INTR_POLARITY_CONFORM && pol != INTR_POLARITY_LOW &&
pol != INTR_POLARITY_HIGH)
return (EINVAL);
if (trig != INTR_TRIGGER_CONFORM && trig != INTR_TRIGGER_EDGE &&
trig != INTR_TRIGGER_LEVEL)
return (EINVAL);
*irqp = irq;
if (polp != NULL)
*polp = pol;
if (trigp != NULL)
*trigp = trig;
return (0);
}
static int
arm_gic_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
int error;
u_int irq;
struct arm_gic_softc *sc;
error = gic_map_intr(dev, data, &irq, NULL, NULL);
if (error == 0) {
sc = device_get_softc(dev);
*isrcp = GIC_INTR_ISRC(sc, irq);
}
return (error);
}
static int
arm_gic_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
enum intr_trigger trig;
enum intr_polarity pol;
if ((gi->gi_flags & GI_FLAG_MSI) == GI_FLAG_MSI) {
/* GICv2m MSI */
pol = gi->gi_pol;
trig = gi->gi_trig;
KASSERT(pol == INTR_POLARITY_HIGH,
("%s: MSI interrupts must be active-high", __func__));
KASSERT(trig == INTR_TRIGGER_EDGE,
("%s: MSI interrupts must be edge triggered", __func__));
} else if (data != NULL) {
u_int irq;
/* Get config for resource. */
if (gic_map_intr(dev, data, &irq, &pol, &trig) ||
gi->gi_irq != irq)
return (EINVAL);
} else {
pol = INTR_POLARITY_CONFORM;
trig = INTR_TRIGGER_CONFORM;
}
/* Compare config if this is not first setup. */
if (isrc->isrc_handlers != 0) {
if ((pol != INTR_POLARITY_CONFORM && pol != gi->gi_pol) ||
(trig != INTR_TRIGGER_CONFORM && trig != gi->gi_trig))
return (EINVAL);
else
return (0);
}
/* For MSI/MSI-X we should have already configured these */
if ((gi->gi_flags & GI_FLAG_MSI) == 0) {
if (pol == INTR_POLARITY_CONFORM)
pol = INTR_POLARITY_LOW; /* just pick some */
if (trig == INTR_TRIGGER_CONFORM)
trig = INTR_TRIGGER_EDGE; /* just pick some */
gi->gi_pol = pol;
gi->gi_trig = trig;
/* Edge triggered interrupts need an early EOI sent */
if (gi->gi_trig == INTR_TRIGGER_EDGE)
gi->gi_flags |= GI_FLAG_EARLY_EOI;
}
/*
* XXX - If a per-CPU interrupt is enabled at a time when SMP has
* already been started, we need some IPI call which enables it
* on the other CPUs. Further, it's more complicated as
* pic_enable_source() and pic_disable_source() should act on a
* per-CPU basis only. Thus, it should be solved here somehow.
*/
if (isrc->isrc_flags & INTR_ISRCF_PPI)
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
gic_config(sc, gi->gi_irq, gi->gi_trig, gi->gi_pol);
arm_gic_bind_intr(dev, isrc);
return (0);
}
static int
arm_gic_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
if (isrc->isrc_handlers == 0 && (gi->gi_flags & GI_FLAG_MSI) == 0) {
gi->gi_pol = INTR_POLARITY_CONFORM;
gi->gi_trig = INTR_TRIGGER_CONFORM;
}
return (0);
}
static void
arm_gic_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
arm_irq_memory_barrier(gi->gi_irq);
gic_irq_unmask(sc, gi->gi_irq);
}
static void
arm_gic_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
gic_irq_mask(sc, gi->gi_irq);
}
static void
arm_gic_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
arm_gic_disable_intr(dev, isrc);
gic_c_write_4(sc, GICC_EOIR, gi->gi_irq);
}
static void
arm_gic_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
arm_irq_memory_barrier(0);
arm_gic_enable_intr(dev, isrc);
}
static void
arm_gic_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
/* EOI for edge-triggered done earlier. */
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) == GI_FLAG_EARLY_EOI)
return;
arm_irq_memory_barrier(0);
gic_c_write_4(sc, GICC_EOIR, gi->gi_irq);
}
static int
arm_gic_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
if (gi->gi_irq < GIC_FIRST_SPI)
return (EINVAL);
if (CPU_EMPTY(&isrc->isrc_cpu)) {
gic_irq_cpu = intr_irq_next_cpu(gic_irq_cpu, &all_cpus);
CPU_SETOF(gic_irq_cpu, &isrc->isrc_cpu);
}
return (gic_bind(sc, gi->gi_irq, &isrc->isrc_cpu));
}
#ifdef SMP
static void
arm_gic_ipi_send(device_t dev, struct intr_irqsrc *isrc, cpuset_t cpus,
u_int ipi)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
uint32_t val = 0, i;
for (i = 0; i < MAXCPU; i++)
if (CPU_ISSET(i, &cpus))
val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT;
gic_d_write_4(sc, GICD_SGIR, val | gi->gi_irq);
}
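/*
* A sketch of the SGIR value written above: the low bits carry the SGI
* number (gi_irq) and the per-CPU target bits from arm_gic_map[] are shifted
* into the target-list field by GICD_SGI_TARGET_SHIFT, so a single write
* raises the IPI on every CPU present in 'cpus'.
*/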
static int
arm_gic_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc **isrcp)
{
struct intr_irqsrc *isrc;
struct arm_gic_softc *sc = device_get_softc(dev);
if (sgi_first_unused > GIC_LAST_SGI)
return (ENOSPC);
isrc = GIC_INTR_ISRC(sc, sgi_first_unused);
sgi_to_ipi[sgi_first_unused++] = ipi;
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
*isrcp = isrc;
return (0);
}
#endif
#else
static int
arm_gic_next_irq(struct arm_gic_softc *sc, int last_irq)
{
uint32_t active_irq;
active_irq = gic_c_read_4(sc, GICC_IAR);
/*
* Immediately EOIR the SGIs, because doing so requires the other
* bits (ie CPU number), not just the IRQ number, and we do not
* have this information later.
*/
if ((active_irq & 0x3ff) <= GIC_LAST_SGI)
gic_c_write_4(sc, GICC_EOIR, active_irq);
active_irq &= 0x3FF;
if (active_irq == 0x3FF) {
if (last_irq == -1)
device_printf(sc->gic_dev,
"Spurious interrupt detected\n");
return -1;
}
return active_irq;
}
static int
arm_gic_config(device_t dev, int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
struct arm_gic_softc *sc = device_get_softc(dev);
uint32_t reg;
uint32_t mask;
/* Function is public-accessible, so validate input arguments */
if ((irq < 0) || (irq >= sc->nirqs))
goto invalid_args;
if ((trig != INTR_TRIGGER_EDGE) && (trig != INTR_TRIGGER_LEVEL) &&
(trig != INTR_TRIGGER_CONFORM))
goto invalid_args;
if ((pol != INTR_POLARITY_HIGH) && (pol != INTR_POLARITY_LOW) &&
(pol != INTR_POLARITY_CONFORM))
goto invalid_args;
mtx_lock_spin(&sc->mutex);
reg = gic_d_read_4(sc, GICD_ICFGR(irq));
mask = (reg >> 2*(irq % 16)) & 0x3;
if (pol == INTR_POLARITY_LOW) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_LOW;
} else if (pol == INTR_POLARITY_HIGH) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_HIGH;
}
if (trig == INTR_TRIGGER_LEVEL) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_LVL;
} else if (trig == INTR_TRIGGER_EDGE) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_EDGE;
}
/* Set mask */
reg = reg & ~(0x3 << 2*(irq % 16));
reg = reg | (mask << 2*(irq % 16));
gic_d_write_4(sc, GICD_ICFGR(irq), reg);
mtx_unlock_spin(&sc->mutex);
return (0);
invalid_args:
device_printf(dev, "gic_config_irq: invalid parameters\n");
return (EINVAL);
}
static void
arm_gic_mask(device_t dev, int irq)
{
struct arm_gic_softc *sc = device_get_softc(dev);
gic_d_write_4(sc, GICD_ICENABLER(irq), (1UL << (irq & 0x1F)));
gic_c_write_4(sc, GICC_EOIR, irq); /* XXX - not allowed */
}
static void
arm_gic_unmask(device_t dev, int irq)
{
struct arm_gic_softc *sc = device_get_softc(dev);
if (irq > GIC_LAST_SGI)
arm_irq_memory_barrier(irq);
gic_d_write_4(sc, GICD_ISENABLER(irq), (1UL << (irq & 0x1F)));
}
#ifdef SMP
static void
arm_gic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi)
{
struct arm_gic_softc *sc = device_get_softc(dev);
uint32_t val = 0, i;
for (i = 0; i < MAXCPU; i++)
if (CPU_ISSET(i, &cpus))
val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT;
gic_d_write_4(sc, GICD_SGIR, val | ipi);
}
static int
arm_gic_ipi_read(device_t dev, int i)
{
if (i != -1) {
/*
* The intr code will automagically give the frame pointer
* if the interrupt argument is 0.
*/
if ((unsigned int)i > 16)
return (0);
return (i);
}
return (0x3ff);
}
static void
arm_gic_ipi_clear(device_t dev, int ipi)
{
/* no-op */
}
#endif
static void
gic_post_filter(void *arg)
{
struct arm_gic_softc *sc = gic_sc;
uintptr_t irq = (uintptr_t) arg;
if (irq > GIC_LAST_SGI)
arm_irq_memory_barrier(irq);
gic_c_write_4(sc, GICC_EOIR, irq);
}
static int
gic_config_irq(int irq, enum intr_trigger trig, enum intr_polarity pol)
{
return (arm_gic_config(gic_sc->gic_dev, irq, trig, pol));
}
void
arm_mask_irq(uintptr_t nb)
{
arm_gic_mask(gic_sc->gic_dev, nb);
}
void
arm_unmask_irq(uintptr_t nb)
{
arm_gic_unmask(gic_sc->gic_dev, nb);
}
int
arm_get_next_irq(int last_irq)
{
return (arm_gic_next_irq(gic_sc, last_irq));
}
#ifdef SMP
void
intr_pic_init_secondary(void)
{
arm_gic_init_secondary(gic_sc->gic_dev);
}
void
pic_ipi_send(cpuset_t cpus, u_int ipi)
{
arm_gic_ipi_send(gic_sc->gic_dev, cpus, ipi);
}
int
pic_ipi_read(int i)
{
return (arm_gic_ipi_read(gic_sc->gic_dev, i));
}
void
pic_ipi_clear(int ipi)
{
arm_gic_ipi_clear(gic_sc->gic_dev, ipi);
}
#endif
#endif /* INTRNG */
static device_method_t arm_gic_methods[] = {
#ifdef INTRNG
/* Bus interface */
DEVMETHOD(bus_print_child, arm_gic_print_child),
DEVMETHOD(bus_add_child, bus_generic_add_child),
DEVMETHOD(bus_alloc_resource, arm_gic_alloc_resource),
DEVMETHOD(bus_release_resource, bus_generic_release_resource),
DEVMETHOD(bus_activate_resource,bus_generic_activate_resource),
DEVMETHOD(bus_read_ivar, arm_gic_read_ivar),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, arm_gic_disable_intr),
DEVMETHOD(pic_enable_intr, arm_gic_enable_intr),
DEVMETHOD(pic_map_intr, arm_gic_map_intr),
DEVMETHOD(pic_setup_intr, arm_gic_setup_intr),
DEVMETHOD(pic_teardown_intr, arm_gic_teardown_intr),
DEVMETHOD(pic_post_filter, arm_gic_post_filter),
DEVMETHOD(pic_post_ithread, arm_gic_post_ithread),
DEVMETHOD(pic_pre_ithread, arm_gic_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, arm_gic_bind_intr),
DEVMETHOD(pic_init_secondary, arm_gic_init_secondary),
DEVMETHOD(pic_ipi_send, arm_gic_ipi_send),
DEVMETHOD(pic_ipi_setup, arm_gic_ipi_setup),
#endif
#endif
{ 0, 0 }
};
DEFINE_CLASS_0(gic, arm_gic_driver, arm_gic_methods,
sizeof(struct arm_gic_softc));
#ifdef INTRNG
/*
* GICv2m support -- the GICv2 MSI/MSI-X controller.
*/
#define GICV2M_MSI_TYPER 0x008
#define MSI_TYPER_SPI_BASE(x) (((x) >> 16) & 0x3ff)
#define MSI_TYPER_SPI_COUNT(x) (((x) >> 0) & 0x3ff)
#define GICv2M_MSI_SETSPI_NS 0x040
#define GICV2M_MSI_IIDR 0xFCC
int
arm_gicv2m_attach(device_t dev)
{
struct arm_gicv2m_softc *sc;
- struct arm_gic_softc *psc;
uint32_t typer;
int rid;
- psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
rid = 0;
sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (sc->sc_mem == NULL) {
device_printf(dev, "Unable to allocate resources\n");
return (ENXIO);
}
typer = bus_read_4(sc->sc_mem, GICV2M_MSI_TYPER);
sc->sc_spi_start = MSI_TYPER_SPI_BASE(typer);
sc->sc_spi_count = MSI_TYPER_SPI_COUNT(typer);
sc->sc_spi_end = sc->sc_spi_start + sc->sc_spi_count;
/* Reserve these interrupts for MSI/MSI-X use */
arm_gic_reserve_msi_range(device_get_parent(dev), sc->sc_spi_start,
sc->sc_spi_count);
mtx_init(&sc->sc_mutex, "GICv2m lock", "", MTX_DEF);
intr_msi_register(dev, sc->sc_xref);
if (bootverbose)
device_printf(dev, "using spi %u to %u\n", sc->sc_spi_start,
sc->sc_spi_start + sc->sc_spi_count - 1);
return (0);
}
static int
arm_gicv2m_alloc_msi(device_t dev, device_t child, int count, int maxcount,
device_t *pic, struct intr_irqsrc **srcs)
{
struct arm_gic_softc *psc;
struct arm_gicv2m_softc *sc;
int i, irq, end_irq;
bool found;
KASSERT(powerof2(count), ("%s: bad count", __func__));
KASSERT(powerof2(maxcount), ("%s: bad maxcount", __func__));
psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
found = false;
for (irq = sc->sc_spi_start; irq < sc->sc_spi_end; irq++) {
/* Start on an aligned interrupt */
if ((irq & (maxcount - 1)) != 0)
continue;
/* Assume we found a valid range until shown otherwise */
found = true;
/* Check this range is valid */
for (end_irq = irq; end_irq != irq + count; end_irq++) {
/* No free interrupts */
if (end_irq == sc->sc_spi_end) {
found = false;
break;
}
KASSERT((psc->gic_irqs[end_irq].gi_flags & GI_FLAG_MSI) != 0,
("%s: Non-MSI interrupt found", __func__));
/* This is already used */
if ((psc->gic_irqs[end_irq].gi_flags & GI_FLAG_MSI_USED) ==
GI_FLAG_MSI_USED) {
found = false;
break;
}
}
if (found)
break;
}
/* Not enough interrupts were found */
if (!found || irq == sc->sc_spi_end) {
mtx_unlock(&sc->sc_mutex);
return (ENXIO);
}
for (i = 0; i < count; i++) {
/* Mark the interrupt as used */
psc->gic_irqs[irq + i].gi_flags |= GI_FLAG_MSI_USED;
}
mtx_unlock(&sc->sc_mutex);
for (i = 0; i < count; i++)
srcs[i] = (struct intr_irqsrc *)&psc->gic_irqs[irq + i];
*pic = device_get_parent(dev);
return (0);
}
static int
arm_gicv2m_release_msi(device_t dev, device_t child, int count,
struct intr_irqsrc **isrc)
{
struct arm_gicv2m_softc *sc;
struct gic_irqsrc *gi;
int i;
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
for (i = 0; i < count; i++) {
gi = (struct gic_irqsrc *)isrc[i];
KASSERT((gi->gi_flags & GI_FLAG_MSI_USED) == GI_FLAG_MSI_USED,
("%s: Trying to release an unused MSI-X interrupt",
__func__));
gi->gi_flags &= ~GI_FLAG_MSI_USED;
}
mtx_unlock(&sc->sc_mutex);
return (0);
}
static int
arm_gicv2m_alloc_msix(device_t dev, device_t child, device_t *pic,
struct intr_irqsrc **isrcp)
{
struct arm_gicv2m_softc *sc;
struct arm_gic_softc *psc;
int irq;
psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
/* Find an unused interrupt */
for (irq = sc->sc_spi_start; irq < sc->sc_spi_end; irq++) {
KASSERT((psc->gic_irqs[irq].gi_flags & GI_FLAG_MSI) != 0,
("%s: Non-MSI interrupt found", __func__));
if ((psc->gic_irqs[irq].gi_flags & GI_FLAG_MSI_USED) == 0)
break;
}
/* No free interrupt was found */
if (irq == sc->sc_spi_end) {
mtx_unlock(&sc->sc_mutex);
return (ENXIO);
}
/* Mark the interrupt as used */
psc->gic_irqs[irq].gi_flags |= GI_FLAG_MSI_USED;
mtx_unlock(&sc->sc_mutex);
*isrcp = (struct intr_irqsrc *)&psc->gic_irqs[irq];
*pic = device_get_parent(dev);
return (0);
}
static int
arm_gicv2m_release_msix(device_t dev, device_t child, struct intr_irqsrc *isrc)
{
struct arm_gicv2m_softc *sc;
struct gic_irqsrc *gi;
sc = device_get_softc(dev);
gi = (struct gic_irqsrc *)isrc;
KASSERT((gi->gi_flags & GI_FLAG_MSI_USED) == GI_FLAG_MSI_USED,
("%s: Trying to release an unused MSI-X interrupt", __func__));
mtx_lock(&sc->sc_mutex);
gi->gi_flags &= ~GI_FLAG_MSI_USED;
mtx_unlock(&sc->sc_mutex);
return (0);
}
static int
arm_gicv2m_map_msi(device_t dev, device_t child, struct intr_irqsrc *isrc,
uint64_t *addr, uint32_t *data)
{
struct arm_gicv2m_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
*addr = vtophys(rman_get_virtual(sc->sc_mem)) + GICv2M_MSI_SETSPI_NS;
*data = gi->gi_irq;
return (0);
}
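/*
* Note on the address/data pair returned above: a device programmed with
* this MSI address writes the data value (the SPI number) into the frame's
* MSI_SETSPI_NS register, which the GICv2m hardware turns into that SPI
* being asserted on the parent GIC.
*/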
static device_method_t arm_gicv2m_methods[] = {
/* Device interface */
DEVMETHOD(device_attach, arm_gicv2m_attach),
/* MSI/MSI-X */
DEVMETHOD(msi_alloc_msi, arm_gicv2m_alloc_msi),
DEVMETHOD(msi_release_msi, arm_gicv2m_release_msi),
DEVMETHOD(msi_alloc_msix, arm_gicv2m_alloc_msix),
DEVMETHOD(msi_release_msix, arm_gicv2m_release_msix),
DEVMETHOD(msi_map_msi, arm_gicv2m_map_msi),
/* End */
DEVMETHOD_END
};
DEFINE_CLASS_0(gicv2m, arm_gicv2m_driver, arm_gicv2m_methods,
sizeof(struct arm_gicv2m_softc));
#endif
Index: head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c (revision 327173)
@@ -1,1640 +1,1642 @@
/*-
* Copyright (C) 2013-2015 Daisuke Aoyama <aoyama@peach.ne.jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sema.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox_prop.h>
#include <arm/broadcom/bcm2835/bcm2835_vcbus.h>
#include "cpufreq_if.h"
#include "mbox_if.h"
#ifdef DEBUG
#define DPRINTF(fmt, ...) do { \
printf("%s:%u: ", __func__, __LINE__); \
printf(fmt, ##__VA_ARGS__); \
} while (0)
#else
#define DPRINTF(fmt, ...)
#endif
#define HZ2MHZ(freq) ((freq) / (1000 * 1000))
#define MHZ2HZ(freq) ((freq) * (1000 * 1000))
#ifdef SOC_BCM2835
#define OFFSET2MVOLT(val) (1200 + ((val) * 25))
#define MVOLT2OFFSET(val) (((val) - 1200) / 25)
#define DEFAULT_ARM_FREQUENCY 700
#define DEFAULT_LOWEST_FREQ 300
#else
#define OFFSET2MVOLT(val) (((val) / 1000))
#define MVOLT2OFFSET(val) (((val) * 1000))
#define DEFAULT_ARM_FREQUENCY 600
#define DEFAULT_LOWEST_FREQ 600
#endif
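/*
* A worked example of the BCM2835 conversion above: the firmware reports
* voltages as an offset from 1.2V in 0.025V steps, so an offset of 2 gives
* OFFSET2MVOLT(2) == 1250 (mV) and an offset of -4 gives 1100 (mV);
* MVOLT2OFFSET() is the inverse.
*/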
#define DEFAULT_CORE_FREQUENCY 250
#define DEFAULT_SDRAM_FREQUENCY 400
#define TRANSITION_LATENCY 1000
#define MIN_OVER_VOLTAGE -16
#define MAX_OVER_VOLTAGE 6
#define MSG_ERROR -999999999
#define MHZSTEP 100
#define HZSTEP (MHZ2HZ(MHZSTEP))
#define TZ_ZEROC 2731
#define VC_LOCK(sc) do { \
sema_wait(&vc_sema); \
} while (0)
#define VC_UNLOCK(sc) do { \
sema_post(&vc_sema); \
} while (0)
/* ARM->VC mailbox property semaphore */
static struct sema vc_sema;
static struct sysctl_ctx_list bcm2835_sysctl_ctx;
struct bcm2835_cpufreq_softc {
device_t dev;
int arm_max_freq;
int arm_min_freq;
int core_max_freq;
int core_min_freq;
int sdram_max_freq;
int sdram_min_freq;
int max_voltage_core;
int min_voltage_core;
/* the values written in mbox */
int voltage_core;
int voltage_sdram;
int voltage_sdram_c;
int voltage_sdram_i;
int voltage_sdram_p;
int turbo_mode;
/* initial hook for waiting mbox intr */
struct intr_config_hook init_hook;
};
static struct ofw_compat_data compat_data[] = {
{ "broadcom,bcm2835-vc", 1 },
{ "broadcom,bcm2708-vc", 1 },
{ "brcm,bcm2709", 1 },
{ "brcm,bcm2836", 1 },
{ NULL, 0 }
};
static int cpufreq_verbose = 0;
TUNABLE_INT("hw.bcm2835.cpufreq.verbose", &cpufreq_verbose);
static int cpufreq_lowest_freq = DEFAULT_LOWEST_FREQ;
TUNABLE_INT("hw.bcm2835.cpufreq.lowest_freq", &cpufreq_lowest_freq);
#ifdef PROP_DEBUG
static void
bcm2835_dump(const void *data, int len)
{
const uint8_t *p = (const uint8_t*)data;
int i;
printf("dump @ %p:\n", data);
for (i = 0; i < len; i++) {
printf("%2.2x ", p[i]);
if ((i % 4) == 3)
printf(" ");
if ((i % 16) == 15)
printf("\n");
}
printf("\n");
}
#endif
static int
bcm2835_cpufreq_get_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_clock_rate msg;
int rate;
int err;
/*
* Get clock rate
* Tag: 0x00030002
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_max_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_max_clock_rate msg;
int rate;
int err;
/*
* Get max clock rate
* Tag: 0x00030004
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MAX_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get max clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_min_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_min_clock_rate msg;
int rate;
int err;
/*
* Get min clock rate
* Tag: 0x00030007
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MIN_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get min clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_set_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id, uint32_t rate_hz)
{
struct msg_set_clock_rate msg;
int rate;
int err;
/*
* Set clock rate
* Tag: 0x00038002
* Request:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.body.req.rate_hz = rate_hz;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* workaround for core clock */
if (clock_id == BCM2835_MBOX_CLOCK_ID_CORE) {
/* for safety (may change voltage without changing clock) */
DELAY(TRANSITION_LATENCY);
/*
* XXX: the core clock does not always change on the first request,
* so write it a second time to make sure it takes effect.
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.body.req.rate_hz = rate_hz;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev,
"can't set clock rate (id=%u)\n", clock_id);
return (MSG_ERROR);
}
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_turbo(struct bcm2835_cpufreq_softc *sc)
{
struct msg_get_turbo msg;
int level;
int err;
/*
* Get turbo
* Tag: 0x00030009
* Request:
* Length: 4
* Value:
* u32: id
* Response:
* Length: 8
* Value:
* u32: id
* u32: level
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_TURBO;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.id = 0;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get turbo\n");
return (MSG_ERROR);
}
/* result 0=non-turbo, 1=turbo */
level = (int)msg.body.resp.level;
DPRINTF("level = %d\n", level);
return (level);
}
static int
bcm2835_cpufreq_set_turbo(struct bcm2835_cpufreq_softc *sc, uint32_t level)
{
struct msg_set_turbo msg;
int value;
int err;
/*
* Set turbo
* Tag: 0x00038009
* Request:
* Length: 8
* Value:
* u32: id
* u32: level
* Response:
* Length: 8
* Value:
* u32: id
* u32: level
*/
/* replace unknown values with OFF */
if (level != BCM2835_MBOX_TURBO_ON && level != BCM2835_MBOX_TURBO_OFF)
level = BCM2835_MBOX_TURBO_OFF;
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_TURBO;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.id = 0;
msg.body.req.level = level;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set turbo\n");
return (MSG_ERROR);
}
/* result 0=non-turbo, 1=turbo */
value = (int)msg.body.resp.level;
DPRINTF("level = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030003
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_max_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_max_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030005
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MAX_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get max voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_min_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_min_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030008
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MIN_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get min voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_set_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id, int32_t value)
{
struct msg_set_voltage msg;
int err;
/*
* Set voltage
* Tag: 0x00038003
* Request:
* Length: 4
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/*
* over_voltage:
* 0 (1.2 V). Values above 6 are only allowed when force_turbo or
* current_limit_override are specified (which set the warranty bit).
*/
if (value > MAX_OVER_VOLTAGE || value < MIN_OVER_VOLTAGE) {
/* currently not supported */
device_printf(sc->dev, "unsupported voltage: %d\n", value);
return (MSG_ERROR);
}
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.body.req.value = (uint32_t)value;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_temperature(struct bcm2835_cpufreq_softc *sc)
{
struct msg_get_temperature msg;
int value;
int err;
/*
* Get temperature
* Tag: 0x00030006
* Request:
* Length: 4
* Value:
* u32: temperature id
* Response:
* Length: 8
* Value:
* u32: temperature id
* u32: value
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_TEMPERATURE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.temperature_id = 0;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get temperature\n");
return (MSG_ERROR);
}
/* result (temperature of degree C) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
sysctl_bcm2835_cpufreq_arm_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
val);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set clock arm_freq error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_core_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set clock core_freq error\n");
return (EIO);
}
VC_UNLOCK(sc);
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_sdram_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_SDRAM);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_SDRAM,
val);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set clock sdram_freq error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_turbo(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_turbo(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > 0)
sc->turbo_mode = BCM2835_MBOX_TURBO_ON;
else
sc->turbo_mode = BCM2835_MBOX_TURBO_OFF;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_turbo(sc, sc->turbo_mode);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set turbo error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_core(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_CORE);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_core = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_CORE,
sc->voltage_core);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage core error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_c(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_c = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C,
sc->voltage_sdram_c);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_c error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_i(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_i = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I,
sc->voltage_sdram_i);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_i error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_p(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_p = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P,
sc->voltage_sdram_p);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_p error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* write only; sets all three SDRAM voltages at once */
if (!req->newptr)
return (EINVAL);
val = 0;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err)
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_c error\n");
return (EIO);
}
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_i error\n");
return (EIO);
}
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_p error\n");
return (EIO);
}
VC_UNLOCK(sc);
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_temperature(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_temperature(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
return (EINVAL);
}
static int
sysctl_bcm2835_devcpu_temperature(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_temperature(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
/* 1/1000 celsius (raw) to 1/10 kelvin */
val = val / 100 + TZ_ZEROC;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
return (EINVAL);
}
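/*
* A worked example of the conversion above: the firmware reports
* milli-degrees Celsius, so a reading of 45000 becomes
* 45000 / 100 + TZ_ZEROC = 450 + 2731 = 3181 tenths of a kelvin (318.1K),
* which the "IK" sysctl format prints as 45.0C.
*/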
static void
bcm2835_cpufreq_init(void *arg)
{
struct bcm2835_cpufreq_softc *sc = arg;
struct sysctl_ctx_list *ctx;
device_t cpu;
int arm_freq, core_freq, sdram_freq;
int arm_max_freq, arm_min_freq, core_max_freq, core_min_freq;
int sdram_max_freq, sdram_min_freq;
int voltage_core, voltage_sdram_c, voltage_sdram_i, voltage_sdram_p;
int max_voltage_core, min_voltage_core;
int max_voltage_sdram_c, min_voltage_sdram_c;
int max_voltage_sdram_i, min_voltage_sdram_i;
int max_voltage_sdram_p, min_voltage_sdram_p;
int turbo, temperature;
VC_LOCK(sc);
/* current clock */
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
core_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
sdram_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
/* max/min clock */
arm_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
arm_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
core_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
core_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
sdram_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
sdram_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
/* turbo mode */
turbo = bcm2835_cpufreq_get_turbo(sc);
if (turbo > 0)
sc->turbo_mode = BCM2835_MBOX_TURBO_ON;
else
sc->turbo_mode = BCM2835_MBOX_TURBO_OFF;
/* voltage */
voltage_core = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
voltage_sdram_c = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
voltage_sdram_i = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
voltage_sdram_p = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
/* current values (offset from 1.2V) */
sc->voltage_core = voltage_core;
sc->voltage_sdram = voltage_sdram_c;
sc->voltage_sdram_c = voltage_sdram_c;
sc->voltage_sdram_i = voltage_sdram_i;
sc->voltage_sdram_p = voltage_sdram_p;
/* max/min voltage */
max_voltage_core = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
min_voltage_core = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
max_voltage_sdram_c = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
max_voltage_sdram_i = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
max_voltage_sdram_p = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
min_voltage_sdram_c = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
min_voltage_sdram_i = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
min_voltage_sdram_p = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
/* temperature */
temperature = bcm2835_cpufreq_get_temperature(sc);
/* show result */
if (cpufreq_verbose || bootverbose) {
device_printf(sc->dev, "Boot settings:\n");
device_printf(sc->dev,
"current ARM %dMHz, Core %dMHz, SDRAM %dMHz, Turbo %s\n",
HZ2MHZ(arm_freq), HZ2MHZ(core_freq), HZ2MHZ(sdram_freq),
(sc->turbo_mode == BCM2835_MBOX_TURBO_ON) ? "ON" : "OFF");
device_printf(sc->dev,
"max/min ARM %d/%dMHz, Core %d/%dMHz, SDRAM %d/%dMHz\n",
HZ2MHZ(arm_max_freq), HZ2MHZ(arm_min_freq),
HZ2MHZ(core_max_freq), HZ2MHZ(core_min_freq),
HZ2MHZ(sdram_max_freq), HZ2MHZ(sdram_min_freq));
device_printf(sc->dev,
"current Core %dmV, SDRAM_C %dmV, SDRAM_I %dmV, "
"SDRAM_P %dmV\n",
OFFSET2MVOLT(voltage_core), OFFSET2MVOLT(voltage_sdram_c),
OFFSET2MVOLT(voltage_sdram_i),
OFFSET2MVOLT(voltage_sdram_p));
device_printf(sc->dev,
"max/min Core %d/%dmV, SDRAM_C %d/%dmV, SDRAM_I %d/%dmV, "
"SDRAM_P %d/%dmV\n",
OFFSET2MVOLT(max_voltage_core),
OFFSET2MVOLT(min_voltage_core),
OFFSET2MVOLT(max_voltage_sdram_c),
OFFSET2MVOLT(min_voltage_sdram_c),
OFFSET2MVOLT(max_voltage_sdram_i),
OFFSET2MVOLT(min_voltage_sdram_i),
OFFSET2MVOLT(max_voltage_sdram_p),
OFFSET2MVOLT(min_voltage_sdram_p));
device_printf(sc->dev,
"Temperature %d.%dC\n", (temperature / 1000),
(temperature % 1000) / 100);
} else { /* !cpufreq_verbose && !bootverbose */
device_printf(sc->dev,
"ARM %dMHz, Core %dMHz, SDRAM %dMHz, Turbo %s\n",
HZ2MHZ(arm_freq), HZ2MHZ(core_freq), HZ2MHZ(sdram_freq),
(sc->turbo_mode == BCM2835_MBOX_TURBO_ON) ? "ON" : "OFF");
}
/* keep in softc (MHz/mV) */
sc->arm_max_freq = HZ2MHZ(arm_max_freq);
sc->arm_min_freq = HZ2MHZ(arm_min_freq);
sc->core_max_freq = HZ2MHZ(core_max_freq);
sc->core_min_freq = HZ2MHZ(core_min_freq);
sc->sdram_max_freq = HZ2MHZ(sdram_max_freq);
sc->sdram_min_freq = HZ2MHZ(sdram_min_freq);
sc->max_voltage_core = OFFSET2MVOLT(max_voltage_core);
sc->min_voltage_core = OFFSET2MVOLT(min_voltage_core);
/* if turbo is on, set to max values */
if (sc->turbo_mode == BCM2835_MBOX_TURBO_ON) {
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
arm_max_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
core_max_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM, sdram_max_freq);
DELAY(TRANSITION_LATENCY);
} else {
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
arm_min_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
core_min_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM, sdram_min_freq);
DELAY(TRANSITION_LATENCY);
}
VC_UNLOCK(sc);
/* add human readable temperature to dev.cpu node */
cpu = device_get_parent(sc->dev);
if (cpu != NULL) {
ctx = device_get_sysctl_ctx(cpu);
SYSCTL_ADD_PROC(ctx,
SYSCTL_CHILDREN(device_get_sysctl_tree(cpu)), OID_AUTO,
"temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0,
sysctl_bcm2835_devcpu_temperature, "IK",
"Current SoC temperature");
}
/* release this hook (continue boot) */
config_intrhook_disestablish(&sc->init_hook);
}
static void
bcm2835_cpufreq_identify(driver_t *driver, device_t parent)
{
const struct ofw_compat_data *compat;
phandle_t root;
root = OF_finddevice("/");
for (compat = compat_data; compat->ocd_str != NULL; compat++)
if (ofw_bus_node_is_compatible(root, compat->ocd_str))
break;
if (compat->ocd_data == 0)
return;
DPRINTF("driver=%p, parent=%p\n", driver, parent);
if (device_find_child(parent, "bcm2835_cpufreq", -1) != NULL)
return;
if (BUS_ADD_CHILD(parent, 0, "bcm2835_cpufreq", -1) == NULL)
device_printf(parent, "add child failed\n");
}
static int
bcm2835_cpufreq_probe(device_t dev)
{
if (device_get_unit(dev) != 0)
return (ENXIO);
device_set_desc(dev, "CPU Frequency Control");
return (0);
}
static int
bcm2835_cpufreq_attach(device_t dev)
{
struct bcm2835_cpufreq_softc *sc;
struct sysctl_oid *oid;
/* set self dev */
sc = device_get_softc(dev);
sc->dev = dev;
/* initial values */
sc->arm_max_freq = -1;
sc->arm_min_freq = -1;
sc->core_max_freq = -1;
sc->core_min_freq = -1;
sc->sdram_max_freq = -1;
sc->sdram_min_freq = -1;
sc->max_voltage_core = 0;
sc->min_voltage_core = 0;
/* setup sysctl at first device */
if (device_get_unit(dev) == 0) {
sysctl_ctx_init(&bcm2835_sysctl_ctx);
/* create node for hw.cpufreq */
oid = SYSCTL_ADD_NODE(&bcm2835_sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, "cpufreq",
CTLFLAG_RD, NULL, "");
/* Frequency (Hz) */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "arm_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_arm_freq, "IU",
"ARM frequency (Hz)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "core_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_core_freq, "IU",
"Core frequency (Hz)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "sdram_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_sdram_freq, "IU",
"SDRAM frequency (Hz)");
/* Turbo state */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "turbo", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_turbo, "IU",
"Disables dynamic clocking");
/* Voltage (offset from 1.2V in units of 0.025V) */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_core", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_voltage_core, "I",
"ARM/GPU core voltage "
"(offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram", CTLTYPE_INT | CTLFLAG_WR, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram, "I",
"SDRAM voltage (offset from 1.2V in units of 0.025V)");
/* Voltage individual SDRAM */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_c", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_c, "I",
"SDRAM controller voltage "
"(offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_i", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_i, "I",
"SDRAM I/O voltage (offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_p", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_p, "I",
"SDRAM phy voltage (offset from 1.2V in units of 0.025V)");
/* Temperature */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0,
sysctl_bcm2835_cpufreq_temperature, "I",
"SoC temperature (thousandths of a degree C)");
}
/* ARM->VC lock */
sema_init(&vc_sema, 1, "vcsema");
/* register callback for using mbox when interrupts are enabled */
sc->init_hook.ich_func = bcm2835_cpufreq_init;
sc->init_hook.ich_arg = sc;
if (config_intrhook_establish(&sc->init_hook) != 0) {
device_printf(dev, "config_intrhook_establish failed\n");
return (ENOMEM);
}
/* this device is controlled by cpufreq(4) */
cpufreq_register(dev);
return (0);
}
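/*
* A usage sketch for the knobs created above (the values shown are
* hypothetical):
*
* sysctl hw.cpufreq.arm_freq # read the ARM clock in Hz
* sysctl hw.cpufreq.arm_freq=900000000 # request 900 MHz
* sysctl hw.cpufreq.turbo=1 # enable turbo (disable dynamic clocking)
* sysctl dev.cpu.0.temperature # temperature node added at init time
*
* All handlers serialize their mailbox traffic with the vc_sema semaphore
* via VC_LOCK()/VC_UNLOCK().
*/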
static int
bcm2835_cpufreq_detach(device_t dev)
{
- struct bcm2835_cpufreq_softc *sc;
- sc = device_get_softc(dev);
-
sema_destroy(&vc_sema);
return (cpufreq_unregister(dev));
}
static int
bcm2835_cpufreq_set(device_t dev, const struct cf_setting *cf)
{
struct bcm2835_cpufreq_softc *sc;
uint32_t rate_hz, rem;
- int cur_freq, resp_freq, arm_freq, min_freq, core_freq;
+ int resp_freq, arm_freq, min_freq, core_freq;
+#ifdef DEBUG
+ int cur_freq;
+#endif
if (cf == NULL || cf->freq < 0)
return (EINVAL);
sc = device_get_softc(dev);
/* setting clock (Hz) */
rate_hz = (uint32_t)MHZ2HZ(cf->freq);
rem = rate_hz % HZSTEP;
rate_hz -= rem;
if (rate_hz == 0)
return (EINVAL);
/* adjust min freq */
min_freq = sc->arm_min_freq;
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON)
if (min_freq > cpufreq_lowest_freq)
min_freq = cpufreq_lowest_freq;
if (rate_hz < MHZ2HZ(min_freq) || rate_hz > MHZ2HZ(sc->arm_max_freq))
return (EINVAL);
/* set new value and verify it */
VC_LOCK(sc);
+#ifdef DEBUG
cur_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
+#endif
resp_freq = bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM, rate_hz);
DELAY(TRANSITION_LATENCY);
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
/*
* If non-turbo and the requested rate is lower than or equal to
* min_freq, clock the core and SDRAM down to their defaults first.
*/
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON) {
core_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
if (rate_hz > MHZ2HZ(sc->arm_min_freq)) {
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(sc->core_max_freq));
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM,
MHZ2HZ(sc->sdram_max_freq));
DELAY(TRANSITION_LATENCY);
} else {
if (sc->core_min_freq < DEFAULT_CORE_FREQUENCY &&
core_freq > DEFAULT_CORE_FREQUENCY) {
/* first, down to 250, then down to min */
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(DEFAULT_CORE_FREQUENCY));
DELAY(TRANSITION_LATENCY);
/* reset core voltage */
bcm2835_cpufreq_set_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE, 0);
DELAY(TRANSITION_LATENCY);
}
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(sc->core_min_freq));
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM,
MHZ2HZ(sc->sdram_min_freq));
DELAY(TRANSITION_LATENCY);
}
}
VC_UNLOCK(sc);
if (resp_freq < 0 || arm_freq < 0 || resp_freq != arm_freq) {
device_printf(dev, "wrong freq\n");
return (EIO);
}
DPRINTF("cpufreq: %d -> %d\n", cur_freq, arm_freq);
return (0);
}
static int
bcm2835_cpufreq_get(device_t dev, struct cf_setting *cf)
{
struct bcm2835_cpufreq_softc *sc;
int arm_freq;
if (cf == NULL)
return (EINVAL);
sc = device_get_softc(dev);
memset(cf, CPUFREQ_VAL_UNKNOWN, sizeof(*cf));
cf->dev = NULL;
/* get current value */
VC_LOCK(sc);
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
VC_UNLOCK(sc);
if (arm_freq < 0) {
device_printf(dev, "can't get clock\n");
return (EINVAL);
}
/* CPU clock in MHz or 100ths of a percent. */
cf->freq = HZ2MHZ(arm_freq);
/* Voltage in mV. */
cf->volts = CPUFREQ_VAL_UNKNOWN;
/* Power consumed in mW. */
cf->power = CPUFREQ_VAL_UNKNOWN;
/* Transition latency in us. */
cf->lat = TRANSITION_LATENCY;
/* Driver providing this setting. */
cf->dev = dev;
return (0);
}
static int
bcm2835_cpufreq_make_freq_list(device_t dev, struct cf_setting *sets,
int *count)
{
struct bcm2835_cpufreq_softc *sc;
int freq, min_freq, volts, rem;
int idx;
sc = device_get_softc(dev);
freq = sc->arm_max_freq;
min_freq = sc->arm_min_freq;
/* adjust head freq to STEP */
rem = freq % MHZSTEP;
freq -= rem;
if (freq < min_freq)
freq = min_freq;
/* if non-turbo, add extra low freq */
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON)
if (min_freq > cpufreq_lowest_freq)
min_freq = cpufreq_lowest_freq;
#ifdef SOC_BCM2835
/* from freq to min_freq */
for (idx = 0; idx < *count && freq >= min_freq; idx++) {
if (freq > sc->arm_min_freq)
volts = sc->max_voltage_core;
else
volts = sc->min_voltage_core;
sets[idx].freq = freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
freq -= MHZSTEP;
}
#else
/* XXX RPi2 has only 900/600MHz */
idx = 0;
volts = sc->min_voltage_core;
sets[idx].freq = freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
idx++;
if (freq != min_freq) {
sets[idx].freq = min_freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
idx++;
}
#endif
*count = idx;
return (0);
}
static int
bcm2835_cpufreq_settings(device_t dev, struct cf_setting *sets, int *count)
{
struct bcm2835_cpufreq_softc *sc;
if (sets == NULL || count == NULL)
return (EINVAL);
sc = device_get_softc(dev);
if (sc->arm_min_freq < 0 || sc->arm_max_freq < 0) {
printf("device is not configured\n");
return (EINVAL);
}
/* fill data with unknown value */
memset(sets, CPUFREQ_VAL_UNKNOWN, sizeof(*sets) * (*count));
/* create new array up to count */
bcm2835_cpufreq_make_freq_list(dev, sets, count);
return (0);
}
static int
bcm2835_cpufreq_type(device_t dev, int *type)
{
if (type == NULL)
return (EINVAL);
*type = CPUFREQ_TYPE_ABSOLUTE;
return (0);
}
static device_method_t bcm2835_cpufreq_methods[] = {
/* Device interface */
DEVMETHOD(device_identify, bcm2835_cpufreq_identify),
DEVMETHOD(device_probe, bcm2835_cpufreq_probe),
DEVMETHOD(device_attach, bcm2835_cpufreq_attach),
DEVMETHOD(device_detach, bcm2835_cpufreq_detach),
/* cpufreq interface */
DEVMETHOD(cpufreq_drv_set, bcm2835_cpufreq_set),
DEVMETHOD(cpufreq_drv_get, bcm2835_cpufreq_get),
DEVMETHOD(cpufreq_drv_settings, bcm2835_cpufreq_settings),
DEVMETHOD(cpufreq_drv_type, bcm2835_cpufreq_type),
DEVMETHOD_END
};
static devclass_t bcm2835_cpufreq_devclass;
static driver_t bcm2835_cpufreq_driver = {
"bcm2835_cpufreq",
bcm2835_cpufreq_methods,
sizeof(struct bcm2835_cpufreq_softc),
};
DRIVER_MODULE(bcm2835_cpufreq, cpu, bcm2835_cpufreq_driver,
bcm2835_cpufreq_devclass, 0, 0);
Index: head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c (revision 327173)
@@ -1,1232 +1,1231 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Oleksandr Tymoshenko <gonzo@FreeBSD.org>
* Copyright (c) 2012-2015 Luiz Otavio O Souza <loos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/gpio.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <dev/gpio/gpiobusvar.h>
#include <dev/ofw/ofw_bus.h>
#include <arm/broadcom/bcm2835/bcm2835_gpio.h>
#include "gpio_if.h"
#include "pic_if.h"
#ifdef DEBUG
#define dprintf(fmt, args...) do { printf("%s(): ", __func__); \
printf(fmt,##args); } while (0)
#else
#define dprintf(fmt, args...)
#endif
#define BCM_GPIO_IRQS 4
#define BCM_GPIO_PINS 54
#define BCM_GPIO_PINS_PER_BANK 32
#define BCM_GPIO_DEFAULT_CAPS (GPIO_PIN_INPUT | GPIO_PIN_OUTPUT | \
GPIO_PIN_PULLUP | GPIO_PIN_PULLDOWN | GPIO_INTR_LEVEL_LOW | \
GPIO_INTR_LEVEL_HIGH | GPIO_INTR_EDGE_RISING | \
GPIO_INTR_EDGE_FALLING | GPIO_INTR_EDGE_BOTH)
static struct resource_spec bcm_gpio_res_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE }, /* bank 0 interrupt */
{ SYS_RES_IRQ, 1, RF_ACTIVE }, /* bank 1 interrupt */
{ -1, 0, 0 }
};
struct bcm_gpio_sysctl {
struct bcm_gpio_softc *sc;
uint32_t pin;
};
struct bcm_gpio_irqsrc {
struct intr_irqsrc bgi_isrc;
uint32_t bgi_irq;
uint32_t bgi_mode;
uint32_t bgi_mask;
};
struct bcm_gpio_softc {
device_t sc_dev;
device_t sc_busdev;
struct mtx sc_mtx;
struct resource * sc_res[BCM_GPIO_IRQS + 1];
bus_space_tag_t sc_bst;
bus_space_handle_t sc_bsh;
void * sc_intrhand[BCM_GPIO_IRQS];
int sc_gpio_npins;
int sc_ro_npins;
int sc_ro_pins[BCM_GPIO_PINS];
struct gpio_pin sc_gpio_pins[BCM_GPIO_PINS];
struct bcm_gpio_sysctl sc_sysctl[BCM_GPIO_PINS];
struct bcm_gpio_irqsrc sc_isrcs[BCM_GPIO_PINS];
};
enum bcm_gpio_pud {
BCM_GPIO_NONE,
BCM_GPIO_PULLDOWN,
BCM_GPIO_PULLUP,
};
#define BCM_GPIO_LOCK(_sc) mtx_lock_spin(&(_sc)->sc_mtx)
#define BCM_GPIO_UNLOCK(_sc) mtx_unlock_spin(&(_sc)->sc_mtx)
#define BCM_GPIO_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_mtx, MA_OWNED)
#define BCM_GPIO_WRITE(_sc, _off, _val) \
bus_space_write_4((_sc)->sc_bst, (_sc)->sc_bsh, _off, _val)
#define BCM_GPIO_READ(_sc, _off) \
bus_space_read_4((_sc)->sc_bst, (_sc)->sc_bsh, _off)
#define BCM_GPIO_CLEAR_BITS(_sc, _off, _bits) \
BCM_GPIO_WRITE(_sc, _off, BCM_GPIO_READ(_sc, _off) & ~(_bits))
#define BCM_GPIO_SET_BITS(_sc, _off, _bits) \
BCM_GPIO_WRITE(_sc, _off, BCM_GPIO_READ(_sc, _off) | _bits)
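/*
 * The set/clear/level/event registers are banked: one 32-bit word per
 * bank of 32 pins, one bit per pin.
 */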
#define BCM_GPIO_BANK(a) (a / BCM_GPIO_PINS_PER_BANK)
#define BCM_GPIO_MASK(a) (1U << (a % BCM_GPIO_PINS_PER_BANK))
#define BCM_GPIO_GPFSEL(_bank) (0x00 + _bank * 4) /* Function Select */
#define BCM_GPIO_GPSET(_bank) (0x1c + _bank * 4) /* Pin Out Set */
#define BCM_GPIO_GPCLR(_bank) (0x28 + _bank * 4) /* Pin Out Clear */
#define BCM_GPIO_GPLEV(_bank) (0x34 + _bank * 4) /* Pin Level */
#define BCM_GPIO_GPEDS(_bank) (0x40 + _bank * 4) /* Event Status */
#define BCM_GPIO_GPREN(_bank) (0x4c + _bank * 4) /* Rising Edge irq */
#define BCM_GPIO_GPFEN(_bank) (0x58 + _bank * 4) /* Falling Edge irq */
#define BCM_GPIO_GPHEN(_bank) (0x64 + _bank * 4) /* High Level irq */
#define BCM_GPIO_GPLEN(_bank) (0x70 + _bank * 4) /* Low Level irq */
#define BCM_GPIO_GPAREN(_bank) (0x7c + _bank * 4) /* Async Rising Edge */
#define BCM_GPIO_GPAFEN(_bank) (0x88 + _bank * 4) /* Async Falling Edge */
#define BCM_GPIO_GPPUD(_bank) (0x94) /* Pin Pull up/down */
#define BCM_GPIO_GPPUDCLK(_bank) (0x98 + _bank * 4) /* Pin Pull up clock */
static struct ofw_compat_data compat_data[] = {
{"broadcom,bcm2835-gpio", 1},
{"brcm,bcm2835-gpio", 1},
{NULL, 0}
};
static struct bcm_gpio_softc *bcm_gpio_sc = NULL;
static int bcm_gpio_intr_bank0(void *arg);
static int bcm_gpio_intr_bank1(void *arg);
static int bcm_gpio_pic_attach(struct bcm_gpio_softc *sc);
static int bcm_gpio_pic_detach(struct bcm_gpio_softc *sc);
static int
bcm_gpio_pin_is_ro(struct bcm_gpio_softc *sc, int pin)
{
int i;
for (i = 0; i < sc->sc_ro_npins; i++)
if (pin == sc->sc_ro_pins[i])
return (1);
return (0);
}
static uint32_t
bcm_gpio_get_function(struct bcm_gpio_softc *sc, uint32_t pin)
{
uint32_t bank, func, offset;
/* Five banks, 10 pins per bank, 3 bits per pin. */
bank = pin / 10;
offset = (pin - bank * 10) * 3;
BCM_GPIO_LOCK(sc);
func = (BCM_GPIO_READ(sc, BCM_GPIO_GPFSEL(bank)) >> offset) & 7;
BCM_GPIO_UNLOCK(sc);
return (func);
}
static void
bcm_gpio_func_str(uint32_t nfunc, char *buf, int bufsize)
{
switch (nfunc) {
case BCM_GPIO_INPUT:
strncpy(buf, "input", bufsize);
break;
case BCM_GPIO_OUTPUT:
strncpy(buf, "output", bufsize);
break;
case BCM_GPIO_ALT0:
strncpy(buf, "alt0", bufsize);
break;
case BCM_GPIO_ALT1:
strncpy(buf, "alt1", bufsize);
break;
case BCM_GPIO_ALT2:
strncpy(buf, "alt2", bufsize);
break;
case BCM_GPIO_ALT3:
strncpy(buf, "alt3", bufsize);
break;
case BCM_GPIO_ALT4:
strncpy(buf, "alt4", bufsize);
break;
case BCM_GPIO_ALT5:
strncpy(buf, "alt5", bufsize);
break;
default:
strncpy(buf, "invalid", bufsize);
}
}
static int
bcm_gpio_str_func(char *func, uint32_t *nfunc)
{
if (strcasecmp(func, "input") == 0)
*nfunc = BCM_GPIO_INPUT;
else if (strcasecmp(func, "output") == 0)
*nfunc = BCM_GPIO_OUTPUT;
else if (strcasecmp(func, "alt0") == 0)
*nfunc = BCM_GPIO_ALT0;
else if (strcasecmp(func, "alt1") == 0)
*nfunc = BCM_GPIO_ALT1;
else if (strcasecmp(func, "alt2") == 0)
*nfunc = BCM_GPIO_ALT2;
else if (strcasecmp(func, "alt3") == 0)
*nfunc = BCM_GPIO_ALT3;
else if (strcasecmp(func, "alt4") == 0)
*nfunc = BCM_GPIO_ALT4;
else if (strcasecmp(func, "alt5") == 0)
*nfunc = BCM_GPIO_ALT5;
else
return (-1);
return (0);
}
static uint32_t
bcm_gpio_func_flag(uint32_t nfunc)
{
switch (nfunc) {
case BCM_GPIO_INPUT:
return (GPIO_PIN_INPUT);
case BCM_GPIO_OUTPUT:
return (GPIO_PIN_OUTPUT);
}
return (0);
}
static void
bcm_gpio_set_function(struct bcm_gpio_softc *sc, uint32_t pin, uint32_t f)
{
uint32_t bank, data, offset;
/* Must be called with lock held. */
BCM_GPIO_LOCK_ASSERT(sc);
/* Five banks, 10 pins per bank, 3 bits per pin. */
bank = pin / 10;
offset = (pin - bank * 10) * 3;
data = BCM_GPIO_READ(sc, BCM_GPIO_GPFSEL(bank));
data &= ~(7 << offset);
data |= (f << offset);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPFSEL(bank), data);
}
static void
bcm_gpio_set_pud(struct bcm_gpio_softc *sc, uint32_t pin, uint32_t state)
{
uint32_t bank;
/* Must be called with lock held. */
BCM_GPIO_LOCK_ASSERT(sc);
bank = BCM_GPIO_BANK(pin);
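/*
 * Pull-up/down sequence: latch the desired state in GPPUD, clock it
 * into the selected pin via GPPUDCLK, then clear both registers.
 */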
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUD(0), state);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUDCLK(bank), BCM_GPIO_MASK(pin));
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUD(0), 0);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUDCLK(bank), 0);
}
void
bcm_gpio_set_alternate(device_t dev, uint32_t pin, uint32_t nfunc)
{
struct bcm_gpio_softc *sc;
int i;
sc = device_get_softc(dev);
BCM_GPIO_LOCK(sc);
/* Disable pull-up or pull-down on pin. */
bcm_gpio_set_pud(sc, pin, BCM_GPIO_NONE);
/* And now set the pin function. */
bcm_gpio_set_function(sc, pin, nfunc);
/* Update the pin flags. */
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i < sc->sc_gpio_npins)
sc->sc_gpio_pins[i].gp_flags = bcm_gpio_func_flag(nfunc);
BCM_GPIO_UNLOCK(sc);
}
static void
bcm_gpio_pin_configure(struct bcm_gpio_softc *sc, struct gpio_pin *pin,
unsigned int flags)
{
BCM_GPIO_LOCK(sc);
/*
* Manage input/output.
*/
if (flags & (GPIO_PIN_INPUT|GPIO_PIN_OUTPUT)) {
pin->gp_flags &= ~(GPIO_PIN_INPUT|GPIO_PIN_OUTPUT);
if (flags & GPIO_PIN_OUTPUT) {
pin->gp_flags |= GPIO_PIN_OUTPUT;
bcm_gpio_set_function(sc, pin->gp_pin,
BCM_GPIO_OUTPUT);
} else {
pin->gp_flags |= GPIO_PIN_INPUT;
bcm_gpio_set_function(sc, pin->gp_pin,
BCM_GPIO_INPUT);
}
}
/* Manage Pull-up/pull-down. */
pin->gp_flags &= ~(GPIO_PIN_PULLUP|GPIO_PIN_PULLDOWN);
if (flags & (GPIO_PIN_PULLUP|GPIO_PIN_PULLDOWN)) {
if (flags & GPIO_PIN_PULLUP) {
pin->gp_flags |= GPIO_PIN_PULLUP;
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_PULLUP);
} else {
pin->gp_flags |= GPIO_PIN_PULLDOWN;
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_PULLDOWN);
}
} else
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_NONE);
BCM_GPIO_UNLOCK(sc);
}
static device_t
bcm_gpio_get_bus(device_t dev)
{
struct bcm_gpio_softc *sc;
sc = device_get_softc(dev);
return (sc->sc_busdev);
}
static int
bcm_gpio_pin_max(device_t dev, int *maxpin)
{
*maxpin = BCM_GPIO_PINS - 1;
return (0);
}
static int
bcm_gpio_pin_getcaps(device_t dev, uint32_t pin, uint32_t *caps)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
*caps = sc->sc_gpio_pins[i].gp_caps;
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_getflags(device_t dev, uint32_t pin, uint32_t *flags)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
*flags = sc->sc_gpio_pins[i].gp_flags;
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_getname(device_t dev, uint32_t pin, char *name)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
memcpy(name, sc->sc_gpio_pins[i].gp_name, GPIOMAXNAME);
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_setflags(device_t dev, uint32_t pin, uint32_t flags)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never touch on read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
bcm_gpio_pin_configure(sc, &sc->sc_gpio_pins[i], flags);
return (0);
}
static int
bcm_gpio_pin_set(device_t dev, uint32_t pin, unsigned int value)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, reg;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never write to read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
BCM_GPIO_LOCK(sc);
bank = BCM_GPIO_BANK(pin);
if (value)
reg = BCM_GPIO_GPSET(bank);
else
reg = BCM_GPIO_GPCLR(bank);
BCM_GPIO_WRITE(sc, reg, BCM_GPIO_MASK(pin));
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_get(device_t dev, uint32_t pin, unsigned int *val)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, reg_data;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
bank = BCM_GPIO_BANK(pin);
BCM_GPIO_LOCK(sc);
reg_data = BCM_GPIO_READ(sc, BCM_GPIO_GPLEV(bank));
BCM_GPIO_UNLOCK(sc);
*val = (reg_data & BCM_GPIO_MASK(pin)) ? 1 : 0;
return (0);
}
static int
bcm_gpio_pin_toggle(device_t dev, uint32_t pin)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, data, reg;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never write to read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
BCM_GPIO_LOCK(sc);
bank = BCM_GPIO_BANK(pin);
data = BCM_GPIO_READ(sc, BCM_GPIO_GPLEV(bank));
if (data & BCM_GPIO_MASK(pin))
reg = BCM_GPIO_GPCLR(bank);
else
reg = BCM_GPIO_GPSET(bank);
BCM_GPIO_WRITE(sc, reg, BCM_GPIO_MASK(pin));
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_func_proc(SYSCTL_HANDLER_ARGS)
{
char buf[16];
struct bcm_gpio_softc *sc;
struct bcm_gpio_sysctl *sc_sysctl;
uint32_t nfunc;
int error;
sc_sysctl = arg1;
sc = sc_sysctl->sc;
/* Get the current pin function. */
nfunc = bcm_gpio_get_function(sc, sc_sysctl->pin);
bcm_gpio_func_str(nfunc, buf, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
/* Ignore changes on read-only pins. */
if (bcm_gpio_pin_is_ro(sc, sc_sysctl->pin))
return (0);
/* Parse the user supplied string and check for a valid pin function. */
if (bcm_gpio_str_func(buf, &nfunc) != 0)
return (EINVAL);
/* Update the pin alternate function. */
bcm_gpio_set_alternate(sc->sc_dev, sc_sysctl->pin, nfunc);
return (0);
}
static void
bcm_gpio_sysctl_init(struct bcm_gpio_softc *sc)
{
char pinbuf[3];
struct bcm_gpio_sysctl *sc_sysctl;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree_node, *pin_node, *pinN_node;
struct sysctl_oid_list *tree, *pin_tree, *pinN_tree;
int i;
/*
* Add per-pin sysctl tree/handlers.
*/
ctx = device_get_sysctl_ctx(sc->sc_dev);
tree_node = device_get_sysctl_tree(sc->sc_dev);
tree = SYSCTL_CHILDREN(tree_node);
pin_node = SYSCTL_ADD_NODE(ctx, tree, OID_AUTO, "pin",
CTLFLAG_RD, NULL, "GPIO Pins");
pin_tree = SYSCTL_CHILDREN(pin_node);
for (i = 0; i < sc->sc_gpio_npins; i++) {
snprintf(pinbuf, sizeof(pinbuf), "%d", i);
pinN_node = SYSCTL_ADD_NODE(ctx, pin_tree, OID_AUTO, pinbuf,
CTLFLAG_RD, NULL, "GPIO Pin");
pinN_tree = SYSCTL_CHILDREN(pinN_node);
sc->sc_sysctl[i].sc = sc;
sc_sysctl = &sc->sc_sysctl[i];
sc_sysctl->sc = sc;
sc_sysctl->pin = sc->sc_gpio_pins[i].gp_pin;
SYSCTL_ADD_PROC(ctx, pinN_tree, OID_AUTO, "function",
CTLFLAG_RW | CTLTYPE_STRING, sc_sysctl,
sizeof(struct bcm_gpio_sysctl), bcm_gpio_func_proc,
"A", "Pin Function");
}
}
static int
bcm_gpio_get_ro_pins(struct bcm_gpio_softc *sc, phandle_t node,
const char *propname, const char *label)
{
int i, need_comma, npins, range_start, range_stop;
pcell_t *pins;
/* Get the property data. */
npins = OF_getencprop_alloc(node, propname, sizeof(*pins),
(void **)&pins);
if (npins < 0)
return (-1);
if (npins == 0) {
OF_prop_free(pins);
return (0);
}
for (i = 0; i < npins; i++)
sc->sc_ro_pins[i + sc->sc_ro_npins] = pins[i];
sc->sc_ro_npins += npins;
need_comma = 0;
device_printf(sc->sc_dev, "%s pins: ", label);
range_start = range_stop = pins[0];
for (i = 1; i < npins; i++) {
if (pins[i] != range_stop + 1) {
if (need_comma)
printf(",");
if (range_start != range_stop)
printf("%d-%d", range_start, range_stop);
else
printf("%d", range_start);
range_start = range_stop = pins[i];
need_comma = 1;
} else
range_stop++;
}
if (need_comma)
printf(",");
if (range_start != range_stop)
printf("%d-%d.\n", range_start, range_stop);
else
printf("%d.\n", range_start);
OF_prop_free(pins);
return (0);
}
static int
bcm_gpio_get_reserved_pins(struct bcm_gpio_softc *sc)
{
char *name;
phandle_t gpio, node, reserved;
ssize_t len;
/* Get read-only pins if they're provided */
gpio = ofw_bus_get_node(sc->sc_dev);
if (bcm_gpio_get_ro_pins(sc, gpio, "broadcom,read-only",
"read-only") != 0)
return (0);
/* Traverse the GPIO subnodes to find the reserved pins node. */
reserved = 0;
node = OF_child(gpio);
while ((node != 0) && (reserved == 0)) {
len = OF_getprop_alloc(node, "name", 1, (void **)&name);
if (len == -1)
return (-1);
if (strcmp(name, "reserved") == 0)
reserved = node;
OF_prop_free(name);
node = OF_peer(node);
}
if (reserved == 0)
return (-1);
/* Get the reserved pins. */
if (bcm_gpio_get_ro_pins(sc, reserved, "broadcom,pins",
"reserved") != 0)
return (-1);
return (0);
}
static int
bcm_gpio_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "BCM2708/2835 GPIO controller");
return (BUS_PROBE_DEFAULT);
}
static int
bcm_gpio_intr_attach(device_t dev)
{
struct bcm_gpio_softc *sc;
/*
* Only the first two interrupt lines are used. The third line mirrors
* the second and the fourth is common to all banks.
*/
sc = device_get_softc(dev);
if (sc->sc_res[1] == NULL || sc->sc_res[2] == NULL)
return (-1);
if (bcm_gpio_pic_attach(sc) != 0) {
device_printf(dev, "unable to attach PIC\n");
return (-1);
}
if (bus_setup_intr(dev, sc->sc_res[1], INTR_TYPE_MISC | INTR_MPSAFE,
bcm_gpio_intr_bank0, NULL, sc, &sc->sc_intrhand[0]) != 0)
return (-1);
if (bus_setup_intr(dev, sc->sc_res[2], INTR_TYPE_MISC | INTR_MPSAFE,
bcm_gpio_intr_bank1, NULL, sc, &sc->sc_intrhand[1]) != 0)
return (-1);
return (0);
}
static void
bcm_gpio_intr_detach(device_t dev)
{
struct bcm_gpio_softc *sc;
sc = device_get_softc(dev);
if (sc->sc_intrhand[0] != NULL)
bus_teardown_intr(dev, sc->sc_res[1], sc->sc_intrhand[0]);
if (sc->sc_intrhand[1] != NULL)
bus_teardown_intr(dev, sc->sc_res[2], sc->sc_intrhand[1]);
bcm_gpio_pic_detach(sc);
}
static int
bcm_gpio_attach(device_t dev)
{
int i, j;
phandle_t gpio;
struct bcm_gpio_softc *sc;
uint32_t func;
if (bcm_gpio_sc != NULL)
return (ENXIO);
bcm_gpio_sc = sc = device_get_softc(dev);
sc->sc_dev = dev;
mtx_init(&sc->sc_mtx, "bcm gpio", "gpio", MTX_SPIN);
if (bus_alloc_resources(dev, bcm_gpio_res_spec, sc->sc_res) != 0) {
device_printf(dev, "cannot allocate resources\n");
goto fail;
}
sc->sc_bst = rman_get_bustag(sc->sc_res[0]);
sc->sc_bsh = rman_get_bushandle(sc->sc_res[0]);
/* Setup the GPIO interrupt handler. */
if (bcm_gpio_intr_attach(dev)) {
device_printf(dev, "unable to setup the gpio irq handler\n");
goto fail;
}
/* Find our node. */
gpio = ofw_bus_get_node(sc->sc_dev);
if (!OF_hasprop(gpio, "gpio-controller"))
/* Node is not a GPIO controller. */
goto fail;
/*
* Find the read-only pins. We must never touch these pins, or bad
* things could happen.
*/
if (bcm_gpio_get_reserved_pins(sc) == -1)
goto fail;
/* Initialize the software controlled pins. */
for (i = 0, j = 0; j < BCM_GPIO_PINS; j++) {
snprintf(sc->sc_gpio_pins[i].gp_name, GPIOMAXNAME,
"pin %d", j);
func = bcm_gpio_get_function(sc, j);
sc->sc_gpio_pins[i].gp_pin = j;
sc->sc_gpio_pins[i].gp_caps = BCM_GPIO_DEFAULT_CAPS;
sc->sc_gpio_pins[i].gp_flags = bcm_gpio_func_flag(func);
i++;
}
sc->sc_gpio_npins = i;
bcm_gpio_sysctl_init(sc);
sc->sc_busdev = gpiobus_attach_bus(dev);
if (sc->sc_busdev == NULL)
goto fail;
return (0);
fail:
bcm_gpio_intr_detach(dev);
bus_release_resources(dev, bcm_gpio_res_spec, sc->sc_res);
mtx_destroy(&sc->sc_mtx);
return (ENXIO);
}
static int
bcm_gpio_detach(device_t dev)
{
return (EBUSY);
}
static inline void
bcm_gpio_modify(struct bcm_gpio_softc *sc, uint32_t reg, uint32_t mask,
bool set_bits)
{
if (set_bits)
BCM_GPIO_SET_BITS(sc, reg, mask);
else
BCM_GPIO_CLEAR_BITS(sc, reg, mask);
}
static inline void
bcm_gpio_isrc_eoi(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
/* Write 1 to clear. */
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPEDS(bank), bgi->bgi_mask);
}
static inline bool
bcm_gpio_isrc_is_level(struct bcm_gpio_irqsrc *bgi)
{
return (bgi->bgi_mode == GPIO_INTR_LEVEL_LOW ||
bgi->bgi_mode == GPIO_INTR_LEVEL_HIGH);
}
static inline void
bcm_gpio_isrc_mask(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_LOCK(sc);
switch (bgi->bgi_mode) {
case GPIO_INTR_LEVEL_LOW:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_LEVEL_HIGH:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_RISING:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_FALLING:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_BOTH:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
}
BCM_GPIO_UNLOCK(sc);
}
static inline void
bcm_gpio_isrc_unmask(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_LOCK(sc);
switch (bgi->bgi_mode) {
case GPIO_INTR_LEVEL_LOW:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_LEVEL_HIGH:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_RISING:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_FALLING:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_BOTH:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
}
BCM_GPIO_UNLOCK(sc);
}
static int
bcm_gpio_intr_internal(struct bcm_gpio_softc *sc, uint32_t bank)
{
u_int irq;
struct bcm_gpio_irqsrc *bgi;
uint32_t reg;
/* Ignore spurious interrupts; only pins with pending events are handled. */
reg = BCM_GPIO_READ(sc, BCM_GPIO_GPEDS(bank));
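/*
 * GPEDS has one event-status bit per pin in this bank; dispatch each
 * pending pin in turn, clearing its bit from the local copy as we go.
 */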
while (reg != 0) {
irq = BCM_GPIO_PINS_PER_BANK * bank + ffs(reg) - 1;
bgi = sc->sc_isrcs + irq;
if (!bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
if (intr_isrc_dispatch(&bgi->bgi_isrc,
curthread->td_intr_frame) != 0) {
bcm_gpio_isrc_mask(sc, bgi);
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
device_printf(sc->sc_dev, "Stray irq %u disabled\n",
irq);
}
reg &= ~bgi->bgi_mask;
}
return (FILTER_HANDLED);
}
static int
bcm_gpio_intr_bank0(void *arg)
{
return (bcm_gpio_intr_internal(arg, 0));
}
static int
bcm_gpio_intr_bank1(void *arg)
{
return (bcm_gpio_intr_internal(arg, 1));
}
static int
bcm_gpio_pic_attach(struct bcm_gpio_softc *sc)
{
int error;
uint32_t irq;
const char *name;
name = device_get_nameunit(sc->sc_dev);
for (irq = 0; irq < BCM_GPIO_PINS; irq++) {
sc->sc_isrcs[irq].bgi_irq = irq;
sc->sc_isrcs[irq].bgi_mask = BCM_GPIO_MASK(irq);
sc->sc_isrcs[irq].bgi_mode = GPIO_INTR_CONFORM;
error = intr_isrc_register(&sc->sc_isrcs[irq].bgi_isrc,
sc->sc_dev, 0, "%s,%u", name, irq);
if (error != 0)
return (error); /* XXX deregister ISRCs */
}
if (intr_pic_register(sc->sc_dev,
OF_xref_from_node(ofw_bus_get_node(sc->sc_dev))) == NULL)
return (ENXIO);
return (0);
}
static int
bcm_gpio_pic_detach(struct bcm_gpio_softc *sc)
{
/*
* No procedure has been established yet for correctly detaching
* a PIC from a running system.
*/
device_printf(sc->sc_dev, "%s: not implemented yet\n", __func__);
return (EBUSY);
}
static void
bcm_gpio_pic_config_intr(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi,
uint32_t mode)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
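/*
 * Each mode maps onto the per-bank rising/falling-edge and high/low-level
 * enable registers; EDGE_BOTH enables both edge registers.
 */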
BCM_GPIO_LOCK(sc);
bcm_gpio_modify(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask,
mode == GPIO_INTR_EDGE_RISING || mode == GPIO_INTR_EDGE_BOTH);
bcm_gpio_modify(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_EDGE_FALLING || mode == GPIO_INTR_EDGE_BOTH);
bcm_gpio_modify(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_LEVEL_HIGH);
bcm_gpio_modify(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_LEVEL_LOW);
bgi->bgi_mode = mode;
BCM_GPIO_UNLOCK(sc);
}
static void
bcm_gpio_pic_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
bcm_gpio_isrc_mask(sc, bgi);
}
static void
bcm_gpio_pic_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
arm_irq_memory_barrier(bgi->bgi_irq);
bcm_gpio_isrc_unmask(sc, bgi);
}
static int
bcm_gpio_pic_map_fdt(struct bcm_gpio_softc *sc, struct intr_map_data_fdt *daf,
u_int *irqp, uint32_t *modep)
{
u_int irq;
- uint32_t mode, bank;
+ uint32_t mode;
/*
* The first cell is the interrupt number.
* The second cell is used to specify flags:
* bits[3:0] trigger type and level flags:
* 1 = low-to-high edge triggered.
* 2 = high-to-low edge triggered.
* 3 = both edges (1 | 2).
* 4 = active high level-sensitive.
* 8 = active low level-sensitive.
*/
if (daf->ncells != 2)
return (EINVAL);
irq = daf->cells[0];
if (irq >= BCM_GPIO_PINS || bcm_gpio_pin_is_ro(sc, irq))
return (EINVAL);
/* Only reasonable modes are supported. */
- bank = BCM_GPIO_BANK(irq);
if (daf->cells[1] == 1)
mode = GPIO_INTR_EDGE_RISING;
else if (daf->cells[1] == 2)
mode = GPIO_INTR_EDGE_FALLING;
else if (daf->cells[1] == 3)
mode = GPIO_INTR_EDGE_BOTH;
else if (daf->cells[1] == 4)
mode = GPIO_INTR_LEVEL_HIGH;
else if (daf->cells[1] == 8)
mode = GPIO_INTR_LEVEL_LOW;
else
return (EINVAL);
*irqp = irq;
if (modep != NULL)
*modep = mode;
return (0);
}
static int
bcm_gpio_pic_map_gpio(struct bcm_gpio_softc *sc, struct intr_map_data_gpio *dag,
u_int *irqp, uint32_t *modep)
{
u_int irq;
uint32_t mode;
irq = dag->gpio_pin_num;
if (irq >= BCM_GPIO_PINS || bcm_gpio_pin_is_ro(sc, irq))
return (EINVAL);
mode = dag->gpio_intr_mode;
if (mode != GPIO_INTR_LEVEL_LOW && mode != GPIO_INTR_LEVEL_HIGH &&
mode != GPIO_INTR_EDGE_RISING && mode != GPIO_INTR_EDGE_FALLING &&
mode != GPIO_INTR_EDGE_BOTH)
return (EINVAL);
*irqp = irq;
if (modep != NULL)
*modep = mode;
return (0);
}
static int
bcm_gpio_pic_map(struct bcm_gpio_softc *sc, struct intr_map_data *data,
u_int *irqp, uint32_t *modep)
{
switch (data->type) {
case INTR_MAP_DATA_FDT:
return (bcm_gpio_pic_map_fdt(sc,
(struct intr_map_data_fdt *)data, irqp, modep));
case INTR_MAP_DATA_GPIO:
return (bcm_gpio_pic_map_gpio(sc,
(struct intr_map_data_gpio *)data, irqp, modep));
default:
return (ENOTSUP);
}
}
static int
bcm_gpio_pic_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
int error;
u_int irq;
struct bcm_gpio_softc *sc = device_get_softc(dev);
error = bcm_gpio_pic_map(sc, data, &irq, NULL);
if (error == 0)
*isrcp = &sc->sc_isrcs[irq].bgi_isrc;
return (error);
}
static void
bcm_gpio_pic_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
}
static void
bcm_gpio_pic_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
bcm_gpio_pic_enable_intr(dev, isrc);
}
static void
bcm_gpio_pic_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
bcm_gpio_isrc_mask(sc, bgi);
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
}
static int
bcm_gpio_pic_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
u_int irq;
uint32_t mode;
struct bcm_gpio_softc *sc;
struct bcm_gpio_irqsrc *bgi;
if (data == NULL)
return (ENOTSUP);
sc = device_get_softc(dev);
bgi = (struct bcm_gpio_irqsrc *)isrc;
/* Get and check config for an interrupt. */
if (bcm_gpio_pic_map(sc, data, &irq, &mode) != 0 || bgi->bgi_irq != irq)
return (EINVAL);
/*
* If this is a setup for another handler,
* only check that its configuration matches.
*/
if (isrc->isrc_handlers != 0)
return (bgi->bgi_mode == mode ? 0 : EINVAL);
bcm_gpio_pic_config_intr(sc, bgi, mode);
return (0);
}
static int
bcm_gpio_pic_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
if (isrc->isrc_handlers == 0)
bcm_gpio_pic_config_intr(sc, bgi, GPIO_INTR_CONFORM);
return (0);
}
static phandle_t
bcm_gpio_get_node(device_t bus, device_t dev)
{
/* We only have one child, the GPIO bus, which uses our own node. */
return (ofw_bus_get_node(bus));
}
static device_method_t bcm_gpio_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, bcm_gpio_probe),
DEVMETHOD(device_attach, bcm_gpio_attach),
DEVMETHOD(device_detach, bcm_gpio_detach),
/* GPIO protocol */
DEVMETHOD(gpio_get_bus, bcm_gpio_get_bus),
DEVMETHOD(gpio_pin_max, bcm_gpio_pin_max),
DEVMETHOD(gpio_pin_getname, bcm_gpio_pin_getname),
DEVMETHOD(gpio_pin_getflags, bcm_gpio_pin_getflags),
DEVMETHOD(gpio_pin_getcaps, bcm_gpio_pin_getcaps),
DEVMETHOD(gpio_pin_setflags, bcm_gpio_pin_setflags),
DEVMETHOD(gpio_pin_get, bcm_gpio_pin_get),
DEVMETHOD(gpio_pin_set, bcm_gpio_pin_set),
DEVMETHOD(gpio_pin_toggle, bcm_gpio_pin_toggle),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, bcm_gpio_pic_disable_intr),
DEVMETHOD(pic_enable_intr, bcm_gpio_pic_enable_intr),
DEVMETHOD(pic_map_intr, bcm_gpio_pic_map_intr),
DEVMETHOD(pic_post_filter, bcm_gpio_pic_post_filter),
DEVMETHOD(pic_post_ithread, bcm_gpio_pic_post_ithread),
DEVMETHOD(pic_pre_ithread, bcm_gpio_pic_pre_ithread),
DEVMETHOD(pic_setup_intr, bcm_gpio_pic_setup_intr),
DEVMETHOD(pic_teardown_intr, bcm_gpio_pic_teardown_intr),
/* ofw_bus interface */
DEVMETHOD(ofw_bus_get_node, bcm_gpio_get_node),
DEVMETHOD_END
};
static devclass_t bcm_gpio_devclass;
static driver_t bcm_gpio_driver = {
"gpio",
bcm_gpio_methods,
sizeof(struct bcm_gpio_softc),
};
DRIVER_MODULE(bcm_gpio, simplebus, bcm_gpio_driver, bcm_gpio_devclass, 0, 0);
Index: head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c (revision 327173)
@@ -1,538 +1,542 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Oleksandr Tymoshenko <gonzo@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/rman.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox_prop.h>
#include <arm/broadcom/bcm2835/bcm2835_vcbus.h>
#include "mbox_if.h"
#define REG_READ 0x00
#define REG_POL 0x10
#define REG_SENDER 0x14
#define REG_STATUS 0x18
#define STATUS_FULL 0x80000000
#define STATUS_EMPTY 0x40000000
#define REG_CONFIG 0x1C
#define CONFIG_DATA_IRQ 0x00000001
#define REG_WRITE 0x20 /* This is Mailbox 1 address */
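/*
 * A mailbox word packs the channel number into the low 4 bits and the
 * 16-byte-aligned data into the upper 28 bits.
 */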
#define MBOX_MSG(chan, data) (((data) & ~0xf) | ((chan) & 0xf))
#define MBOX_CHAN(msg) ((msg) & 0xf)
#define MBOX_DATA(msg) ((msg) & ~0xf)
#define MBOX_LOCK(sc) do { \
mtx_lock(&(sc)->lock); \
} while(0)
#define MBOX_UNLOCK(sc) do { \
mtx_unlock(&(sc)->lock); \
} while(0)
#ifdef DEBUG
#define dprintf(fmt, args...) printf(fmt, ##args)
#else
#define dprintf(fmt, args...)
#endif
struct bcm_mbox_softc {
struct mtx lock;
struct resource * mem_res;
struct resource * irq_res;
void* intr_hl;
bus_space_tag_t bst;
bus_space_handle_t bsh;
int msg[BCM2835_MBOX_CHANS];
int have_message[BCM2835_MBOX_CHANS];
struct sx property_chan_lock;
};
#define mbox_read_4(sc, reg) \
bus_space_read_4((sc)->bst, (sc)->bsh, reg)
#define mbox_write_4(sc, reg, val) \
bus_space_write_4((sc)->bst, (sc)->bsh, reg, val)
static struct ofw_compat_data compat_data[] = {
{"broadcom,bcm2835-mbox", 1},
{"brcm,bcm2835-mbox", 1},
{NULL, 0}
};
static int
bcm_mbox_read_msg(struct bcm_mbox_softc *sc, int *ochan)
{
+#ifdef DEBUG
uint32_t data;
+#endif
uint32_t msg;
int chan;
msg = mbox_read_4(sc, REG_READ);
dprintf("bcm_mbox_intr: raw data %08x\n", msg);
chan = MBOX_CHAN(msg);
+#ifdef DEBUG
data = MBOX_DATA(msg);
+#endif
if (sc->msg[chan]) {
printf("bcm_mbox_intr: channel %d oveflow\n", chan);
return (1);
}
dprintf("bcm_mbox_intr: chan %d, data %08x\n", chan, data);
sc->msg[chan] = msg;
if (ochan != NULL)
*ochan = chan;
return (0);
}
static void
bcm_mbox_intr(void *arg)
{
struct bcm_mbox_softc *sc = arg;
int chan;
MBOX_LOCK(sc);
while (!(mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY))
if (bcm_mbox_read_msg(sc, &chan) == 0) {
sc->have_message[chan] = 1;
wakeup(&sc->have_message[chan]);
}
MBOX_UNLOCK(sc);
}
static int
bcm_mbox_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "BCM2835 VideoCore Mailbox");
return (BUS_PROBE_DEFAULT);
}
static int
bcm_mbox_attach(device_t dev)
{
struct bcm_mbox_softc *sc = device_get_softc(dev);
int i;
int rid = 0;
sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (sc->mem_res == NULL) {
device_printf(dev, "could not allocate memory resource\n");
return (ENXIO);
}
sc->bst = rman_get_bustag(sc->mem_res);
sc->bsh = rman_get_bushandle(sc->mem_res);
rid = 0;
sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE);
if (sc->irq_res == NULL) {
device_printf(dev, "could not allocate interrupt resource\n");
return (ENXIO);
}
/* Set up and enable the mailbox interrupt */
if (bus_setup_intr(dev, sc->irq_res, INTR_MPSAFE | INTR_TYPE_MISC,
NULL, bcm_mbox_intr, sc, &sc->intr_hl) != 0) {
bus_release_resource(dev, SYS_RES_IRQ, rid, sc->irq_res);
device_printf(dev, "Unable to setup the clock irq handler.\n");
return (ENXIO);
}
mtx_init(&sc->lock, "vcio mbox", NULL, MTX_DEF);
for (i = 0; i < BCM2835_MBOX_CHANS; i++) {
sc->msg[i] = 0;
sc->have_message[i] = 0;
}
sx_init(&sc->property_chan_lock, "mboxprop");
/* Read all pending messages */
while ((mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY) == 0)
(void)mbox_read_4(sc, REG_READ);
mbox_write_4(sc, REG_CONFIG, CONFIG_DATA_IRQ);
return (0);
}
/*
* Mailbox API
*/
static int
bcm_mbox_write(device_t dev, int chan, uint32_t data)
{
int limit = 1000;
struct bcm_mbox_softc *sc = device_get_softc(dev);
dprintf("bcm_mbox_write: chan %d, data %08x\n", chan, data);
MBOX_LOCK(sc);
sc->have_message[chan] = 0;
while ((mbox_read_4(sc, REG_STATUS) & STATUS_FULL) && --limit)
DELAY(5);
if (limit == 0) {
printf("bcm_mbox_write: STATUS_FULL stuck");
MBOX_UNLOCK(sc);
return (EAGAIN);
}
mbox_write_4(sc, REG_WRITE, MBOX_MSG(chan, data));
MBOX_UNLOCK(sc);
return (0);
}
static int
bcm_mbox_read(device_t dev, int chan, uint32_t *data)
{
struct bcm_mbox_softc *sc = device_get_softc(dev);
int err, read_chan;
dprintf("bcm_mbox_read: chan %d\n", chan);
err = 0;
MBOX_LOCK(sc);
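/*
 * Once interrupts are running, sleep until the interrupt handler stores
 * our message; during early boot (cold), poll the FIFO directly.
 */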
if (!cold) {
if (sc->have_message[chan] == 0) {
if (mtx_sleep(&sc->have_message[chan], &sc->lock, 0,
"mbox", 10*hz) != 0) {
device_printf(dev, "timeout waiting for message on chan %d\n", chan);
err = ETIMEDOUT;
}
}
} else {
do {
/* Wait for a message */
while ((mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY))
;
/* Read the message */
if (bcm_mbox_read_msg(sc, &read_chan) != 0) {
err = EINVAL;
goto out;
}
} while (read_chan != chan);
}
/*
* Get the data stored by the interrupt handler; no new message on this
* channel can arrive while we hold the softc lock.
*/
*data = MBOX_DATA(sc->msg[chan]);
sc->msg[chan] = 0;
sc->have_message[chan] = 0;
out:
MBOX_UNLOCK(sc);
dprintf("bcm_mbox_read: chan %d, data %08x\n", chan, *data);
return (err);
}
static device_method_t bcm_mbox_methods[] = {
DEVMETHOD(device_probe, bcm_mbox_probe),
DEVMETHOD(device_attach, bcm_mbox_attach),
DEVMETHOD(mbox_read, bcm_mbox_read),
DEVMETHOD(mbox_write, bcm_mbox_write),
DEVMETHOD_END
};
static driver_t bcm_mbox_driver = {
"mbox",
bcm_mbox_methods,
sizeof(struct bcm_mbox_softc),
};
static devclass_t bcm_mbox_devclass;
DRIVER_MODULE(mbox, simplebus, bcm_mbox_driver, bcm_mbox_devclass, 0, 0);
static void
bcm2835_mbox_dma_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
{
bus_addr_t *addr;
if (err)
return;
addr = (bus_addr_t *)arg;
*addr = PHYS_TO_VCBUS(segs[0].ds_addr);
}
static void *
bcm2835_mbox_init_dma(device_t dev, size_t len, bus_dma_tag_t *tag,
bus_dmamap_t *map, bus_addr_t *phys)
{
void *buf;
int err;
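/*
 * The buffer must be 16-byte aligned: the low 4 bits of the address
 * passed to the mailbox carry the channel number.
 */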
err = bus_dma_tag_create(bus_get_dma_tag(dev), 16, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
len, 1, len, 0, NULL, NULL, tag);
if (err != 0) {
device_printf(dev, "can't create DMA tag\n");
return (NULL);
}
err = bus_dmamem_alloc(*tag, &buf, 0, map);
if (err != 0) {
bus_dma_tag_destroy(*tag);
device_printf(dev, "can't allocate dmamem\n");
return (NULL);
}
err = bus_dmamap_load(*tag, *map, buf, len, bcm2835_mbox_dma_cb,
phys, 0);
if (err != 0) {
bus_dmamem_free(*tag, buf, *map);
bus_dma_tag_destroy(*tag);
device_printf(dev, "can't load DMA map\n");
return (NULL);
}
return (buf);
}
static int
bcm2835_mbox_err(device_t dev, bus_addr_t msg_phys, uint32_t resp_phys,
struct bcm2835_mbox_hdr *msg, size_t len)
{
int idx;
struct bcm2835_mbox_tag_hdr *tag;
uint8_t *last;
if ((uint32_t)msg_phys != resp_phys) {
device_printf(dev, "response channel mismatch\n");
return (EIO);
}
if (msg->code != BCM2835_MBOX_CODE_RESP_SUCCESS) {
device_printf(dev, "mbox response error\n");
return (EIO);
}
/* Loop until the end tag. */
tag = (struct bcm2835_mbox_tag_hdr *)(msg + 1);
last = (uint8_t *)msg + len;
for (idx = 0; tag->tag != 0; idx++) {
if ((tag->val_len & BCM2835_MBOX_TAG_VAL_LEN_RESPONSE) == 0) {
device_printf(dev, "tag %d response error\n", idx);
return (EIO);
}
/* Clear the response bit. */
tag->val_len &= ~BCM2835_MBOX_TAG_VAL_LEN_RESPONSE;
/* Next tag. */
tag = (struct bcm2835_mbox_tag_hdr *)((uint8_t *)tag +
sizeof(*tag) + tag->val_buf_size);
if ((uint8_t *)tag > last) {
device_printf(dev, "mbox buffer size error\n");
return (EIO);
}
}
return (0);
}
int
bcm2835_mbox_property(void *msg, size_t msg_size)
{
struct bcm_mbox_softc *sc;
struct msg_set_power_state *buf;
bus_dma_tag_t msg_tag;
bus_dmamap_t msg_map;
bus_addr_t msg_phys;
uint32_t reg;
device_t mbox;
int err;
/* get mbox device */
mbox = devclass_get_device(devclass_find("mbox"), 0);
if (mbox == NULL)
return (ENXIO);
sc = device_get_softc(mbox);
sx_xlock(&sc->property_chan_lock);
/* Allocate memory for the message */
buf = bcm2835_mbox_init_dma(mbox, msg_size, &msg_tag, &msg_map,
&msg_phys);
if (buf == NULL) {
err = ENOMEM;
goto out;
}
memcpy(buf, msg, msg_size);
bus_dmamap_sync(msg_tag, msg_map,
BUS_DMASYNC_PREWRITE);
MBOX_WRITE(mbox, BCM2835_MBOX_CHAN_PROP, (uint32_t)msg_phys);
MBOX_READ(mbox, BCM2835_MBOX_CHAN_PROP, &reg);
bus_dmamap_sync(msg_tag, msg_map,
BUS_DMASYNC_PREREAD);
memcpy(msg, buf, msg_size);
err = bcm2835_mbox_err(mbox, msg_phys, reg,
(struct bcm2835_mbox_hdr *)msg, msg_size);
bus_dmamap_unload(msg_tag, msg_map);
bus_dmamem_free(msg_tag, buf, msg_map);
bus_dma_tag_destroy(msg_tag);
out:
sx_xunlock(&sc->property_chan_lock);
return (err);
}
int
bcm2835_mbox_set_power_state(uint32_t device_id, boolean_t on)
{
struct msg_set_power_state msg;
int err;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_POWER_STATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.device_id = device_id;
msg.body.req.state = (on ? BCM2835_MBOX_POWER_ON : 0) |
BCM2835_MBOX_POWER_WAIT;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
return (err);
}
int
bcm2835_mbox_get_clock_rate(uint32_t clock_id, uint32_t *hz)
{
struct msg_get_clock_rate msg;
int err;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
*hz = msg.body.resp.rate_hz;
return (err);
}
int
bcm2835_mbox_fb_get_w_h(struct bcm2835_fb_config *fb)
{
int err;
struct msg_fb_get_w_h msg;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
BCM2835_MBOX_INIT_TAG(&msg.physical_w_h, GET_PHYSICAL_W_H);
msg.physical_w_h.tag_hdr.val_len = 0;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err == 0) {
fb->xres = msg.physical_w_h.body.resp.width;
fb->yres = msg.physical_w_h.body.resp.height;
}
return (err);
}
int
bcm2835_mbox_fb_init(struct bcm2835_fb_config *fb)
{
int err;
struct msg_fb_setup msg;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
BCM2835_MBOX_INIT_TAG(&msg.physical_w_h, SET_PHYSICAL_W_H);
msg.physical_w_h.body.req.width = fb->xres;
msg.physical_w_h.body.req.height = fb->yres;
BCM2835_MBOX_INIT_TAG(&msg.virtual_w_h, SET_VIRTUAL_W_H);
msg.virtual_w_h.body.req.width = fb->vxres;
msg.virtual_w_h.body.req.height = fb->vyres;
BCM2835_MBOX_INIT_TAG(&msg.offset, SET_VIRTUAL_OFFSET);
msg.offset.body.req.x = fb->xoffset;
msg.offset.body.req.y = fb->yoffset;
BCM2835_MBOX_INIT_TAG(&msg.depth, SET_DEPTH);
msg.depth.body.req.bpp = fb->bpp;
BCM2835_MBOX_INIT_TAG(&msg.alpha, SET_ALPHA_MODE);
msg.alpha.body.req.alpha = BCM2835_MBOX_ALPHA_MODE_IGNORED;
BCM2835_MBOX_INIT_TAG(&msg.buffer, ALLOCATE_BUFFER);
msg.buffer.body.req.alignment = PAGE_SIZE;
BCM2835_MBOX_INIT_TAG(&msg.pitch, GET_PITCH);
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err == 0) {
fb->xres = msg.physical_w_h.body.resp.width;
fb->yres = msg.physical_w_h.body.resp.height;
fb->vxres = msg.virtual_w_h.body.resp.width;
fb->vyres = msg.virtual_w_h.body.resp.height;
fb->xoffset = msg.offset.body.resp.x;
fb->yoffset = msg.offset.body.resp.y;
fb->pitch = msg.pitch.body.resp.pitch;
fb->base = VCBUS_TO_PHYS(msg.buffer.body.resp.fb_address);
fb->size = msg.buffer.body.resp.fb_size;
}
return (err);
}
Index: head/sys/arm64/arm64/gic_v3.c
===================================================================
--- head/sys/arm64/arm64/gic_v3.c (revision 327172)
+++ head/sys/arm64/arm64/gic_v3.c (revision 327173)
@@ -1,1248 +1,1246 @@
/*-
* Copyright (c) 2015-2016 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under
* the sponsorship of the FreeBSD Foundation.
*
* This software was developed by Semihalf under
* the sponsorship of the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#ifdef FDT
#include <dev/fdt/fdt_intr.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include "pic_if.h"
#include <arm/arm/gic_common.h>
#include "gic_v3_reg.h"
#include "gic_v3_var.h"
static bus_get_domain_t gic_v3_get_domain;
static bus_read_ivar_t gic_v3_read_ivar;
static pic_disable_intr_t gic_v3_disable_intr;
static pic_enable_intr_t gic_v3_enable_intr;
static pic_map_intr_t gic_v3_map_intr;
static pic_setup_intr_t gic_v3_setup_intr;
static pic_teardown_intr_t gic_v3_teardown_intr;
static pic_post_filter_t gic_v3_post_filter;
static pic_post_ithread_t gic_v3_post_ithread;
static pic_pre_ithread_t gic_v3_pre_ithread;
static pic_bind_intr_t gic_v3_bind_intr;
#ifdef SMP
static pic_init_secondary_t gic_v3_init_secondary;
static pic_ipi_send_t gic_v3_ipi_send;
static pic_ipi_setup_t gic_v3_ipi_setup;
#endif
static u_int gic_irq_cpu;
#ifdef SMP
static u_int sgi_to_ipi[GIC_LAST_SGI - GIC_FIRST_SGI + 1];
static u_int sgi_first_unused = GIC_FIRST_SGI;
#endif
static device_method_t gic_v3_methods[] = {
/* Device interface */
DEVMETHOD(device_detach, gic_v3_detach),
/* Bus interface */
DEVMETHOD(bus_get_domain, gic_v3_get_domain),
DEVMETHOD(bus_read_ivar, gic_v3_read_ivar),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, gic_v3_disable_intr),
DEVMETHOD(pic_enable_intr, gic_v3_enable_intr),
DEVMETHOD(pic_map_intr, gic_v3_map_intr),
DEVMETHOD(pic_setup_intr, gic_v3_setup_intr),
DEVMETHOD(pic_teardown_intr, gic_v3_teardown_intr),
DEVMETHOD(pic_post_filter, gic_v3_post_filter),
DEVMETHOD(pic_post_ithread, gic_v3_post_ithread),
DEVMETHOD(pic_pre_ithread, gic_v3_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, gic_v3_bind_intr),
DEVMETHOD(pic_init_secondary, gic_v3_init_secondary),
DEVMETHOD(pic_ipi_send, gic_v3_ipi_send),
DEVMETHOD(pic_ipi_setup, gic_v3_ipi_setup),
#endif
/* End */
DEVMETHOD_END
};
DEFINE_CLASS_0(gic, gic_v3_driver, gic_v3_methods,
sizeof(struct gic_v3_softc));
/*
* Driver-specific definitions.
*/
MALLOC_DEFINE(M_GIC_V3, "GICv3", GIC_V3_DEVSTR);
/*
* Helper functions and definitions.
*/
/* Destination registers, either Distributor or Re-Distributor */
enum gic_v3_xdist {
DIST = 0,
REDIST,
};
struct gic_v3_irqsrc {
struct intr_irqsrc gi_isrc;
uint32_t gi_irq;
enum intr_polarity gi_pol;
enum intr_trigger gi_trig;
};
/* Helper routines starting with gic_v3_ */
static int gic_v3_dist_init(struct gic_v3_softc *);
static int gic_v3_redist_alloc(struct gic_v3_softc *);
static int gic_v3_redist_find(struct gic_v3_softc *);
static int gic_v3_redist_init(struct gic_v3_softc *);
static int gic_v3_cpu_init(struct gic_v3_softc *);
static void gic_v3_wait_for_rwp(struct gic_v3_softc *, enum gic_v3_xdist);
/* A sequence of init functions for primary (boot) CPU */
typedef int (*gic_v3_initseq_t) (struct gic_v3_softc *);
/* Primary CPU initialization sequence */
static gic_v3_initseq_t gic_v3_primary_init[] = {
gic_v3_dist_init,
gic_v3_redist_alloc,
gic_v3_redist_init,
gic_v3_cpu_init,
NULL
};
#ifdef SMP
/* Secondary CPU initialization sequence */
static gic_v3_initseq_t gic_v3_secondary_init[] = {
gic_v3_redist_init,
gic_v3_cpu_init,
NULL
};
#endif
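/*
 * Accessors for the current CPU's Re-Distributor register frame.
 */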
uint32_t
gic_r_read_4(device_t dev, bus_size_t offset)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
return (bus_read_4(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset));
}
uint64_t
gic_r_read_8(device_t dev, bus_size_t offset)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
return (bus_read_8(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset));
}
void
gic_r_write_4(device_t dev, bus_size_t offset, uint32_t val)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
bus_write_4(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset, val);
}
void
gic_r_write_8(device_t dev, bus_size_t offset, uint64_t val)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
bus_write_8(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset, val);
}
/*
* Device interface.
*/
int
gic_v3_attach(device_t dev)
{
struct gic_v3_softc *sc;
gic_v3_initseq_t *init_func;
uint32_t typer;
int rid;
int err;
size_t i;
u_int irq;
const char *name;
sc = device_get_softc(dev);
sc->gic_registered = FALSE;
sc->dev = dev;
err = 0;
/* Initialize mutex */
mtx_init(&sc->gic_mtx, "GICv3 lock", NULL, MTX_SPIN);
/*
* Allocate array of struct resource.
* One entry for Distributor and all remaining for Re-Distributor.
*/
sc->gic_res = malloc(
sizeof(*sc->gic_res) * (sc->gic_redists.nregions + 1),
M_GIC_V3, M_WAITOK);
/* Now allocate corresponding resources */
for (i = 0, rid = 0; i < (sc->gic_redists.nregions + 1); i++, rid++) {
sc->gic_res[rid] = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE);
if (sc->gic_res[rid] == NULL)
return (ENXIO);
}
/*
* Distributor interface
*/
sc->gic_dist = sc->gic_res[0];
/*
* Re-Distributor interface
*/
/* Allocate space under region descriptions */
sc->gic_redists.regions = malloc(
sizeof(*sc->gic_redists.regions) * sc->gic_redists.nregions,
M_GIC_V3, M_WAITOK);
/* Fill-up bus_space information for each region. */
for (i = 0, rid = 1; i < sc->gic_redists.nregions; i++, rid++)
sc->gic_redists.regions[i] = sc->gic_res[rid];
/* Get the number of supported SPI interrupts */
typer = gic_d_read(sc, 4, GICD_TYPER);
sc->gic_nirqs = GICD_TYPER_I_NUM(typer);
if (sc->gic_nirqs > GIC_I_NUM_MAX)
sc->gic_nirqs = GIC_I_NUM_MAX;
sc->gic_irqs = malloc(sizeof(*sc->gic_irqs) * sc->gic_nirqs,
M_GIC_V3, M_WAITOK | M_ZERO);
name = device_get_nameunit(dev);
for (irq = 0; irq < sc->gic_nirqs; irq++) {
struct intr_irqsrc *isrc;
sc->gic_irqs[irq].gi_irq = irq;
sc->gic_irqs[irq].gi_pol = INTR_POLARITY_CONFORM;
sc->gic_irqs[irq].gi_trig = INTR_TRIGGER_CONFORM;
isrc = &sc->gic_irqs[irq].gi_isrc;
if (irq <= GIC_LAST_SGI) {
err = intr_isrc_register(isrc, sc->dev,
INTR_ISRCF_IPI, "%s,i%u", name, irq - GIC_FIRST_SGI);
} else if (irq <= GIC_LAST_PPI) {
err = intr_isrc_register(isrc, sc->dev,
INTR_ISRCF_PPI, "%s,p%u", name, irq - GIC_FIRST_PPI);
} else {
err = intr_isrc_register(isrc, sc->dev, 0,
"%s,s%u", name, irq - GIC_FIRST_SPI);
}
if (err != 0) {
/* XXX call intr_isrc_deregister() */
free(sc->gic_irqs, M_DEVBUF);
return (err);
}
}
/*
* Read the Peripheral ID2 register. This is an implementation
* defined register, but seems to be implemented in all GICv3
* parts and Linux expects it to be there.
*/
sc->gic_pidr2 = gic_d_read(sc, 4, GICD_PIDR2);
/* Get the number of supported interrupt identifier bits */
sc->gic_idbits = GICD_TYPER_IDBITS(typer);
if (bootverbose) {
device_printf(dev, "SPIs: %u, IDs: %u\n",
sc->gic_nirqs, (1 << sc->gic_idbits) - 1);
}
/* Train init sequence for boot CPU */
for (init_func = gic_v3_primary_init; *init_func != NULL; init_func++) {
err = (*init_func)(sc);
if (err != 0)
return (err);
}
return (0);
}
int
gic_v3_detach(device_t dev)
{
struct gic_v3_softc *sc;
size_t i;
int rid;
sc = device_get_softc(dev);
if (device_is_attached(dev)) {
/*
* XXX: We should probably deregister PIC
*/
if (sc->gic_registered)
panic("Trying to detach registered PIC");
}
for (rid = 0; rid < (sc->gic_redists.nregions + 1); rid++)
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->gic_res[rid]);
for (i = 0; i <= mp_maxid; i++)
free(sc->gic_redists.pcpu[i], M_GIC_V3);
free(sc->gic_res, M_GIC_V3);
free(sc->gic_redists.regions, M_GIC_V3);
return (0);
}
static int
gic_v3_get_domain(device_t dev, device_t child, int *domain)
{
struct gic_v3_devinfo *di;
di = device_get_ivars(child);
if (di->gic_domain < 0)
return (ENOENT);
*domain = di->gic_domain;
return (0);
}
static int
gic_v3_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case GICV3_IVAR_NIRQS:
*result = (NIRQ - sc->gic_nirqs) / sc->gic_nchildren;
return (0);
case GICV3_IVAR_REDIST_VADDR:
*result = (uintptr_t)rman_get_virtual(
sc->gic_redists.pcpu[PCPU_GET(cpuid)]);
return (0);
case GIC_IVAR_HW_REV:
KASSERT(
GICR_PIDR2_ARCH(sc->gic_pidr2) == GICR_PIDR2_ARCH_GICv3 ||
GICR_PIDR2_ARCH(sc->gic_pidr2) == GICR_PIDR2_ARCH_GICv4,
("gic_v3_read_ivar: Invalid GIC architecture: %d (%.08X)",
GICR_PIDR2_ARCH(sc->gic_pidr2), sc->gic_pidr2));
*result = GICR_PIDR2_ARCH(sc->gic_pidr2);
return (0);
case GIC_IVAR_BUS:
KASSERT(sc->gic_bus != GIC_BUS_UNKNOWN,
("gic_v3_read_ivar: Unknown bus type"));
KASSERT(sc->gic_bus <= GIC_BUS_MAX,
("gic_v3_read_ivar: Invalid bus type %u", sc->gic_bus));
*result = sc->gic_bus;
return (0);
}
return (ENOENT);
}
int
arm_gic_v3_intr(void *arg)
{
struct gic_v3_softc *sc = arg;
struct gic_v3_irqsrc *gi;
struct intr_pic *pic;
uint64_t active_irq;
struct trapframe *tf;
- bool first;
- first = true;
pic = sc->gic_pic;
while (1) {
if (CPU_MATCH_ERRATA_CAVIUM_THUNDER_1_1) {
/*
* Hardware: Cavium ThunderX
* Chip revision: Pass 1.0 (early version)
* Pass 1.1 (production)
* ERRATUM: 22978, 23154
*/
__asm __volatile(
"nop;nop;nop;nop;nop;nop;nop;nop; \n"
"mrs %0, ICC_IAR1_EL1 \n"
"nop;nop;nop;nop; \n"
"dsb sy \n"
: "=&r" (active_irq));
} else {
active_irq = gic_icc_read(IAR1);
}
if (active_irq >= GIC_FIRST_LPI) {
intr_child_irq_handler(pic, active_irq);
continue;
}
if (__predict_false(active_irq >= sc->gic_nirqs))
return (FILTER_HANDLED);
tf = curthread->td_intr_frame;
gi = &sc->gic_irqs[active_irq];
if (active_irq <= GIC_LAST_SGI) {
/* Call EOI for all IPI before dispatch. */
gic_icc_write(EOIR1, (uint64_t)active_irq);
#ifdef SMP
intr_ipi_dispatch(sgi_to_ipi[gi->gi_irq], tf);
#else
device_printf(sc->dev, "SGI %ju on UP system detected\n",
(uintmax_t)(active_irq - GIC_FIRST_SGI));
#endif
} else if (active_irq >= GIC_FIRST_PPI &&
active_irq <= GIC_LAST_SPI) {
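/*
 * Edge-triggered interrupts are EOId before dispatch so a new
 * edge arriving while the handler runs is not lost; level
 * interrupts are EOId later in the post_filter/pre_ithread hooks.
 */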
if (gi->gi_trig == INTR_TRIGGER_EDGE)
gic_icc_write(EOIR1, gi->gi_irq);
if (intr_isrc_dispatch(&gi->gi_isrc, tf) != 0) {
if (gi->gi_trig != INTR_TRIGGER_EDGE)
gic_icc_write(EOIR1, gi->gi_irq);
gic_v3_disable_intr(sc->dev, &gi->gi_isrc);
device_printf(sc->dev,
"Stray irq %lu disabled\n", active_irq);
}
}
}
}
#ifdef FDT
static int
gic_map_fdt(device_t dev, u_int ncells, pcell_t *cells, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
u_int irq;
if (ncells < 3)
return (EINVAL);
/*
* The 1st cell is the interrupt type:
* 0 = SPI
* 1 = PPI
* The 2nd cell contains the interrupt number:
* [0 - 987] for SPI
* [0 - 15] for PPI
* The 3rd cell is the flags, encoded as follows:
* bits[3:0] trigger type and level flags
* 1 = edge triggered
* 2 = edge triggered (PPI only)
* 4 = level-sensitive
* 8 = level-sensitive (PPI only)
*/
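/*
 * As an illustration, a devicetree entry such as
 * interrupts = <0 23 4> (an SPI) maps to
 * irq = GIC_FIRST_SPI + 23 = 55 with INTR_TRIGGER_LEVEL and
 * INTR_POLARITY_HIGH.
 */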
switch (cells[0]) {
case 0:
irq = GIC_FIRST_SPI + cells[1];
/* SPI irq is checked later. */
break;
case 1:
irq = GIC_FIRST_PPI + cells[1];
if (irq > GIC_LAST_PPI) {
device_printf(dev, "unsupported PPI interrupt "
"number %u\n", cells[1]);
return (EINVAL);
}
break;
default:
device_printf(dev, "unsupported interrupt type "
"configuration %u\n", cells[0]);
return (EINVAL);
}
switch (cells[2] & FDT_INTR_MASK) {
case FDT_INTR_EDGE_RISING:
*trigp = INTR_TRIGGER_EDGE;
*polp = INTR_POLARITY_HIGH;
break;
case FDT_INTR_EDGE_FALLING:
*trigp = INTR_TRIGGER_EDGE;
*polp = INTR_POLARITY_LOW;
break;
case FDT_INTR_LEVEL_HIGH:
*trigp = INTR_TRIGGER_LEVEL;
*polp = INTR_POLARITY_HIGH;
break;
case FDT_INTR_LEVEL_LOW:
*trigp = INTR_TRIGGER_LEVEL;
*polp = INTR_POLARITY_LOW;
break;
default:
device_printf(dev, "unsupported trigger/polarity "
"configuration 0x%02x\n", cells[2]);
return (EINVAL);
}
/* Check the interrupt is valid */
if (irq >= GIC_FIRST_SPI && *polp != INTR_POLARITY_HIGH)
return (EINVAL);
*irqp = irq;
return (0);
}
#endif
static int
gic_map_msi(device_t dev, struct intr_map_data_msi *msi_data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_v3_irqsrc *gi;
/* SPI-mapped MSI */
gi = (struct gic_v3_irqsrc *)msi_data->isrc;
if (gi == NULL)
return (ENXIO);
*irqp = gi->gi_irq;
/* MSI/MSI-X interrupts are always edge triggered with high polarity */
*polp = INTR_POLARITY_HIGH;
*trigp = INTR_TRIGGER_EDGE;
return (0);
}
static int
do_gic_v3_map_intr(device_t dev, struct intr_map_data *data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_v3_softc *sc;
enum intr_polarity pol;
enum intr_trigger trig;
struct intr_map_data_msi *dam;
#ifdef FDT
struct intr_map_data_fdt *daf;
#endif
u_int irq;
sc = device_get_softc(dev);
switch (data->type) {
#ifdef FDT
case INTR_MAP_DATA_FDT:
daf = (struct intr_map_data_fdt *)data;
if (gic_map_fdt(dev, daf->ncells, daf->cells, &irq, &pol,
&trig) != 0)
return (EINVAL);
break;
#endif
case INTR_MAP_DATA_MSI:
/* SPI-mapped MSI */
dam = (struct intr_map_data_msi *)data;
if (gic_map_msi(dev, dam, &irq, &pol, &trig) != 0)
return (EINVAL);
break;
default:
return (EINVAL);
}
if (irq >= sc->gic_nirqs)
return (EINVAL);
switch (pol) {
case INTR_POLARITY_CONFORM:
case INTR_POLARITY_LOW:
case INTR_POLARITY_HIGH:
break;
default:
return (EINVAL);
}
switch (trig) {
case INTR_TRIGGER_CONFORM:
case INTR_TRIGGER_EDGE:
case INTR_TRIGGER_LEVEL:
break;
default:
return (EINVAL);
}
*irqp = irq;
if (polp != NULL)
*polp = pol;
if (trigp != NULL)
*trigp = trig;
return (0);
}
static int
gic_v3_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
struct gic_v3_softc *sc;
int error;
u_int irq;
error = do_gic_v3_map_intr(dev, data, &irq, NULL, NULL);
if (error == 0) {
sc = device_get_softc(dev);
*isrcp = GIC_INTR_ISRC(sc, irq);
}
return (error);
}
static int
gic_v3_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_v3_softc *sc = device_get_softc(dev);
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
enum intr_trigger trig;
enum intr_polarity pol;
uint32_t reg;
u_int irq;
int error;
if (data == NULL)
return (ENOTSUP);
error = do_gic_v3_map_intr(dev, data, &irq, &pol, &trig);
if (error != 0)
return (error);
if (gi->gi_irq != irq || pol == INTR_POLARITY_CONFORM ||
trig == INTR_TRIGGER_CONFORM)
return (EINVAL);
/* Compare config if this is not first setup. */
if (isrc->isrc_handlers != 0) {
if (pol != gi->gi_pol || trig != gi->gi_trig)
return (EINVAL);
else
return (0);
}
gi->gi_pol = pol;
gi->gi_trig = trig;
/*
* XXX - In case that per CPU interrupt is going to be enabled in time
* when SMP is already started, we need some IPI call which
* enables it on other CPUs. Further, it's more complicated as
* pic_enable_source() and pic_disable_source() should act on
* per CPU basis only. Thus, it should be solved here somehow.
*/
if (isrc->isrc_flags & INTR_ISRCF_PPI)
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
if (irq >= GIC_FIRST_PPI && irq <= GIC_LAST_SPI) {
mtx_lock_spin(&sc->gic_mtx);
/* Set the trigger and polarity */
if (irq <= GIC_LAST_PPI)
reg = gic_r_read(sc, 4,
GICR_SGI_BASE_SIZE + GICD_ICFGR(irq));
else
reg = gic_d_read(sc, 4, GICD_ICFGR(irq));
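/*
 * The ICFGR registers hold two configuration bits per interrupt
 * (16 interrupts per 32-bit register); bit 1 of each field is 1
 * for edge-triggered and 0 for level-sensitive, hence the
 * 2 << ((irq % 16) * 2) mask below.
 */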
if (trig == INTR_TRIGGER_LEVEL)
reg &= ~(2 << ((irq % 16) * 2));
else
reg |= 2 << ((irq % 16) * 2);
if (irq <= GIC_LAST_PPI) {
gic_r_write(sc, 4,
GICR_SGI_BASE_SIZE + GICD_ICFGR(irq), reg);
gic_v3_wait_for_rwp(sc, REDIST);
} else {
gic_d_write(sc, 4, GICD_ICFGR(irq), reg);
gic_v3_wait_for_rwp(sc, DIST);
}
mtx_unlock_spin(&sc->gic_mtx);
gic_v3_bind_intr(dev, isrc);
}
return (0);
}
static int
gic_v3_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
if (isrc->isrc_handlers == 0) {
gi->gi_pol = INTR_POLARITY_CONFORM;
gi->gi_trig = INTR_TRIGGER_CONFORM;
}
return (0);
}
static void
gic_v3_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
u_int irq;
sc = device_get_softc(dev);
gi = (struct gic_v3_irqsrc *)isrc;
irq = gi->gi_irq;
if (irq <= GIC_LAST_PPI) {
/* SGIs and PPIs in corresponding Re-Distributor */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_ICENABLER(irq),
GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, REDIST);
} else if (irq >= GIC_FIRST_SPI && irq <= GIC_LAST_SPI) {
/* SPIs in distributor */
gic_d_write(sc, 4, GICD_ICENABLER(irq), GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, DIST);
} else
panic("%s: Unsupported IRQ %u", __func__, irq);
}
static void
gic_v3_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
u_int irq;
sc = device_get_softc(dev);
gi = (struct gic_v3_irqsrc *)isrc;
irq = gi->gi_irq;
if (irq <= GIC_LAST_PPI) {
/* SGIs and PPIs in corresponding Re-Distributor */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_ISENABLER(irq),
GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, REDIST);
} else if (irq >= GIC_FIRST_SPI && irq <= GIC_LAST_SPI) {
/* SPIs in distributor */
gic_d_write(sc, 4, GICD_ISENABLER(irq), GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, DIST);
} else
panic("%s: Unsupported IRQ %u", __func__, irq);
}
static void
gic_v3_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
gic_v3_disable_intr(dev, isrc);
gic_icc_write(EOIR1, gi->gi_irq);
}
static void
gic_v3_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
gic_v3_enable_intr(dev, isrc);
}
static void
gic_v3_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
if (gi->gi_trig == INTR_TRIGGER_EDGE)
return;
gic_icc_write(EOIR1, gi->gi_irq);
}
static int
gic_v3_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
int cpu;
gi = (struct gic_v3_irqsrc *)isrc;
if (gi->gi_irq <= GIC_LAST_PPI)
return (EINVAL);
KASSERT(gi->gi_irq >= GIC_FIRST_SPI && gi->gi_irq <= GIC_LAST_SPI,
("%s: Attempting to bind an invalid IRQ", __func__));
sc = device_get_softc(dev);
if (CPU_EMPTY(&isrc->isrc_cpu)) {
gic_irq_cpu = intr_irq_next_cpu(gic_irq_cpu, &all_cpus);
CPU_SETOF(gic_irq_cpu, &isrc->isrc_cpu);
gic_d_write(sc, 4, GICD_IROUTER(gi->gi_irq),
CPU_AFFINITY(gic_irq_cpu));
} else {
/*
* We can only bind to a single CPU so select
* the first CPU found.
*/
cpu = CPU_FFS(&isrc->isrc_cpu) - 1;
gic_d_write(sc, 4, GICD_IROUTER(gi->gi_irq), CPU_AFFINITY(cpu));
}
return (0);
}
#ifdef SMP
static void
gic_v3_init_secondary(device_t dev)
{
device_t child;
struct gic_v3_softc *sc;
gic_v3_initseq_t *init_func;
struct intr_irqsrc *isrc;
u_int cpu, irq;
int err, i;
sc = device_get_softc(dev);
cpu = PCPU_GET(cpuid);
/* Train init sequence for this secondary CPU */
for (init_func = gic_v3_secondary_init; *init_func != NULL;
init_func++) {
err = (*init_func)(sc);
if (err != 0) {
device_printf(dev,
"Could not initialize GIC for CPU%u\n", cpu);
return;
}
}
/* Unmask attached SGI interrupts. */
for (irq = GIC_FIRST_SGI; irq <= GIC_LAST_SGI; irq++) {
isrc = GIC_INTR_ISRC(sc, irq);
if (intr_isrc_init_on_cpu(isrc, cpu))
gic_v3_enable_intr(dev, isrc);
}
/* Unmask attached PPI interrupts. */
for (irq = GIC_FIRST_PPI; irq <= GIC_LAST_PPI; irq++) {
isrc = GIC_INTR_ISRC(sc, irq);
if (intr_isrc_init_on_cpu(isrc, cpu))
gic_v3_enable_intr(dev, isrc);
}
for (i = 0; i < sc->gic_nchildren; i++) {
child = sc->gic_children[i];
PIC_INIT_SECONDARY(child);
}
}
static void
gic_v3_ipi_send(device_t dev, struct intr_irqsrc *isrc, cpuset_t cpus,
u_int ipi)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
uint64_t aff, val, irq;
int i;
#define GIC_AFF_MASK (CPU_AFF3_MASK | CPU_AFF2_MASK | CPU_AFF1_MASK)
#define GIC_AFFINITY(i) (CPU_AFFINITY(i) & GIC_AFF_MASK)
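/*
 * ICC_SGI1R_EL1 takes one Aff3.Aff2.Aff1 group plus a 16-bit
 * target list of Aff0 values, so the CPU set is walked per
 * affinity group and a single register write is issued for each
 * group.
 */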
aff = GIC_AFFINITY(0);
irq = gi->gi_irq;
val = 0;
/* Iterate through all CPUs in set */
for (i = 0; i <= mp_maxid; i++) {
/* Move to the next affinity group */
if (aff != GIC_AFFINITY(i)) {
/* Send the IPI */
if (val != 0) {
gic_icc_write(SGI1R, val);
val = 0;
}
aff = GIC_AFFINITY(i);
}
/* Send the IPI to this cpu */
if (CPU_ISSET(i, &cpus)) {
#define ICC_SGI1R_AFFINITY(aff) \
(((uint64_t)CPU_AFF3(aff) << ICC_SGI1R_EL1_AFF3_SHIFT) | \
((uint64_t)CPU_AFF2(aff) << ICC_SGI1R_EL1_AFF2_SHIFT) | \
((uint64_t)CPU_AFF1(aff) << ICC_SGI1R_EL1_AFF1_SHIFT))
/* Set the affinity when the first at this level */
if (val == 0)
val = ICC_SGI1R_AFFINITY(aff) |
irq << ICC_SGI1R_EL1_SGIID_SHIFT;
/* Set the bit to send the IPI to the CPU */
val |= 1 << CPU_AFF0(CPU_AFFINITY(i));
}
}
/* Send the IPI to the last cpu affinity group */
if (val != 0)
gic_icc_write(SGI1R, val);
#undef GIC_AFF_MASK
#undef GIC_AFFINITY
}
static int
gic_v3_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc **isrcp)
{
struct intr_irqsrc *isrc;
struct gic_v3_softc *sc = device_get_softc(dev);
if (sgi_first_unused > GIC_LAST_SGI)
return (ENOSPC);
isrc = GIC_INTR_ISRC(sc, sgi_first_unused);
sgi_to_ipi[sgi_first_unused++] = ipi;
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
*isrcp = isrc;
return (0);
}
#endif /* SMP */
/*
* Helper routines
*/
static void
gic_v3_wait_for_rwp(struct gic_v3_softc *sc, enum gic_v3_xdist xdist)
{
struct resource *res;
u_int cpuid;
size_t us_left = 1000000;
cpuid = PCPU_GET(cpuid);
switch (xdist) {
case DIST:
res = sc->gic_dist;
break;
case REDIST:
res = sc->gic_redists.pcpu[cpuid];
break;
default:
KASSERT(0, ("%s: Attempt to wait for unknown RWP", __func__));
return;
}
while ((bus_read_4(res, GICD_CTLR) & GICD_CTLR_RWP) != 0) {
DELAY(1);
if (us_left-- == 0)
panic("GICD Register write pending for too long");
}
}
/* CPU interface. */
static __inline void
gic_v3_cpu_priority(uint64_t mask)
{
/* Set priority mask */
gic_icc_write(PMR, mask & ICC_PMR_EL1_PRIO_MASK);
}
static int
gic_v3_cpu_enable_sre(struct gic_v3_softc *sc)
{
uint64_t sre;
u_int cpuid;
cpuid = PCPU_GET(cpuid);
/*
* Set the SRE bit to enable access to GIC CPU interface
* via system registers.
*/
sre = READ_SPECIALREG(icc_sre_el1);
sre |= ICC_SRE_EL1_SRE;
WRITE_SPECIALREG(icc_sre_el1, sre);
isb();
/*
* Now ensure that the bit is set.
*/
sre = READ_SPECIALREG(icc_sre_el1);
if ((sre & ICC_SRE_EL1_SRE) == 0) {
/* We are done. This was disabled in EL2 */
device_printf(sc->dev, "ERROR: CPU%u cannot enable CPU interface "
"via system registers\n", cpuid);
return (ENXIO);
} else if (bootverbose) {
device_printf(sc->dev,
"CPU%u enabled CPU interface via system registers\n",
cpuid);
}
return (0);
}
static int
gic_v3_cpu_init(struct gic_v3_softc *sc)
{
int err;
/* Enable access to CPU interface via system registers */
err = gic_v3_cpu_enable_sre(sc);
if (err != 0)
return (err);
/* Priority mask to minimum - accept all interrupts */
gic_v3_cpu_priority(GIC_PRIORITY_MIN);
/* Disable EOI mode */
gic_icc_clear(CTLR, ICC_CTLR_EL1_EOIMODE);
/* Enable group 1 (Non-secure) interrupts */
gic_icc_set(IGRPEN1, ICC_IGRPEN0_EL1_EN);
return (0);
}
/* Distributor */
static int
gic_v3_dist_init(struct gic_v3_softc *sc)
{
uint64_t aff;
u_int i;
/*
* 1. Disable the Distributor
*/
gic_d_write(sc, 4, GICD_CTLR, 0);
gic_v3_wait_for_rwp(sc, DIST);
/*
* 2. Configure the Distributor
*/
/* Set all SPIs to be Group 1 Non-secure */
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_IGROUPRn)
gic_d_write(sc, 4, GICD_IGROUPR(i), 0xFFFFFFFF);
/* Set all global interrupts to be level triggered, active low. */
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_ICFGRn)
gic_d_write(sc, 4, GICD_ICFGR(i), 0x00000000);
/* Set priority to all shared interrupts */
for (i = GIC_FIRST_SPI;
i < sc->gic_nirqs; i += GICD_I_PER_IPRIORITYn) {
/* Set highest priority */
gic_d_write(sc, 4, GICD_IPRIORITYR(i), GIC_PRIORITY_MAX);
}
/*
* Disable all SPIs. PPIs and SGIs are left alone, as they are
* enabled via the Re-Distributor registers.
*/
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_ISENABLERn)
gic_d_write(sc, 4, GICD_ICENABLER(i), 0xFFFFFFFF);
gic_v3_wait_for_rwp(sc, DIST);
/*
* 3. Enable Distributor
*/
/* Enable Distributor with ARE, Group 1 */
gic_d_write(sc, 4, GICD_CTLR, GICD_CTLR_ARE_NS | GICD_CTLR_G1A |
GICD_CTLR_G1);
/*
* 4. Route all interrupts to boot CPU.
*/
aff = CPU_AFFINITY(0);
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i++)
gic_d_write(sc, 4, GICD_IROUTER(i), aff);
return (0);
}
/* Re-Distributor */
static int
gic_v3_redist_alloc(struct gic_v3_softc *sc)
{
u_int cpuid;
/* Allocate struct resource for all CPU's Re-Distributor registers */
for (cpuid = 0; cpuid <= mp_maxid; cpuid++)
if (CPU_ISSET(cpuid, &all_cpus) != 0)
sc->gic_redists.pcpu[cpuid] =
malloc(sizeof(*sc->gic_redists.pcpu[0]),
M_GIC_V3, M_WAITOK);
else
sc->gic_redists.pcpu[cpuid] = NULL;
return (0);
}
static int
gic_v3_redist_find(struct gic_v3_softc *sc)
{
struct resource r_res;
bus_space_handle_t r_bsh;
uint64_t aff;
uint64_t typer;
uint32_t pidr2;
u_int cpuid;
size_t i;
cpuid = PCPU_GET(cpuid);
aff = CPU_AFFINITY(cpuid);
/* Affinity in format for comparison with typer */
aff = (CPU_AFF3(aff) << 24) | (CPU_AFF2(aff) << 16) |
(CPU_AFF1(aff) << 8) | CPU_AFF0(aff);
if (bootverbose) {
device_printf(sc->dev,
"Start searching for Re-Distributor\n");
}
/* Iterate through Re-Distributor regions */
for (i = 0; i < sc->gic_redists.nregions; i++) {
/* Take a copy of the region's resource */
r_res = *sc->gic_redists.regions[i];
r_bsh = rman_get_bushandle(&r_res);
pidr2 = bus_read_4(&r_res, GICR_PIDR2);
switch (GICR_PIDR2_ARCH(pidr2)) {
case GICR_PIDR2_ARCH_GICv3: /* fall through */
case GICR_PIDR2_ARCH_GICv4:
break;
default:
device_printf(sc->dev,
"No Re-Distributor found for CPU%u\n", cpuid);
return (ENODEV);
}
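/*
 * Each Re-Distributor occupies an RD_BASE frame followed by an
 * SGI_BASE frame (plus two further frames when GICR_TYPER.VLPIS
 * is set), so step through the region frame by frame until
 * GICR_TYPER.Last marks the final Re-Distributor.
 */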
do {
typer = bus_read_8(&r_res, GICR_TYPER);
if ((typer >> GICR_TYPER_AFF_SHIFT) == aff) {
KASSERT(sc->gic_redists.pcpu[cpuid] != NULL,
("Invalid pointer to per-CPU redistributor"));
/* Copy res contents to its final destination */
*sc->gic_redists.pcpu[cpuid] = r_res;
if (bootverbose) {
device_printf(sc->dev,
"CPU%u Re-Distributor has been found\n",
cpuid);
}
return (0);
}
r_bsh += (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE);
if ((typer & GICR_TYPER_VLPIS) != 0) {
r_bsh +=
(GICR_VLPI_BASE_SIZE + GICR_RESERVED_SIZE);
}
rman_set_bushandle(&r_res, r_bsh);
} while ((typer & GICR_TYPER_LAST) == 0);
}
device_printf(sc->dev, "No Re-Distributor found for CPU%u\n", cpuid);
return (ENXIO);
}
static int
gic_v3_redist_wake(struct gic_v3_softc *sc)
{
uint32_t waker;
size_t us_left = 1000000;
waker = gic_r_read(sc, 4, GICR_WAKER);
/* Wake up Re-Distributor for this CPU */
waker &= ~GICR_WAKER_PS;
gic_r_write(sc, 4, GICR_WAKER, waker);
/*
* When clearing ProcessorSleep bit it is required to wait for
* ChildrenAsleep to become zero following the processor power-on.
*/
while ((gic_r_read(sc, 4, GICR_WAKER) & GICR_WAKER_CA) != 0) {
DELAY(1);
if (us_left-- == 0) {
panic("Could not wake Re-Distributor for CPU%u",
PCPU_GET(cpuid));
}
}
if (bootverbose) {
device_printf(sc->dev, "CPU%u Re-Distributor woke up\n",
PCPU_GET(cpuid));
}
return (0);
}
static int
gic_v3_redist_init(struct gic_v3_softc *sc)
{
int err;
size_t i;
err = gic_v3_redist_find(sc);
if (err != 0)
return (err);
err = gic_v3_redist_wake(sc);
if (err != 0)
return (err);
/* Configure SGIs and PPIs to be Group1 Non-secure */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_IGROUPR0,
0xFFFFFFFF);
/* Disable PPIs */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_ICENABLER0,
GICR_I_ENABLER_PPI_MASK);
/* Enable SGIs */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_ISENABLER0,
GICR_I_ENABLER_SGI_MASK);
/* Set priority for SGIs and PPIs */
for (i = 0; i <= GIC_LAST_PPI; i += GICR_I_PER_IPRIORITYn) {
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_IPRIORITYR(i),
GIC_PRIORITY_MAX);
}
gic_v3_wait_for_rwp(sc, REDIST);
return (0);
}
Index: head/sys/arm64/arm64/gicv3_its.c
===================================================================
--- head/sys/arm64/arm64/gicv3_its.c (revision 327172)
+++ head/sys/arm64/arm64/gicv3_its.c (revision 327173)
@@ -1,1694 +1,1688 @@
/*-
* Copyright (c) 2015-2016 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under
* the sponsorship of the FreeBSD Foundation.
*
* This software was developed by Semihalf under
* the sponsorship of the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/smp.h>
#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <arm/arm/gic_common.h>
#include <arm64/arm64/gic_v3_reg.h>
#include <arm64/arm64/gic_v3_var.h>
#ifdef FDT
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include "pcib_if.h"
#include "pic_if.h"
#include "msi_if.h"
MALLOC_DEFINE(M_GICV3_ITS, "GICv3 ITS",
"ARM GICv3 Interrupt Translation Service");
#define LPI_NIRQS (64 * 1024)
/* The size and alignment of the command circular buffer */
#define ITS_CMDQ_SIZE (64 * 1024) /* Must be a multiple of 4K */
#define ITS_CMDQ_ALIGN (64 * 1024)
#define LPI_CONFTAB_SIZE LPI_NIRQS
#define LPI_CONFTAB_ALIGN (64 * 1024)
#define LPI_CONFTAB_MAX_ADDR ((1ul << 48) - 1) /* We need a 47 bit PA */
/* 1 bit per SPI, PPI, and SGI (8k), and 1 bit per LPI (LPI_CONFTAB_SIZE) */
#define LPI_PENDTAB_SIZE ((LPI_NIRQS + GIC_FIRST_LPI) / 8)
#define LPI_PENDTAB_ALIGN (64 * 1024)
#define LPI_PENDTAB_MAX_ADDR ((1ul << 48) - 1) /* We need a 47 bit PA */
#define LPI_INT_TRANS_TAB_ALIGN 256
#define LPI_INT_TRANS_TAB_MAX_ADDR ((1ul << 48) - 1)
/* ITS commands encoding */
#define ITS_CMD_MOVI (0x01)
#define ITS_CMD_SYNC (0x05)
#define ITS_CMD_MAPD (0x08)
#define ITS_CMD_MAPC (0x09)
#define ITS_CMD_MAPTI (0x0a)
#define ITS_CMD_MAPI (0x0b)
#define ITS_CMD_INV (0x0c)
#define ITS_CMD_INVALL (0x0d)
/* Command */
#define CMD_COMMAND_MASK (0xFFUL)
/* PCI device ID */
#define CMD_DEVID_SHIFT (32)
#define CMD_DEVID_MASK (0xFFFFFFFFUL << CMD_DEVID_SHIFT)
/* Size of IRQ ID bitfield */
#define CMD_SIZE_MASK (0xFFUL)
/* Virtual LPI ID */
#define CMD_ID_MASK (0xFFFFFFFFUL)
/* Physical LPI ID */
#define CMD_PID_SHIFT (32)
#define CMD_PID_MASK (0xFFFFFFFFUL << CMD_PID_SHIFT)
/* Collection */
#define CMD_COL_MASK (0xFFFFUL)
/* Target (CPU or Re-Distributor) */
#define CMD_TARGET_SHIFT (16)
#define CMD_TARGET_MASK (0xFFFFFFFFUL << CMD_TARGET_SHIFT)
/* Interrupt Translation Table address */
#define CMD_ITT_MASK (0xFFFFFFFFFF00UL)
/* Valid command bit */
#define CMD_VALID_SHIFT (63)
#define CMD_VALID_MASK (1UL << CMD_VALID_SHIFT)
#define ITS_TARGET_NONE 0xFBADBEEF
/* LPI chunk owned by ITS device */
struct lpi_chunk {
u_int lpi_base;
u_int lpi_free; /* First free LPI in set */
u_int lpi_num; /* Total number of LPIs in chunk */
u_int lpi_busy; /* Number of busy LPIs in chunk */
};
/* ITS device */
struct its_dev {
TAILQ_ENTRY(its_dev) entry;
/* PCI device */
device_t pci_dev;
/* Device ID (i.e. PCI device ID) */
uint32_t devid;
/* List of assigned LPIs */
struct lpi_chunk lpis;
/* Virtual address of ITT */
vm_offset_t itt;
size_t itt_size;
};
/*
* ITS command descriptor.
* Idea for command description passing taken from Linux.
*/
struct its_cmd_desc {
uint8_t cmd_type;
union {
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t id;
} cmd_desc_movi;
struct {
struct its_col *col;
} cmd_desc_sync;
struct {
struct its_col *col;
uint8_t valid;
} cmd_desc_mapc;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
uint32_t id;
} cmd_desc_mapvi;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
} cmd_desc_mapi;
struct {
struct its_dev *its_dev;
uint8_t valid;
} cmd_desc_mapd;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
} cmd_desc_inv;
struct {
struct its_col *col;
} cmd_desc_invall;
};
};
/* ITS command. Each command is 32 bytes long */
struct its_cmd {
uint64_t cmd_dword[4]; /* ITS command double word */
};
/* An ITS private table */
struct its_ptable {
vm_offset_t ptab_vaddr;
unsigned long ptab_size;
};
/* ITS collection description. */
struct its_col {
uint64_t col_target; /* Target Re-Distributor */
uint64_t col_id; /* Collection ID */
};
struct gicv3_its_irqsrc {
struct intr_irqsrc gi_isrc;
u_int gi_irq;
struct its_dev *gi_its_dev;
};
struct gicv3_its_softc {
struct intr_pic *sc_pic;
struct resource *sc_its_res;
cpuset_t sc_cpus;
u_int gic_irq_cpu;
struct its_ptable sc_its_ptab[GITS_BASER_NUM];
struct its_col *sc_its_cols[MAXCPU]; /* Per-CPU collections */
/*
* TODO: We should get these from the parent as we only want a
* single copy of each across the interrupt controller.
*/
vm_offset_t sc_conf_base;
vm_offset_t sc_pend_base[MAXCPU];
/* Command handling */
struct mtx sc_its_cmd_lock;
struct its_cmd *sc_its_cmd_base; /* Command circular buffer address */
size_t sc_its_cmd_next_idx;
vmem_t *sc_irq_alloc;
struct gicv3_its_irqsrc *sc_irqs;
u_int sc_irq_base;
u_int sc_irq_length;
struct mtx sc_its_dev_lock;
TAILQ_HEAD(its_dev_list, its_dev) sc_its_dev_list;
#define ITS_FLAGS_CMDQ_FLUSH 0x00000001
#define ITS_FLAGS_LPI_CONF_FLUSH 0x00000002
#define ITS_FLAGS_ERRATA_CAVIUM_22375 0x00000004
u_int sc_its_flags;
};
typedef void (its_quirk_func_t)(device_t);
static its_quirk_func_t its_quirk_cavium_22375;
static const struct {
const char *desc;
uint32_t iidr;
uint32_t iidr_mask;
its_quirk_func_t *func;
} its_quirks[] = {
{
/* Cavium ThunderX Pass 1.x */
.desc = "Cavoum ThunderX errata: 22375, 24313",
.iidr = GITS_IIDR_RAW(GITS_IIDR_IMPL_CAVIUM,
GITS_IIDR_PROD_THUNDER, GITS_IIDR_VAR_THUNDER_1, 0),
.iidr_mask = ~GITS_IIDR_REVISION_MASK,
.func = its_quirk_cavium_22375,
},
};
#define gic_its_read_4(sc, reg) \
bus_read_4((sc)->sc_its_res, (reg))
#define gic_its_read_8(sc, reg) \
bus_read_8((sc)->sc_its_res, (reg))
#define gic_its_write_4(sc, reg, val) \
bus_write_4((sc)->sc_its_res, (reg), (val))
#define gic_its_write_8(sc, reg, val) \
bus_write_8((sc)->sc_its_res, (reg), (val))
static device_attach_t gicv3_its_attach;
static device_detach_t gicv3_its_detach;
static pic_disable_intr_t gicv3_its_disable_intr;
static pic_enable_intr_t gicv3_its_enable_intr;
static pic_map_intr_t gicv3_its_map_intr;
static pic_setup_intr_t gicv3_its_setup_intr;
static pic_post_filter_t gicv3_its_post_filter;
static pic_post_ithread_t gicv3_its_post_ithread;
static pic_pre_ithread_t gicv3_its_pre_ithread;
static pic_bind_intr_t gicv3_its_bind_intr;
#ifdef SMP
static pic_init_secondary_t gicv3_its_init_secondary;
#endif
static msi_alloc_msi_t gicv3_its_alloc_msi;
static msi_release_msi_t gicv3_its_release_msi;
static msi_alloc_msix_t gicv3_its_alloc_msix;
static msi_release_msix_t gicv3_its_release_msix;
static msi_map_msi_t gicv3_its_map_msi;
static void its_cmd_movi(device_t, struct gicv3_its_irqsrc *);
static void its_cmd_mapc(device_t, struct its_col *, uint8_t);
static void its_cmd_mapti(device_t, struct gicv3_its_irqsrc *);
static void its_cmd_mapd(device_t, struct its_dev *, uint8_t);
static void its_cmd_inv(device_t, struct its_dev *, struct gicv3_its_irqsrc *);
static void its_cmd_invall(device_t, struct its_col *);
static device_method_t gicv3_its_methods[] = {
/* Device interface */
DEVMETHOD(device_detach, gicv3_its_detach),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, gicv3_its_disable_intr),
DEVMETHOD(pic_enable_intr, gicv3_its_enable_intr),
DEVMETHOD(pic_map_intr, gicv3_its_map_intr),
DEVMETHOD(pic_setup_intr, gicv3_its_setup_intr),
DEVMETHOD(pic_post_filter, gicv3_its_post_filter),
DEVMETHOD(pic_post_ithread, gicv3_its_post_ithread),
DEVMETHOD(pic_pre_ithread, gicv3_its_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, gicv3_its_bind_intr),
DEVMETHOD(pic_init_secondary, gicv3_its_init_secondary),
#endif
/* MSI/MSI-X */
DEVMETHOD(msi_alloc_msi, gicv3_its_alloc_msi),
DEVMETHOD(msi_release_msi, gicv3_its_release_msi),
DEVMETHOD(msi_alloc_msix, gicv3_its_alloc_msix),
DEVMETHOD(msi_release_msix, gicv3_its_release_msix),
DEVMETHOD(msi_map_msi, gicv3_its_map_msi),
/* End */
DEVMETHOD_END
};
static DEFINE_CLASS_0(gic, gicv3_its_driver, gicv3_its_methods,
sizeof(struct gicv3_its_softc));
static void
gicv3_its_cmdq_init(struct gicv3_its_softc *sc)
{
vm_paddr_t cmd_paddr;
uint64_t reg, tmp;
/* Set up the command circular buffer */
sc->sc_its_cmd_base = contigmalloc(ITS_CMDQ_SIZE, M_GICV3_ITS,
M_WAITOK | M_ZERO, 0, (1ul << 48) - 1, ITS_CMDQ_ALIGN, 0);
sc->sc_its_cmd_next_idx = 0;
cmd_paddr = vtophys(sc->sc_its_cmd_base);
/* Set the base of the command buffer */
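/*
 * The low bits of GITS_CBASER encode the queue size as the number
 * of 4KB pages minus one: 64KB / 4KB - 1 = 15 here.
 */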
reg = GITS_CBASER_VALID |
(GITS_CBASER_CACHE_NIWAWB << GITS_CBASER_CACHE_SHIFT) |
cmd_paddr | (GITS_CBASER_SHARE_IS << GITS_CBASER_SHARE_SHIFT) |
(ITS_CMDQ_SIZE / 4096 - 1);
gic_its_write_8(sc, GITS_CBASER, reg);
/* Read back to check for fixed value fields */
tmp = gic_its_read_8(sc, GITS_CBASER);
if ((tmp & GITS_CBASER_SHARE_MASK) !=
(GITS_CBASER_SHARE_IS << GITS_CBASER_SHARE_SHIFT)) {
/* Check if the hardware reported non-shareable */
if ((tmp & GITS_CBASER_SHARE_MASK) ==
(GITS_CBASER_SHARE_NS << GITS_CBASER_SHARE_SHIFT)) {
/* If so remove the cache attribute */
reg &= ~GITS_CBASER_CACHE_MASK;
reg &= ~GITS_CBASER_SHARE_MASK;
/* Set to Non-cacheable, Non-shareable */
reg |= GITS_CBASER_CACHE_NIN << GITS_CBASER_CACHE_SHIFT;
reg |= GITS_CBASER_SHARE_NS << GITS_CBASER_SHARE_SHIFT;
gic_its_write_8(sc, GITS_CBASER, reg);
}
/* The command queue has to be flushed after each command */
sc->sc_its_flags |= ITS_FLAGS_CMDQ_FLUSH;
}
/* Get the next command from the start of the buffer */
gic_its_write_8(sc, GITS_CWRITER, 0x0);
}
static int
gicv3_its_table_init(device_t dev, struct gicv3_its_softc *sc)
{
vm_offset_t table;
vm_paddr_t paddr;
uint64_t cache, reg, share, tmp, type;
size_t esize, its_tbl_size, nidents, nitspages, npages;
int i, page_size;
int devbits;
if ((sc->sc_its_flags & ITS_FLAGS_ERRATA_CAVIUM_22375) != 0) {
/*
* GITS_TYPER[17:13] of ThunderX reports that device IDs
* are to be 21 bits in length. The entry size of the ITS
* table can be read from GITS_BASERn[52:48] and on ThunderX
* is supposed to be 8 bytes in length (for device table).
* Finally the page size that is to be used by ITS to access
* this table will be set to 64KB.
*
* This gives 0x200000 entries of size 0x8 bytes covered by
* 256 pages, each of which is 64KB in size. The number of pages
* (minus 1) should then be written to GITS_BASERn[7:0]. In
* that case this value would be 0xFF but on ThunderX the
* maximum value that HW accepts is 0xFD.
*
* Set an arbitrary number of device ID bits to 20 in order
* to limit the number of entries in ITS device table to
* 0x100000 and the table size to 8MB.
*/
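/*
 * With 20 device ID bits the device table takes
 * 2^20 entries * 8 bytes = 8MB, i.e. 128 64KB pages, well below
 * the 0xFD limit mentioned above.
 */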
devbits = 20;
cache = 0;
} else {
devbits = GITS_TYPER_DEVB(gic_its_read_8(sc, GITS_TYPER));
cache = GITS_BASER_CACHE_WAWB;
}
share = GITS_BASER_SHARE_IS;
page_size = PAGE_SIZE_64K;
for (i = 0; i < GITS_BASER_NUM; i++) {
reg = gic_its_read_8(sc, GITS_BASER(i));
/* The type of table */
type = GITS_BASER_TYPE(reg);
/* The table entry size */
esize = GITS_BASER_ESIZE(reg);
switch(type) {
case GITS_BASER_TYPE_DEV:
nidents = (1 << devbits);
its_tbl_size = esize * nidents;
its_tbl_size = roundup2(its_tbl_size, PAGE_SIZE_64K);
break;
case GITS_BASER_TYPE_VP:
case GITS_BASER_TYPE_PP: /* Undocumented? */
case GITS_BASER_TYPE_IC:
its_tbl_size = page_size;
break;
default:
continue;
}
npages = howmany(its_tbl_size, PAGE_SIZE);
/* Allocate the table */
table = (vm_offset_t)contigmalloc(npages * PAGE_SIZE,
M_GICV3_ITS, M_WAITOK | M_ZERO, 0, (1ul << 48) - 1,
PAGE_SIZE_64K, 0);
sc->sc_its_ptab[i].ptab_vaddr = table;
sc->sc_its_ptab[i].ptab_size = npages * PAGE_SIZE;
paddr = vtophys(table);
while (1) {
nitspages = howmany(its_tbl_size, page_size);
/* Clear the fields we will be setting */
reg &= ~(GITS_BASER_VALID |
GITS_BASER_CACHE_MASK | GITS_BASER_TYPE_MASK |
GITS_BASER_ESIZE_MASK | GITS_BASER_PA_MASK |
GITS_BASER_SHARE_MASK | GITS_BASER_PSZ_MASK |
GITS_BASER_SIZE_MASK);
/* Set the new values */
reg |= GITS_BASER_VALID |
(cache << GITS_BASER_CACHE_SHIFT) |
(type << GITS_BASER_TYPE_SHIFT) |
((esize - 1) << GITS_BASER_ESIZE_SHIFT) |
paddr | (share << GITS_BASER_SHARE_SHIFT) |
(nitspages - 1);
switch (page_size) {
case PAGE_SIZE: /* 4KB */
reg |=
GITS_BASER_PSZ_4K << GITS_BASER_PSZ_SHIFT;
break;
case PAGE_SIZE_16K: /* 16KB */
reg |=
GITS_BASER_PSZ_16K << GITS_BASER_PSZ_SHIFT;
break;
case PAGE_SIZE_64K: /* 64KB */
reg |=
GITS_BASER_PSZ_64K << GITS_BASER_PSZ_SHIFT;
break;
}
gic_its_write_8(sc, GITS_BASER(i), reg);
/* Read back to check */
tmp = gic_its_read_8(sc, GITS_BASER(i));
/* Do the shareability masks line up? */
if ((tmp & GITS_BASER_SHARE_MASK) !=
(reg & GITS_BASER_SHARE_MASK)) {
share = (tmp & GITS_BASER_SHARE_MASK) >>
GITS_BASER_SHARE_SHIFT;
continue;
}
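/*
 * If the requested page size did not stick either, retry with the
 * next smaller size: 64KB -> 16KB -> 4KB.
 */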
if ((tmp & GITS_BASER_PSZ_MASK) !=
(reg & GITS_BASER_PSZ_MASK)) {
switch (page_size) {
case PAGE_SIZE_16K:
page_size = PAGE_SIZE;
continue;
case PAGE_SIZE_64K:
page_size = PAGE_SIZE_16K;
continue;
}
}
if (tmp != reg) {
device_printf(dev, "GITS_BASER%d: "
"unable to be updated: %lx != %lx\n",
i, reg, tmp);
return (ENXIO);
}
/* We should have made all needed changes */
break;
}
}
return (0);
}
static void
gicv3_its_conftable_init(struct gicv3_its_softc *sc)
{
sc->sc_conf_base = (vm_offset_t)contigmalloc(LPI_CONFTAB_SIZE,
M_GICV3_ITS, M_WAITOK, 0, LPI_CONFTAB_MAX_ADDR, LPI_CONFTAB_ALIGN,
0);
/* Set the default configuration */
memset((void *)sc->sc_conf_base, GIC_PRIORITY_MAX | LPI_CONF_GROUP1,
LPI_CONFTAB_SIZE);
/* Flush the table to memory */
cpu_dcache_wb_range(sc->sc_conf_base, LPI_CONFTAB_SIZE);
}
static void
gicv3_its_pendtables_init(struct gicv3_its_softc *sc)
{
int i;
for (i = 0; i <= mp_maxid; i++) {
if (CPU_ISSET(i, &sc->sc_cpus) == 0)
continue;
sc->sc_pend_base[i] = (vm_offset_t)contigmalloc(
LPI_PENDTAB_SIZE, M_GICV3_ITS, M_WAITOK | M_ZERO,
0, LPI_PENDTAB_MAX_ADDR, LPI_PENDTAB_ALIGN, 0);
/* Flush so the ITS can see the memory */
cpu_dcache_wb_range(sc->sc_pend_base[i],
LPI_PENDTAB_SIZE);
}
}
static int
its_init_cpu(device_t dev, struct gicv3_its_softc *sc)
{
device_t gicv3;
vm_paddr_t target;
uint64_t xbaser, tmp;
uint32_t ctlr;
u_int cpuid;
int domain;
if (!CPU_ISSET(PCPU_GET(cpuid), &sc->sc_cpus))
return (0);
if (bus_get_domain(dev, &domain) == 0) {
if (PCPU_GET(domain) != domain)
return (0);
}
gicv3 = device_get_parent(dev);
cpuid = PCPU_GET(cpuid);
/* Check if the ITS is enabled on this CPU */
if ((gic_r_read_4(gicv3, GICR_TYPER) & GICR_TYPER_PLPIS) == 0) {
return (ENXIO);
}
/* Disable LPIs */
ctlr = gic_r_read_4(gicv3, GICR_CTLR);
ctlr &= ~GICR_CTLR_LPI_ENABLE;
gic_r_write_4(gicv3, GICR_CTLR, ctlr);
/* Make sure changes are observable by the GIC */
dsb(sy);
/*
* Set the LPI configuration table base (GICR_PROPBASER)
*/
xbaser = vtophys(sc->sc_conf_base) |
(GICR_PROPBASER_SHARE_IS << GICR_PROPBASER_SHARE_SHIFT) |
(GICR_PROPBASER_CACHE_NIWAWB << GICR_PROPBASER_CACHE_SHIFT) |
(flsl(LPI_CONFTAB_SIZE | GIC_FIRST_LPI) - 1);
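/*
 * The IDbits field programmed above is flsl(...) - 1 = 16, i.e.
 * 17-bit interrupt IDs, enough to cover GIC_FIRST_LPI plus the
 * 64K LPIs described by the configuration table.
 */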
gic_r_write_8(gicv3, GICR_PROPBASER, xbaser);
/* Check the cache attributes we set */
tmp = gic_r_read_8(gicv3, GICR_PROPBASER);
if ((tmp & GICR_PROPBASER_SHARE_MASK) !=
(xbaser & GICR_PROPBASER_SHARE_MASK)) {
if ((tmp & GICR_PROPBASER_SHARE_MASK) ==
(GICR_PROPBASER_SHARE_NS << GICR_PROPBASER_SHARE_SHIFT)) {
/* We need to mark as non-cacheable */
xbaser &= ~(GICR_PROPBASER_SHARE_MASK |
GICR_PROPBASER_CACHE_MASK);
/* Non-cacheable */
xbaser |= GICR_PROPBASER_CACHE_NIN <<
GICR_PROPBASER_CACHE_SHIFT;
/* Non-shareable */
xbaser |= GICR_PROPBASER_SHARE_NS <<
GICR_PROPBASER_SHARE_SHIFT;
gic_r_write_8(gicv3, GICR_PROPBASER, xbaser);
}
sc->sc_its_flags |= ITS_FLAGS_LPI_CONF_FLUSH;
}
/*
* Set the LPI pending table base
*/
xbaser = vtophys(sc->sc_pend_base[cpuid]) |
(GICR_PENDBASER_CACHE_NIWAWB << GICR_PENDBASER_CACHE_SHIFT) |
(GICR_PENDBASER_SHARE_IS << GICR_PENDBASER_SHARE_SHIFT);
gic_r_write_8(gicv3, GICR_PENDBASER, xbaser);
tmp = gic_r_read_8(gicv3, GICR_PENDBASER);
if ((tmp & GICR_PENDBASER_SHARE_MASK) ==
(GICR_PENDBASER_SHARE_NS << GICR_PENDBASER_SHARE_SHIFT)) {
/* Clear the cache and shareability bits */
xbaser &= ~(GICR_PENDBASER_CACHE_MASK |
GICR_PENDBASER_SHARE_MASK);
/* Mark as non-shareable */
xbaser |= GICR_PENDBASER_SHARE_NS << GICR_PENDBASER_SHARE_SHIFT;
/* And non-cacheable */
xbaser |= GICR_PENDBASER_CACHE_NIN <<
GICR_PENDBASER_CACHE_SHIFT;
}
/* Enable LPIs */
ctlr = gic_r_read_4(gicv3, GICR_CTLR);
ctlr |= GICR_CTLR_LPI_ENABLE;
gic_r_write_4(gicv3, GICR_CTLR, ctlr);
/* Make sure the GIC has seen everything */
dsb(sy);
if ((gic_its_read_8(sc, GITS_TYPER) & GITS_TYPER_PTA) != 0) {
/* This ITS wants the redistributor physical address */
target = vtophys(gicv3_get_redist_vaddr(dev));
} else {
/* This ITS wants the unique processor number */
target = GICR_TYPER_CPUNUM(gic_r_read_8(gicv3, GICR_TYPER));
}
sc->sc_its_cols[cpuid]->col_target = target;
sc->sc_its_cols[cpuid]->col_id = cpuid;
its_cmd_mapc(dev, sc->sc_its_cols[cpuid], 1);
its_cmd_invall(dev, sc->sc_its_cols[cpuid]);
return (0);
}
static int
gicv3_its_attach(device_t dev)
{
struct gicv3_its_softc *sc;
const char *name;
uint32_t iidr;
int domain, err, i, rid;
sc = device_get_softc(dev);
rid = 0;
sc->sc_its_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (sc->sc_its_res == NULL) {
device_printf(dev, "Could not allocate memory\n");
return (ENXIO);
}
iidr = gic_its_read_4(sc, GITS_IIDR);
for (i = 0; i < nitems(its_quirks); i++) {
if ((iidr & its_quirks[i].iidr_mask) == its_quirks[i].iidr) {
if (bootverbose) {
device_printf(dev, "Applying %s\n",
its_quirks[i].desc);
}
its_quirks[i].func(dev);
break;
}
}
/* Allocate the private tables */
err = gicv3_its_table_init(dev, sc);
if (err != 0)
return (err);
/* Protects access to the device list */
mtx_init(&sc->sc_its_dev_lock, "ITS device lock", NULL, MTX_SPIN);
/* Protects access to the ITS command circular buffer. */
mtx_init(&sc->sc_its_cmd_lock, "ITS cmd lock", NULL, MTX_SPIN);
if (bus_get_domain(dev, &domain) == 0) {
CPU_ZERO(&sc->sc_cpus);
if (domain < MAXMEMDOM)
CPU_COPY(&cpuset_domain[domain], &sc->sc_cpus);
} else {
CPU_COPY(&all_cpus, &sc->sc_cpus);
}
/* Allocate the command circular buffer */
gicv3_its_cmdq_init(sc);
/* Allocate the per-CPU collections */
for (int cpu = 0; cpu <= mp_maxid; cpu++)
if (CPU_ISSET(cpu, &sc->sc_cpus) != 0)
sc->sc_its_cols[cpu] = malloc(
sizeof(*sc->sc_its_cols[0]), M_GICV3_ITS,
M_WAITOK | M_ZERO);
else
sc->sc_its_cols[cpu] = NULL;
/* Enable the ITS */
gic_its_write_4(sc, GITS_CTLR,
gic_its_read_4(sc, GITS_CTLR) | GITS_CTLR_EN);
/* Create the LPI configuration table */
gicv3_its_conftable_init(sc);
/* And the pending tables */
gicv3_its_pendtables_init(sc);
/* Enable LPIs on this CPU */
its_init_cpu(dev, sc);
TAILQ_INIT(&sc->sc_its_dev_list);
/*
* Create the vmem object to allocate INTRNG IRQs from. We try to
* use all IRQs not already used by the GICv3.
* XXX: This assumes there are no other interrupt controllers in the
* system.
*/
sc->sc_irq_alloc = vmem_create("GICv3 ITS IRQs", 0,
gicv3_get_nirqs(dev), 1, 1, M_FIRSTFIT | M_WAITOK);
sc->sc_irqs = malloc(sizeof(*sc->sc_irqs) * sc->sc_irq_length,
M_GICV3_ITS, M_WAITOK | M_ZERO);
name = device_get_nameunit(dev);
for (i = 0; i < sc->sc_irq_length; i++) {
sc->sc_irqs[i].gi_irq = i;
err = intr_isrc_register(&sc->sc_irqs[i].gi_isrc, dev, 0,
"%s,%u", name, i);
}
return (0);
}
static int
gicv3_its_detach(device_t dev)
{
return (ENXIO);
}
static void
its_quirk_cavium_22375(device_t dev)
{
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
sc->sc_its_flags |= ITS_FLAGS_ERRATA_CAVIUM_22375;
}
static void
gicv3_its_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
uint8_t *conf;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
conf = (uint8_t *)sc->sc_conf_base;
conf[girq->gi_irq] &= ~LPI_CONF_ENABLE;
if ((sc->sc_its_flags & ITS_FLAGS_LPI_CONF_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)&conf[girq->gi_irq], 1);
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
its_cmd_inv(dev, girq->gi_its_dev, girq);
}
static void
gicv3_its_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
uint8_t *conf;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
conf = (uint8_t *)sc->sc_conf_base;
conf[girq->gi_irq] |= LPI_CONF_ENABLE;
if ((sc->sc_its_flags & ITS_FLAGS_LPI_CONF_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)&conf[girq->gi_irq], 1);
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
its_cmd_inv(dev, girq->gi_its_dev, girq);
}
static int
gicv3_its_intr(void *arg, uintptr_t irq)
{
struct gicv3_its_softc *sc = arg;
struct gicv3_its_irqsrc *girq;
struct trapframe *tf;
irq -= sc->sc_irq_base;
girq = &sc->sc_irqs[irq];
if (girq == NULL)
panic("gicv3_its_intr: Invalid interrupt %ld",
irq + sc->sc_irq_base);
tf = curthread->td_intr_frame;
intr_isrc_dispatch(&girq->gi_isrc, tf);
return (FILTER_HANDLED);
}
static void
gicv3_its_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
gicv3_its_disable_intr(dev, isrc);
gic_icc_write(EOIR1, girq->gi_irq + sc->sc_irq_base);
}
static void
gicv3_its_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
gicv3_its_enable_intr(dev, isrc);
}
static void
gicv3_its_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
gic_icc_write(EOIR1, girq->gi_irq + sc->sc_irq_base);
}
static int
gicv3_its_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
if (CPU_EMPTY(&isrc->isrc_cpu)) {
sc->gic_irq_cpu = intr_irq_next_cpu(sc->gic_irq_cpu,
&sc->sc_cpus);
CPU_SETOF(sc->gic_irq_cpu, &isrc->isrc_cpu);
}
its_cmd_movi(dev, girq);
return (0);
}
static int
gicv3_its_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
/*
* This should never happen, we only call this function to map
* interrupts found before the controller driver is ready.
*/
panic("gicv3_its_map_intr: Unable to map a MSI interrupt");
}
static int
gicv3_its_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
/* Bind the interrupt to a CPU */
gicv3_its_bind_intr(dev, isrc);
return (0);
}
#ifdef SMP
static void
gicv3_its_init_secondary(device_t dev)
{
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
/*
* This is fatal as otherwise we may bind interrupts to this CPU.
* We need a way to tell the interrupt framework to only bind to a
* subset of given CPUs when it performs the shuffle.
*/
if (its_init_cpu(dev, sc) != 0)
panic("gicv3_its_init_secondary: No usable ITS on CPU%d",
PCPU_GET(cpuid));
}
#endif
static uint32_t
its_get_devid(device_t pci_dev)
{
uintptr_t id;
if (pci_get_id(pci_dev, PCI_ID_MSI, &id) != 0)
panic("its_get_devid: Unable to get the MSI DeviceID");
return (id);
}
static struct its_dev *
its_device_find(device_t dev, device_t child)
{
struct gicv3_its_softc *sc;
struct its_dev *its_dev = NULL;
sc = device_get_softc(dev);
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_FOREACH(its_dev, &sc->sc_its_dev_list, entry) {
if (its_dev->pci_dev == child)
break;
}
mtx_unlock_spin(&sc->sc_its_dev_lock);
return (its_dev);
}
static struct its_dev *
its_device_get(device_t dev, device_t child, u_int nvecs)
{
struct gicv3_its_softc *sc;
struct its_dev *its_dev;
vmem_addr_t irq_base;
size_t esize;
sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
if (its_dev != NULL)
return (its_dev);
its_dev = malloc(sizeof(*its_dev), M_GICV3_ITS, M_NOWAIT | M_ZERO);
if (its_dev == NULL)
return (NULL);
its_dev->pci_dev = child;
its_dev->devid = its_get_devid(child);
its_dev->lpis.lpi_busy = 0;
its_dev->lpis.lpi_num = nvecs;
its_dev->lpis.lpi_free = nvecs;
if (vmem_alloc(sc->sc_irq_alloc, nvecs, M_FIRSTFIT | M_NOWAIT,
&irq_base) != 0) {
free(its_dev, M_GICV3_ITS);
return (NULL);
}
its_dev->lpis.lpi_base = irq_base;
/* Get ITT entry size */
esize = GITS_TYPER_ITTES(gic_its_read_8(sc, GITS_TYPER));
/*
* Allocate ITT for this device.
* PA has to be 256 B aligned. At least two entries for device.
*/
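/*
 * As an example, assuming the common 8-byte ITT entry size, a
 * two-vector device still gets roundup2(2 * 8, 256) = 256 bytes.
 */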
its_dev->itt_size = roundup2(MAX(nvecs, 2) * esize, 256);
its_dev->itt = (vm_offset_t)contigmalloc(its_dev->itt_size,
M_GICV3_ITS, M_NOWAIT | M_ZERO, 0, LPI_INT_TRANS_TAB_MAX_ADDR,
LPI_INT_TRANS_TAB_ALIGN, 0);
if (its_dev->itt == 0) {
vmem_free(sc->sc_irq_alloc, its_dev->lpis.lpi_base, nvecs);
free(its_dev, M_GICV3_ITS);
return (NULL);
}
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_INSERT_TAIL(&sc->sc_its_dev_list, its_dev, entry);
mtx_unlock_spin(&sc->sc_its_dev_lock);
/* Map device to its ITT */
its_cmd_mapd(dev, its_dev, 1);
return (its_dev);
}
static void
its_device_release(device_t dev, struct its_dev *its_dev)
{
struct gicv3_its_softc *sc;
KASSERT(its_dev->lpis.lpi_busy == 0,
("its_device_release: Trying to release an inuse ITS device"));
/* Unmap device in ITS */
its_cmd_mapd(dev, its_dev, 0);
sc = device_get_softc(dev);
/* Remove the device from the list of devices */
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_REMOVE(&sc->sc_its_dev_list, its_dev, entry);
mtx_unlock_spin(&sc->sc_its_dev_lock);
/* Free ITT */
KASSERT(its_dev->itt != 0, ("Invalid ITT in valid ITS device"));
contigfree((void *)its_dev->itt, its_dev->itt_size, M_GICV3_ITS);
/* Free the IRQ allocation */
vmem_free(sc->sc_irq_alloc, its_dev->lpis.lpi_base,
its_dev->lpis.lpi_num);
free(its_dev, M_GICV3_ITS);
}
static int
gicv3_its_alloc_msi(device_t dev, device_t child, int count, int maxcount,
device_t *pic, struct intr_irqsrc **srcs)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
u_int irq;
int i;
its_dev = its_device_get(dev, child, count);
if (its_dev == NULL)
return (ENXIO);
KASSERT(its_dev->lpis.lpi_free >= count,
("gicv3_its_alloc_msi: No free LPIs"));
sc = device_get_softc(dev);
irq = its_dev->lpis.lpi_base + its_dev->lpis.lpi_num -
its_dev->lpis.lpi_free;
for (i = 0; i < count; i++, irq++) {
its_dev->lpis.lpi_free--;
girq = &sc->sc_irqs[irq];
girq->gi_its_dev = its_dev;
srcs[i] = (struct intr_irqsrc *)girq;
}
its_dev->lpis.lpi_busy += count;
*pic = dev;
return (0);
}
static int
gicv3_its_release_msi(device_t dev, device_t child, int count,
struct intr_irqsrc **isrc)
{
- struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
int i;
- sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
KASSERT(its_dev != NULL,
("gicv3_its_release_msi: Releasing a MSI interrupt with "
"no ITS device"));
KASSERT(its_dev->lpis.lpi_busy >= count,
("gicv3_its_release_msi: Releasing more interrupts than "
"were allocated: releasing %d, allocated %d", count,
its_dev->lpis.lpi_busy));
for (i = 0; i < count; i++) {
girq = (struct gicv3_its_irqsrc *)isrc[i];
girq->gi_its_dev = NULL;
}
its_dev->lpis.lpi_busy -= count;
if (its_dev->lpis.lpi_busy == 0)
its_device_release(dev, its_dev);
return (0);
}
static int
gicv3_its_alloc_msix(device_t dev, device_t child, device_t *pic,
struct intr_irqsrc **isrcp)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
u_int nvecs, irq;
nvecs = pci_msix_count(child);
its_dev = its_device_get(dev, child, nvecs);
if (its_dev == NULL)
return (ENXIO);
KASSERT(its_dev->lpis.lpi_free > 0,
("gicv3_its_alloc_msix: No free LPIs"));
sc = device_get_softc(dev);
irq = its_dev->lpis.lpi_base + its_dev->lpis.lpi_num -
its_dev->lpis.lpi_free;
its_dev->lpis.lpi_free--;
its_dev->lpis.lpi_busy++;
girq = &sc->sc_irqs[irq];
girq->gi_its_dev = its_dev;
*pic = dev;
*isrcp = (struct intr_irqsrc *)girq;
return (0);
}
static int
gicv3_its_release_msix(device_t dev, device_t child, struct intr_irqsrc *isrc)
{
- struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
- sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
KASSERT(its_dev != NULL,
("gicv3_its_release_msix: Releasing a MSI-X interrupt with "
"no ITS device"));
KASSERT(its_dev->lpis.lpi_busy > 0,
("gicv3_its_release_msix: Releasing more interrupts than "
"were allocated: allocated %d", its_dev->lpis.lpi_busy));
girq = (struct gicv3_its_irqsrc *)isrc;
girq->gi_its_dev = NULL;
its_dev->lpis.lpi_busy--;
if (its_dev->lpis.lpi_busy == 0)
its_device_release(dev, its_dev);
return (0);
}
static int
gicv3_its_map_msi(device_t dev, device_t child, struct intr_irqsrc *isrc,
uint64_t *addr, uint32_t *data)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
/* Map the message to the given IRQ */
its_cmd_mapti(dev, girq);
*addr = vtophys(rman_get_virtual(sc->sc_its_res)) + GITS_TRANSLATER;
*data = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
return (0);
}
/*
* Commands handling.
*/
static __inline void
cmd_format_command(struct its_cmd *cmd, uint8_t cmd_type)
{
/* Command field: DW0 [7:0] */
cmd->cmd_dword[0] &= htole64(~CMD_COMMAND_MASK);
cmd->cmd_dword[0] |= htole64(cmd_type);
}
static __inline void
cmd_format_devid(struct its_cmd *cmd, uint32_t devid)
{
/* Device ID field: DW0 [63:32] */
cmd->cmd_dword[0] &= htole64(~CMD_DEVID_MASK);
cmd->cmd_dword[0] |= htole64((uint64_t)devid << CMD_DEVID_SHIFT);
}
static __inline void
cmd_format_size(struct its_cmd *cmd, uint16_t size)
{
/* Size field: DW1 [4:0] */
cmd->cmd_dword[1] &= htole64(~CMD_SIZE_MASK);
cmd->cmd_dword[1] |= htole64((size & CMD_SIZE_MASK));
}
static __inline void
cmd_format_id(struct its_cmd *cmd, uint32_t id)
{
/* ID field: DW1 [31:0] */
cmd->cmd_dword[1] &= htole64(~CMD_ID_MASK);
cmd->cmd_dword[1] |= htole64(id);
}
static __inline void
cmd_format_pid(struct its_cmd *cmd, uint32_t pid)
{
/* Physical ID field: DW1 [63:32] */
cmd->cmd_dword[1] &= htole64(~CMD_PID_MASK);
cmd->cmd_dword[1] |= htole64((uint64_t)pid << CMD_PID_SHIFT);
}
static __inline void
cmd_format_col(struct its_cmd *cmd, uint16_t col_id)
{
/* Collection field: DW2 [16:0] */
cmd->cmd_dword[2] &= htole64(~CMD_COL_MASK);
cmd->cmd_dword[2] |= htole64(col_id);
}
static __inline void
cmd_format_target(struct its_cmd *cmd, uint64_t target)
{
/* Target Address field: DW2 [47:16] */
cmd->cmd_dword[2] &= htole64(~CMD_TARGET_MASK);
cmd->cmd_dword[2] |= htole64(target & CMD_TARGET_MASK);
}
static __inline void
cmd_format_itt(struct its_cmd *cmd, uint64_t itt)
{
/* ITT Address field: DW2 [47:8] */
cmd->cmd_dword[2] &= htole64(~CMD_ITT_MASK);
cmd->cmd_dword[2] |= htole64(itt & CMD_ITT_MASK);
}
static __inline void
cmd_format_valid(struct its_cmd *cmd, uint8_t valid)
{
/* Valid field: DW2 [63] */
cmd->cmd_dword[2] &= htole64(~CMD_VALID_MASK);
cmd->cmd_dword[2] |= htole64((uint64_t)valid << CMD_VALID_SHIFT);
}
static inline bool
its_cmd_queue_full(struct gicv3_its_softc *sc)
{
size_t read_idx, next_write_idx;
/* Get the index of the next command */
next_write_idx = (sc->sc_its_cmd_next_idx + 1) %
(ITS_CMDQ_SIZE / sizeof(struct its_cmd));
/* And the index of the current command being read */
read_idx = gic_its_read_4(sc, GITS_CREADR) / sizeof(struct its_cmd);
/*
* The queue is full when the write offset points
* at the command before the current read offset.
*/
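/*
 * With a 64KB queue of 32-byte commands that leaves 2048 slots, of
 * which at most 2047 can be outstanding at any time.
 */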
return (next_write_idx == read_idx);
}
static inline void
its_cmd_sync(struct gicv3_its_softc *sc, struct its_cmd *cmd)
{
if ((sc->sc_its_flags & ITS_FLAGS_CMDQ_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)cmd, sizeof(*cmd));
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
}
static inline uint64_t
its_cmd_cwriter_offset(struct gicv3_its_softc *sc, struct its_cmd *cmd)
{
uint64_t off;
off = (cmd - sc->sc_its_cmd_base) * sizeof(*cmd);
return (off);
}
static void
its_cmd_wait_completion(device_t dev, struct its_cmd *cmd_first,
struct its_cmd *cmd_last)
{
struct gicv3_its_softc *sc;
uint64_t first, last, read;
size_t us_left;
sc = device_get_softc(dev);
/*
* XXX ARM64TODO: This is obviously a significant delay.
* The reason for that is that currently the time frames for
* the command to complete are not known.
*/
us_left = 1000000;
first = its_cmd_cwriter_offset(sc, cmd_first);
last = its_cmd_cwriter_offset(sc, cmd_last);
for (;;) {
read = gic_its_read_8(sc, GITS_CREADR);
if (first < last) {
if (read < first || read >= last)
break;
} else if (read < first && read >= last)
break;
if (us_left-- == 0) {
/* This means timeout */
device_printf(dev,
"Timeout while waiting for CMD completion.\n");
return;
}
DELAY(1);
}
}
static struct its_cmd *
its_cmd_alloc_locked(device_t dev)
{
struct gicv3_its_softc *sc;
struct its_cmd *cmd;
size_t us_left;
sc = device_get_softc(dev);
/*
* XXX ARM64TODO: This is obviously a significant delay.
* The reason for that is that currently the time frames for
* the command to complete (and therefore free the descriptor)
* are not known.
*/
us_left = 1000000;
mtx_assert(&sc->sc_its_cmd_lock, MA_OWNED);
while (its_cmd_queue_full(sc)) {
if (us_left-- == 0) {
/* Timeout while waiting for free command */
device_printf(dev,
"Timeout while waiting for free command\n");
return (NULL);
}
DELAY(1);
}
cmd = &sc->sc_its_cmd_base[sc->sc_its_cmd_next_idx];
sc->sc_its_cmd_next_idx++;
sc->sc_its_cmd_next_idx %= ITS_CMDQ_SIZE / sizeof(struct its_cmd);
return (cmd);
}
static uint64_t
its_cmd_prepare(struct its_cmd *cmd, struct its_cmd_desc *desc)
{
uint64_t target;
uint8_t cmd_type;
u_int size;
- boolean_t error;
- error = FALSE;
cmd_type = desc->cmd_type;
target = ITS_TARGET_NONE;
switch (cmd_type) {
case ITS_CMD_MOVI: /* Move interrupt ID to another collection */
target = desc->cmd_desc_movi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MOVI);
cmd_format_id(cmd, desc->cmd_desc_movi.id);
cmd_format_col(cmd, desc->cmd_desc_movi.col->col_id);
cmd_format_devid(cmd, desc->cmd_desc_movi.its_dev->devid);
break;
case ITS_CMD_SYNC: /* Wait for previous commands completion */
target = desc->cmd_desc_sync.col->col_target;
cmd_format_command(cmd, ITS_CMD_SYNC);
cmd_format_target(cmd, target);
break;
case ITS_CMD_MAPD: /* Assign ITT to device */
cmd_format_command(cmd, ITS_CMD_MAPD);
cmd_format_itt(cmd, vtophys(desc->cmd_desc_mapd.its_dev->itt));
/*
* Size describes number of bits to encode interrupt IDs
* supported by the device minus one.
* When V (valid) bit is zero, this field should be written
* as zero.
*/
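/*
 * For example, a device that was allocated 32 LPIs ends up with
 * fls(32) - 1 = 5 below, i.e. a 6-bit EventID space.
 */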
if (desc->cmd_desc_mapd.valid != 0) {
size = fls(desc->cmd_desc_mapd.its_dev->lpis.lpi_num);
size = MAX(1, size) - 1;
} else
size = 0;
cmd_format_size(cmd, size);
cmd_format_devid(cmd, desc->cmd_desc_mapd.its_dev->devid);
cmd_format_valid(cmd, desc->cmd_desc_mapd.valid);
break;
case ITS_CMD_MAPC: /* Map collection to Re-Distributor */
target = desc->cmd_desc_mapc.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPC);
cmd_format_col(cmd, desc->cmd_desc_mapc.col->col_id);
cmd_format_valid(cmd, desc->cmd_desc_mapc.valid);
cmd_format_target(cmd, target);
break;
case ITS_CMD_MAPTI:
target = desc->cmd_desc_mapvi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPTI);
cmd_format_devid(cmd, desc->cmd_desc_mapvi.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_mapvi.id);
cmd_format_pid(cmd, desc->cmd_desc_mapvi.pid);
cmd_format_col(cmd, desc->cmd_desc_mapvi.col->col_id);
break;
case ITS_CMD_MAPI:
target = desc->cmd_desc_mapi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPI);
cmd_format_devid(cmd, desc->cmd_desc_mapi.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_mapi.pid);
cmd_format_col(cmd, desc->cmd_desc_mapi.col->col_id);
break;
case ITS_CMD_INV:
target = desc->cmd_desc_inv.col->col_target;
cmd_format_command(cmd, ITS_CMD_INV);
cmd_format_devid(cmd, desc->cmd_desc_inv.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_inv.pid);
break;
case ITS_CMD_INVALL:
cmd_format_command(cmd, ITS_CMD_INVALL);
cmd_format_col(cmd, desc->cmd_desc_invall.col->col_id);
break;
default:
panic("its_cmd_prepare: Invalid command: %x", cmd_type);
}
return (target);
}
static int
its_cmd_send(device_t dev, struct its_cmd_desc *desc)
{
struct gicv3_its_softc *sc;
struct its_cmd *cmd, *cmd_sync, *cmd_write;
struct its_col col_sync;
struct its_cmd_desc desc_sync;
uint64_t target, cwriter;
sc = device_get_softc(dev);
mtx_lock_spin(&sc->sc_its_cmd_lock);
cmd = its_cmd_alloc_locked(dev);
if (cmd == NULL) {
device_printf(dev, "could not allocate ITS command\n");
mtx_unlock_spin(&sc->sc_its_cmd_lock);
return (EBUSY);
}
target = its_cmd_prepare(cmd, desc);
its_cmd_sync(sc, cmd);
if (target != ITS_TARGET_NONE) {
cmd_sync = its_cmd_alloc_locked(dev);
if (cmd_sync != NULL) {
desc_sync.cmd_type = ITS_CMD_SYNC;
col_sync.col_target = target;
desc_sync.cmd_desc_sync.col = &col_sync;
its_cmd_prepare(cmd_sync, &desc_sync);
its_cmd_sync(sc, cmd_sync);
}
}
/* Update GITS_CWRITER */
cwriter = sc->sc_its_cmd_next_idx * sizeof(struct its_cmd);
gic_its_write_8(sc, GITS_CWRITER, cwriter);
cmd_write = &sc->sc_its_cmd_base[sc->sc_its_cmd_next_idx];
mtx_unlock_spin(&sc->sc_its_cmd_lock);
its_cmd_wait_completion(dev, cmd, cmd_write);
return (0);
}
/* Handlers to send commands */
static void
its_cmd_movi(device_t dev, struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
sc = device_get_softc(dev);
col = sc->sc_its_cols[CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1];
desc.cmd_type = ITS_CMD_MOVI;
desc.cmd_desc_movi.its_dev = girq->gi_its_dev;
desc.cmd_desc_movi.col = col;
desc.cmd_desc_movi.id = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapc(device_t dev, struct its_col *col, uint8_t valid)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_MAPC;
desc.cmd_desc_mapc.col = col;
/*
* Valid bit set - map the collection.
* Valid bit cleared - unmap the collection.
*/
desc.cmd_desc_mapc.valid = valid;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapti(device_t dev, struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
u_int col_id;
sc = device_get_softc(dev);
col_id = CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1;
col = sc->sc_its_cols[col_id];
desc.cmd_type = ITS_CMD_MAPTI;
desc.cmd_desc_mapvi.its_dev = girq->gi_its_dev;
desc.cmd_desc_mapvi.col = col;
/* The EventID sent to the device */
desc.cmd_desc_mapvi.id = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
/* The physical interrupt presented to software */
desc.cmd_desc_mapvi.pid = girq->gi_irq + sc->sc_irq_base;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapd(device_t dev, struct its_dev *its_dev, uint8_t valid)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_MAPD;
desc.cmd_desc_mapd.its_dev = its_dev;
desc.cmd_desc_mapd.valid = valid;
its_cmd_send(dev, &desc);
}
static void
its_cmd_inv(device_t dev, struct its_dev *its_dev,
struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
sc = device_get_softc(dev);
col = sc->sc_its_cols[CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1];
desc.cmd_type = ITS_CMD_INV;
/* The EventID sent to the device */
desc.cmd_desc_inv.pid = girq->gi_irq - its_dev->lpis.lpi_base;
desc.cmd_desc_inv.its_dev = its_dev;
desc.cmd_desc_inv.col = col;
its_cmd_send(dev, &desc);
}
static void
its_cmd_invall(device_t dev, struct its_col *col)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_INVALL;
desc.cmd_desc_invall.col = col;
its_cmd_send(dev, &desc);
}
#ifdef FDT
static device_probe_t gicv3_its_fdt_probe;
static device_attach_t gicv3_its_fdt_attach;
static device_method_t gicv3_its_fdt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, gicv3_its_fdt_probe),
DEVMETHOD(device_attach, gicv3_its_fdt_attach),
/* End */
DEVMETHOD_END
};
#define its_baseclasses its_fdt_baseclasses
DEFINE_CLASS_1(its, gicv3_its_fdt_driver, gicv3_its_fdt_methods,
sizeof(struct gicv3_its_softc), gicv3_its_driver);
#undef its_baseclasses
static devclass_t gicv3_its_fdt_devclass;
EARLY_DRIVER_MODULE(its, gic, gicv3_its_fdt_driver,
gicv3_its_fdt_devclass, 0, 0, BUS_PASS_INTERRUPT + BUS_PASS_ORDER_MIDDLE);
static int
gicv3_its_fdt_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (!ofw_bus_is_compatible(dev, "arm,gic-v3-its"))
return (ENXIO);
device_set_desc(dev, "ARM GIC Interrupt Translation Service");
return (BUS_PROBE_DEFAULT);
}
static int
gicv3_its_fdt_attach(device_t dev)
{
struct gicv3_its_softc *sc;
phandle_t xref;
int err;
sc = device_get_softc(dev);
sc->sc_irq_length = gicv3_get_nirqs(dev);
sc->sc_irq_base = GIC_FIRST_LPI;
sc->sc_irq_base += device_get_unit(dev) * sc->sc_irq_length;
err = gicv3_its_attach(dev);
if (err != 0)
return (err);
/* Register this device as an interrupt controller */
xref = OF_xref_from_node(ofw_bus_get_node(dev));
sc->sc_pic = intr_pic_register(dev, xref);
intr_pic_add_handler(device_get_parent(dev), sc->sc_pic,
gicv3_its_intr, sc, sc->sc_irq_base, sc->sc_irq_length);
/* Register this device to handle MSI interrupts */
intr_msi_register(dev, xref);
return (0);
}
#endif
Index: head/sys/arm64/arm64/machdep.c
===================================================================
--- head/sys/arm64/arm64/machdep.c (revision 327172)
+++ head/sys/arm64/arm64/machdep.c (revision 327173)
@@ -1,1235 +1,1234 @@
/*-
* Copyright (c) 2014 Andrew Turner
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include "opt_acpi.h"
#include "opt_compat.h"
#include "opt_platform.h"
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/devmap.h>
#include <sys/efi.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vdso.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/debug_monitor.h>
#include <machine/kdb.h>
#include <machine/machdep.h>
#include <machine/metadata.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/undefined.h>
#include <machine/vmparam.h>
#ifdef VFP
#include <machine/vfp.h>
#endif
#ifdef DEV_ACPI
#include <contrib/dev/acpica/include/acpi.h>
#include <machine/acpica_machdep.h>
#endif
#ifdef FDT
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#endif
enum arm64_bus arm64_bus_method = ARM64_BUS_NONE;
struct pcpu __pcpu[MAXCPU];
static struct trapframe proc0_tf;
vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2];
vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2];
int early_boot = 1;
int cold = 1;
long realmem = 0;
long Maxmem = 0;
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t physmap[PHYSMAP_SIZE];
u_int physmap_idx;
struct kva_md_info kmi;
int64_t dcache_line_size; /* The minimum D cache line size */
int64_t icache_line_size; /* The minimum I cache line size */
int64_t idcache_line_size; /* The minimum cache line size */
int64_t dczva_line_size; /* The size of cache line the dc zva zeroes */
int has_pan;
/*
* Physical address of the EFI System Table. Stashed from the metadata hints
* passed into the kernel and used by the EFI code to call runtime services.
*/
vm_paddr_t efi_systbl_phys;
/* pagezero_* implementations are provided in support.S */
void pagezero_simple(void *);
void pagezero_cache(void *);
/* pagezero_simple is default pagezero */
void (*pagezero)(void *p) = pagezero_simple;
static void
pan_setup(void)
{
uint64_t id_aa64mfr1;
id_aa64mfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
if (ID_AA64MMFR1_PAN(id_aa64mfr1) != ID_AA64MMFR1_PAN_NONE)
has_pan = 1;
}
void
pan_enable(void)
{
/*
* The LLVM integrated assembler doesn't understand the PAN
* PSTATE field. Because of this we need to manually create
* the instruction in an asm block. This is equivalent to:
* msr pan, #1
*
* This sets the PAN bit, stopping the kernel from accessing
* memory when userspace can also access it unless the kernel
* uses the userspace load/store instructions.
*/
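/*
* For illustration: the OR below places the immediate in the
* instruction's CRm field, so the word actually emitted is
* 0xd500409f | (0x1 << 8) == 0xd500419f.
*/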
if (has_pan) {
WRITE_SPECIALREG(sctlr_el1,
READ_SPECIALREG(sctlr_el1) & ~SCTLR_SPAN);
__asm __volatile(".inst 0xd500409f | (0x1 << 8)");
}
}
static void
cpu_startup(void *dummy)
{
undef_init();
identify_cpu();
vm_ksubmap_init(&kmi);
bufinit();
vm_pager_bufferinit();
}
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *frame;
frame = td->td_frame;
regs->sp = frame->tf_sp;
regs->lr = frame->tf_lr;
regs->elr = frame->tf_elr;
regs->spsr = frame->tf_spsr;
memcpy(regs->x, frame->tf_x, sizeof(regs->x));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *frame;
frame = td->td_frame;
frame->tf_sp = regs->sp;
frame->tf_lr = regs->lr;
frame->tf_elr = regs->elr;
frame->tf_spsr &= ~PSR_FLAGS;
frame->tf_spsr |= regs->spsr & PSR_FLAGS;
memcpy(frame->tf_x, regs->x, sizeof(frame->tf_x));
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *regs)
{
#ifdef VFP
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
/*
* If we have just been running VFP instructions we will
* need to save the state to memcpy it below.
*/
if (td == curthread)
vfp_save_state(td, pcb);
KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate,
("Called fill_fpregs while the kernel is using the VFP"));
memcpy(regs->fp_q, pcb->pcb_fpustate.vfp_regs,
sizeof(regs->fp_q));
regs->fp_cr = pcb->pcb_fpustate.vfp_fpcr;
regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr;
} else
#endif
memset(regs->fp_q, 0, sizeof(regs->fp_q));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *regs)
{
#ifdef VFP
struct pcb *pcb;
pcb = td->td_pcb;
KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate,
("Called set_fpregs while the kernel is using the VFP"));
memcpy(pcb->pcb_fpustate.vfp_regs, regs->fp_q, sizeof(regs->fp_q));
pcb->pcb_fpustate.vfp_fpcr = regs->fp_cr;
pcb->pcb_fpustate.vfp_fpsr = regs->fp_sr;
#endif
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *regs)
{
printf("ARM64TODO: fill_dbregs");
return (EDOOFUS);
}
int
set_dbregs(struct thread *td, struct dbreg *regs)
{
printf("ARM64TODO: set_dbregs");
return (EDOOFUS);
}
#ifdef COMPAT_FREEBSD32
int
fill_regs32(struct thread *td, struct reg32 *regs)
{
printf("ARM64TODO: fill_regs32");
return (EDOOFUS);
}
int
set_regs32(struct thread *td, struct reg32 *regs)
{
printf("ARM64TODO: set_regs32");
return (EDOOFUS);
}
int
fill_fpregs32(struct thread *td, struct fpreg32 *regs)
{
printf("ARM64TODO: fill_fpregs32");
return (EDOOFUS);
}
int
set_fpregs32(struct thread *td, struct fpreg32 *regs)
{
printf("ARM64TODO: set_fpregs32");
return (EDOOFUS);
}
int
fill_dbregs32(struct thread *td, struct dbreg32 *regs)
{
printf("ARM64TODO: fill_dbregs32");
return (EDOOFUS);
}
int
set_dbregs32(struct thread *td, struct dbreg32 *regs)
{
printf("ARM64TODO: set_dbregs32");
return (EDOOFUS);
}
#endif
int
ptrace_set_pc(struct thread *td, u_long addr)
{
printf("ARM64TODO: ptrace_set_pc");
return (EDOOFUS);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_spsr |= PSR_SS;
td->td_pcb->pcb_flags |= PCB_SINGLE_STEP;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_spsr &= ~PSR_SS;
td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP;
return (0);
}
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
memset(tf, 0, sizeof(struct trapframe));
tf->tf_x[0] = stack;
tf->tf_sp = STACKALIGN(stack);
tf->tf_lr = imgp->entry_addr;
tf->tf_elr = imgp->entry_addr;
}
/* Sanity check these are the same size, they will be memcpy'd to and fro */
CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
sizeof((struct gpregs *)0)->gp_x);
CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
sizeof((struct reg *)0)->x);
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
struct trapframe *tf = td->td_frame;
if (clear_ret & GET_MC_CLEAR_RET) {
mcp->mc_gpregs.gp_x[0] = 0;
mcp->mc_gpregs.gp_spsr = tf->tf_spsr & ~PSR_C;
} else {
mcp->mc_gpregs.gp_x[0] = tf->tf_x[0];
mcp->mc_gpregs.gp_spsr = tf->tf_spsr;
}
memcpy(&mcp->mc_gpregs.gp_x[1], &tf->tf_x[1],
sizeof(mcp->mc_gpregs.gp_x[1]) * (nitems(mcp->mc_gpregs.gp_x) - 1));
mcp->mc_gpregs.gp_sp = tf->tf_sp;
mcp->mc_gpregs.gp_lr = tf->tf_lr;
mcp->mc_gpregs.gp_elr = tf->tf_elr;
return (0);
}
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
struct trapframe *tf = td->td_frame;
uint32_t spsr;
spsr = mcp->mc_gpregs.gp_spsr;
if ((spsr & PSR_M_MASK) != PSR_M_EL0t ||
(spsr & (PSR_AARCH32 | PSR_F | PSR_I | PSR_A | PSR_D)) != 0)
return (EINVAL);
memcpy(tf->tf_x, mcp->mc_gpregs.gp_x, sizeof(tf->tf_x));
tf->tf_sp = mcp->mc_gpregs.gp_sp;
tf->tf_lr = mcp->mc_gpregs.gp_lr;
tf->tf_elr = mcp->mc_gpregs.gp_elr;
tf->tf_spsr = mcp->mc_gpregs.gp_spsr;
return (0);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifdef VFP
struct pcb *curpcb;
critical_enter();
curpcb = curthread->td_pcb;
if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
/*
* If we have just been running VFP instructions we will
* need to save the state to memcpy it below.
*/
vfp_save_state(td, curpcb);
KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
("Called get_fpcontext while the kernel is using the VFP"));
KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
("Non-userspace FPU flags set in get_fpcontext"));
memcpy(mcp->mc_fpregs.fp_q, curpcb->pcb_fpustate.vfp_regs,
sizeof(mcp->mc_fpregs));
mcp->mc_fpregs.fp_cr = curpcb->pcb_fpustate.vfp_fpcr;
mcp->mc_fpregs.fp_sr = curpcb->pcb_fpustate.vfp_fpsr;
mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags;
mcp->mc_flags |= _MC_FP_VALID;
}
critical_exit();
#endif
}
static void
set_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifdef VFP
struct pcb *curpcb;
critical_enter();
if ((mcp->mc_flags & _MC_FP_VALID) != 0) {
curpcb = curthread->td_pcb;
/*
* Discard any vfp state for the current thread, we
* are about to override it.
*/
vfp_discard(td);
KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
("Called set_fpcontext while the kernel is using the VFP"));
memcpy(curpcb->pcb_fpustate.vfp_regs, mcp->mc_fpregs.fp_q,
sizeof(mcp->mc_fpregs));
curpcb->pcb_fpustate.vfp_fpcr = mcp->mc_fpregs.fp_cr;
curpcb->pcb_fpustate.vfp_fpsr = mcp->mc_fpregs.fp_sr;
curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK;
}
critical_exit();
#endif
}
void
cpu_idle(int busy)
{
spinlock_enter();
if (!busy)
cpu_idleclock();
if (!sched_runnable())
__asm __volatile(
"dsb sy \n"
"wfi \n");
if (!busy)
cpu_activeclock();
spinlock_exit();
}
void
cpu_halt(void)
{
/* We should have shutdown by now, if not enter a low power sleep */
intr_disable();
while (1) {
__asm __volatile("wfi");
}
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* ARM64TODO TBD */
}
/* Get current clock frequency for the given CPU ID. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
struct pcpu *pc;
pc = pcpu_find(cpu_id);
if (pc == NULL || rate == NULL)
return (EINVAL);
if (pc->pc_clock == 0)
return (EOPNOTSUPP);
*rate = pc->pc_clock;
return (0);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
register_t daif;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
daif = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_daif = daif;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t daif;
td = curthread;
critical_exit();
daif = td->td_md.md_saved_daif;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(daif);
}
#ifndef _SYS_SYSPROTO_H_
struct sigreturn_args {
ucontext_t *ucp;
};
#endif
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
ucontext_t uc;
int error;
if (uap == NULL)
return (EFAULT);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
return (EFAULT);
error = set_mcontext(td, &uc.uc_mcontext);
if (error != 0)
return (error);
set_fpcontext(td, &uc.uc_mcontext);
/* Restore signal mask. */
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
int i;
for (i = 0; i < PCB_LR; i++)
pcb->pcb_x[i] = tf->tf_x[i];
pcb->pcb_x[PCB_LR] = tf->tf_lr;
pcb->pcb_pc = tf->tf_elr;
pcb->pcb_sp = tf->tf_sp;
}
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct thread *td;
struct proc *p;
struct trapframe *tf;
struct sigframe *fp, frame;
struct sigacts *psp;
struct sysentvec *sysent;
- int code, onstack, sig;
+ int onstack, sig;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
- code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
onstack = sigonstack(tf->tf_sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else {
fp = (struct sigframe *)td->td_frame->tf_sp;
}
/* Make room, keeping the stack aligned */
fp--;
fp = (struct sigframe *)STACKALIGN(fp);
/* Fill in the frame to copy out */
get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
get_fpcontext(td, &frame.sf_uc.uc_mcontext);
frame.sf_si = ksi->ksi_info;
frame.sf_uc.uc_sigmask = *mask;
frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
frame.sf_uc.uc_stack = td->td_sigstk;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(td->td_proc);
/* Copy the sigframe out to the user's stack. */
if (copyout(&frame, fp, sizeof(*fp)) != 0) {
/* Process has trashed its stack. Kill it. */
CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
tf->tf_x[0] = sig;
tf->tf_x[1] = (register_t)&fp->sf_si;
tf->tf_x[2] = (register_t)&fp->sf_uc;
tf->tf_elr = (register_t)catcher;
tf->tf_sp = (register_t)fp;
sysent = p->p_sysent;
if (sysent->sv_sigcode_base != 0)
tf->tf_lr = (register_t)sysent->sv_sigcode_base;
else
tf->tf_lr = (register_t)(sysent->sv_psstrings -
*(sysent->sv_szsigcode));
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr,
tf->tf_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
static void
init_proc0(vm_offset_t kstack)
{
struct pcpu *pcpup = &__pcpu[0];
proc_linkup0(&proc0, &thread0);
thread0.td_kstack = kstack;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1;
thread0.td_pcb->pcb_fpflags = 0;
thread0.td_pcb->pcb_fpusaved = &thread0.td_pcb->pcb_fpustate;
thread0.td_pcb->pcb_vfpcpu = UINT_MAX;
thread0.td_frame = &proc0_tf;
pcpup->pc_curpcb = thread0.td_pcb;
}
typedef struct {
uint32_t type;
uint64_t phys_start;
uint64_t virt_start;
uint64_t num_pages;
uint64_t attr;
} EFI_MEMORY_DESCRIPTOR;
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
u_int i, insert_idx, _physmap_idx;
_physmap_idx = *physmap_idxp;
if (length == 0)
return (1);
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
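/*
* For example (illustrative addresses): with a single existing entry
* 0x40000000-0x80000000, adding 0x80000000-0xc0000000 is not flagged
* as overlapping and is merged by the append case below into one
* 0x40000000-0xc0000000 entry.
*/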
insert_idx = _physmap_idx;
for (i = 0; i <= _physmap_idx; i += 2) {
if (base < physmap[i + 1]) {
if (base + length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= _physmap_idx &&
base + length == physmap[insert_idx]) {
physmap[insert_idx] = base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += length;
return (1);
}
_physmap_idx += 2;
*physmap_idxp = _physmap_idx;
if (_physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = _physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = base;
physmap[insert_idx + 1] = base + length;
return (1);
}
#ifdef FDT
static void
add_fdt_mem_regions(struct mem_region *mr, int mrcnt, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
for (int i = 0; i < mrcnt; i++) {
if (!add_physmap_entry(mr[i].mr_start, mr[i].mr_size, physmap,
physmap_idxp))
break;
}
}
#endif
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
struct efi_md *map, *p;
const char *type;
size_t efisz;
int ndesc, i;
static const char *types[] = {
"Reserved",
"LoaderCode",
"LoaderData",
"BootServicesCode",
"BootServicesData",
"RuntimeServicesCode",
"RuntimeServicesData",
"ConventionalMemory",
"UnusableMemory",
"ACPIReclaimMemory",
"ACPIMemoryNVS",
"MemoryMappedIO",
"MemoryMappedIOPortSpace",
"PalCode",
"PersistentMemory"
};
/*
* Memory map data provided by UEFI via the GetMemoryMap
* Boot Services API.
*/
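/*
* The expression below rounds the header size up to a 16-byte
* boundary; e.g. (illustrative) a 40-byte header gives efisz = 48.
*/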
efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
map = (struct efi_md *)((uint8_t *)efihdr + efisz);
if (efihdr->descriptor_size == 0)
return;
ndesc = efihdr->memory_size / efihdr->descriptor_size;
if (boothowto & RB_VERBOSE)
printf("%23s %12s %12s %8s %4s\n",
"Type", "Physical", "Virtual", "#Pages", "Attr");
for (i = 0, p = map; i < ndesc; i++,
p = efi_next_descriptor(p, efihdr->descriptor_size)) {
if (boothowto & RB_VERBOSE) {
if (p->md_type < nitems(types))
type = types[p->md_type];
else
type = "<INVALID>";
printf("%23s %012lx %12p %08lx ", type, p->md_phys,
p->md_virt, p->md_pages);
if (p->md_attr & EFI_MD_ATTR_UC)
printf("UC ");
if (p->md_attr & EFI_MD_ATTR_WC)
printf("WC ");
if (p->md_attr & EFI_MD_ATTR_WT)
printf("WT ");
if (p->md_attr & EFI_MD_ATTR_WB)
printf("WB ");
if (p->md_attr & EFI_MD_ATTR_UCE)
printf("UCE ");
if (p->md_attr & EFI_MD_ATTR_WP)
printf("WP ");
if (p->md_attr & EFI_MD_ATTR_RP)
printf("RP ");
if (p->md_attr & EFI_MD_ATTR_XP)
printf("XP ");
if (p->md_attr & EFI_MD_ATTR_NV)
printf("NV ");
if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
printf("MORE_RELIABLE ");
if (p->md_attr & EFI_MD_ATTR_RO)
printf("RO ");
if (p->md_attr & EFI_MD_ATTR_RT)
printf("RUNTIME");
printf("\n");
}
switch (p->md_type) {
case EFI_MD_TYPE_CODE:
case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_BS_CODE:
case EFI_MD_TYPE_BS_DATA:
case EFI_MD_TYPE_FREE:
/*
* We're allowed to use any entry with these types.
*/
break;
default:
continue;
}
if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
physmap, physmap_idxp))
break;
}
}
#ifdef FDT
static void
try_load_dtb(caddr_t kmdp)
{
vm_offset_t dtbp;
dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
if (dtbp == (vm_offset_t)NULL) {
printf("ERROR loading DTB\n");
return;
}
if (OF_install(OFW_FDT, 0) == FALSE)
panic("Cannot install FDT");
if (OF_init((void *)dtbp) != 0)
panic("OF_init failed with the found device tree");
}
#endif
static bool
bus_probe(void)
{
bool has_acpi, has_fdt;
char *order, *env;
has_acpi = has_fdt = false;
#ifdef FDT
has_fdt = (OF_peer(0) != 0);
#endif
#ifdef DEV_ACPI
has_acpi = (acpi_find_table(ACPI_SIG_SPCR) != 0);
#endif
env = kern_getenv("kern.cfg.order");
if (env != NULL) {
order = env;
while (order != NULL) {
if (has_acpi &&
strncmp(order, "acpi", 4) == 0 &&
(order[4] == ',' || order[4] == '\0')) {
arm64_bus_method = ARM64_BUS_ACPI;
break;
}
if (has_fdt &&
strncmp(order, "fdt", 3) == 0 &&
(order[3] == ',' || order[3] == '\0')) {
arm64_bus_method = ARM64_BUS_FDT;
break;
}
order = strchr(order, ',');
}
freeenv(env);
/* If we set the bus method it is valid */
if (arm64_bus_method != ARM64_BUS_NONE)
return (true);
}
/* If no order or an invalid order was set use the default */
if (arm64_bus_method == ARM64_BUS_NONE) {
if (has_fdt)
arm64_bus_method = ARM64_BUS_FDT;
else if (has_acpi)
arm64_bus_method = ARM64_BUS_ACPI;
}
/*
* If no option was set the default is valid. Otherwise we have
* still set a default so that cninit() works, and the caller will
* panic to tell the user about the invalid bus setup.
*/
return (env == NULL);
}
static void
cache_setup(void)
{
int dcache_line_shift, icache_line_shift, dczva_line_shift;
uint32_t ctr_el0;
uint32_t dczid_el0;
ctr_el0 = READ_SPECIALREG(ctr_el0);
/* Read the log2 words in each D cache line */
dcache_line_shift = CTR_DLINE_SIZE(ctr_el0);
/* Get the D cache line size */
dcache_line_size = sizeof(int) << dcache_line_shift;
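/*
* e.g. (illustrative) a CTR_EL0.DminLine value of 4 means
* 1 << 4 = 16 words, i.e. a 64-byte D cache line.
*/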
/* And the same for the I cache */
icache_line_shift = CTR_ILINE_SIZE(ctr_el0);
icache_line_size = sizeof(int) << icache_line_shift;
idcache_line_size = MIN(dcache_line_size, icache_line_size);
dczid_el0 = READ_SPECIALREG(dczid_el0);
/* Check if dc zva is not prohibited */
if (dczid_el0 & DCZID_DZP)
dczva_line_size = 0;
else {
/* Same as with above calculations */
dczva_line_shift = DCZID_BS_SIZE(dczid_el0);
dczva_line_size = sizeof(int) << dczva_line_shift;
/* Change pagezero function */
pagezero = pagezero_cache;
}
}
void
initarm(struct arm64_bootparams *abp)
{
struct efi_map_header *efihdr;
struct pcpu *pcpup;
char *env;
#ifdef FDT
struct mem_region mem_regions[FDT_MEM_REGIONS];
int mem_regions_sz;
#endif
vm_offset_t lastaddr;
caddr_t kmdp;
vm_paddr_t mem_len;
bool valid;
int i;
/* Set the module data location */
preload_metadata = (caddr_t)(uintptr_t)(abp->modulep);
/* Find the kernel address */
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0);
#ifdef FDT
try_load_dtb(kmdp);
#endif
efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
/* Find the address to start allocating from */
lastaddr = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
/* Load the physical memory ranges */
physmap_idx = 0;
efihdr = (struct efi_map_header *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_EFI_MAP);
if (efihdr != NULL)
add_efi_map_entries(efihdr, physmap, &physmap_idx);
#ifdef FDT
else {
/* Grab physical memory regions information from device tree. */
if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,
NULL) != 0)
panic("Cannot get physical memory regions");
add_fdt_mem_regions(mem_regions, mem_regions_sz, physmap,
&physmap_idx);
}
#endif
/* Print the memory map */
mem_len = 0;
for (i = 0; i < physmap_idx; i += 2) {
dump_avail[i] = physmap[i];
dump_avail[i + 1] = physmap[i + 1];
mem_len += physmap[i + 1] - physmap[i];
}
dump_avail[i] = 0;
dump_avail[i + 1] = 0;
/* Set the pcpu data, this is needed by pmap_bootstrap */
pcpup = &__pcpu[0];
pcpu_init(pcpup, 0, sizeof(struct pcpu));
/*
* Set the pcpu pointer with a backup in tpidr_el1 to be
* loaded when entering the kernel from userland.
*/
__asm __volatile(
"mov x18, %0 \n"
"msr tpidr_el1, %0" :: "r"(pcpup));
PCPU_SET(curthread, &thread0);
/* Do basic tuning, hz etc */
init_param1();
cache_setup();
pan_setup();
/* Bootstrap enough of pmap to enter the kernel proper */
pmap_bootstrap(abp->kern_l0pt, abp->kern_l1pt,
KERNBASE - abp->kern_delta, lastaddr - KERNBASE);
devmap_bootstrap(0, NULL);
valid = bus_probe();
cninit();
if (!valid)
panic("Invalid bus configuration: %s",
kern_getenv("kern.cfg.order"));
init_proc0(abp->kern_stack);
msgbufinit(msgbufp, msgbufsize);
mutex_init();
init_param2(physmem);
dbg_init();
kdb_init();
pan_enable();
env = kern_getenv("kernelname");
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
early_boot = 0;
}
void
dbg_init(void)
{
/* Clear OS lock */
WRITE_SPECIALREG(OSLAR_EL1, 0);
/* This permits DDB to use debug registers for watchpoints. */
dbg_monitor_init();
/* TODO: Eventually will need to initialize debug registers here. */
}
#ifdef DDB
#include <ddb/ddb.h>
DB_SHOW_COMMAND(specialregs, db_show_spregs)
{
#define PRINT_REG(reg) \
db_printf(__STRING(reg) " = %#016lx\n", READ_SPECIALREG(reg))
PRINT_REG(actlr_el1);
PRINT_REG(afsr0_el1);
PRINT_REG(afsr1_el1);
PRINT_REG(aidr_el1);
PRINT_REG(amair_el1);
PRINT_REG(ccsidr_el1);
PRINT_REG(clidr_el1);
PRINT_REG(contextidr_el1);
PRINT_REG(cpacr_el1);
PRINT_REG(csselr_el1);
PRINT_REG(ctr_el0);
PRINT_REG(currentel);
PRINT_REG(daif);
PRINT_REG(dczid_el0);
PRINT_REG(elr_el1);
PRINT_REG(esr_el1);
PRINT_REG(far_el1);
#if 0
/* ARM64TODO: Enable VFP before reading floating-point registers */
PRINT_REG(fpcr);
PRINT_REG(fpsr);
#endif
PRINT_REG(id_aa64afr0_el1);
PRINT_REG(id_aa64afr1_el1);
PRINT_REG(id_aa64dfr0_el1);
PRINT_REG(id_aa64dfr1_el1);
PRINT_REG(id_aa64isar0_el1);
PRINT_REG(id_aa64isar1_el1);
PRINT_REG(id_aa64pfr0_el1);
PRINT_REG(id_aa64pfr1_el1);
PRINT_REG(id_afr0_el1);
PRINT_REG(id_dfr0_el1);
PRINT_REG(id_isar0_el1);
PRINT_REG(id_isar1_el1);
PRINT_REG(id_isar2_el1);
PRINT_REG(id_isar3_el1);
PRINT_REG(id_isar4_el1);
PRINT_REG(id_isar5_el1);
PRINT_REG(id_mmfr0_el1);
PRINT_REG(id_mmfr1_el1);
PRINT_REG(id_mmfr2_el1);
PRINT_REG(id_mmfr3_el1);
#if 0
/* Missing from llvm */
PRINT_REG(id_mmfr4_el1);
#endif
PRINT_REG(id_pfr0_el1);
PRINT_REG(id_pfr1_el1);
PRINT_REG(isr_el1);
PRINT_REG(mair_el1);
PRINT_REG(midr_el1);
PRINT_REG(mpidr_el1);
PRINT_REG(mvfr0_el1);
PRINT_REG(mvfr1_el1);
PRINT_REG(mvfr2_el1);
PRINT_REG(revidr_el1);
PRINT_REG(sctlr_el1);
PRINT_REG(sp_el0);
PRINT_REG(spsel);
PRINT_REG(spsr_el1);
PRINT_REG(tcr_el1);
PRINT_REG(tpidr_el0);
PRINT_REG(tpidr_el1);
PRINT_REG(tpidrro_el0);
PRINT_REG(ttbr0_el1);
PRINT_REG(ttbr1_el1);
PRINT_REG(vbar_el1);
#undef PRINT_REG
}
DB_SHOW_COMMAND(vtop, db_show_vtop)
{
uint64_t phys;
if (have_addr) {
phys = arm64_address_translate_s1e1r(addr);
db_printf("EL1 physical address reg (read): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e1w(addr);
db_printf("EL1 physical address reg (write): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e0r(addr);
db_printf("EL0 physical address reg (read): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e0w(addr);
db_printf("EL0 physical address reg (write): 0x%016lx\n", phys);
} else
db_printf("show vtop <virt_addr>\n");
}
#endif
Index: head/sys/arm64/arm64/pmap.c
===================================================================
--- head/sys/arm64/arm64/pmap.c (revision 327172)
+++ head/sys/arm64/arm64/pmap.c (revision 327173)
@@ -1,4838 +1,4837 @@
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
* Copyright (c) 2003 Peter Wemm
* All rights reserved.
* Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
* All rights reserved.
* Copyright (c) 2014 Andrew Turner
* All rights reserved.
* Copyright (c) 2014-2016 The FreeBSD Foundation
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department and William Jolitz of UUNET Technologies Inc.
*
* This software was developed by Andrew Turner under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
*/
/*-
* Copyright (c) 2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jake Burkholder,
* Safeport Network Services, and Network Associates Laboratories, the
* Security Research Division of Network Associates, Inc. under
* DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
* CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Manages physical address maps.
*
* Since the information managed by this module is
* also stored by the logical address mapping module,
* this module may throw away valid virtual-to-physical
* mappings at almost any time. However, invalidations
* of virtual-to-physical mappings must be done as
* requested.
*
* In order to cope with hardware architectures which
* make virtual-to-physical map invalidates expensive,
* this module may delay invalidate or reduced protection
* operations until such time as they are actually
* necessary. This module is given full information as
* to which processors are currently using which maps,
* and to when physical maps must be made correct.
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
#define NUL0E L0_ENTRIES
#define NUL1E (NUL0E * NL1PG)
#define NUL2E (NUL1E * NL2PG)
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE extern inline
#endif
#else
#define PMAP_INLINE
#endif
/*
* These are configured by the mair_el1 register. This is set up in locore.S
*/
#define DEVICE_MEMORY 0
#define UNCACHED_MEMORY 1
#define CACHED_MEMORY 2
#ifdef PV_STATS
#define PV_STAT(x) do { x ; } while (0)
#else
#define PV_STAT(x) do { } while (0)
#endif
#define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
#define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)])
#define NPV_LIST_LOCKS MAXCPU
#define PHYS_TO_PV_LIST_LOCK(pa) \
(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
struct rwlock **_lockp = (lockp); \
struct rwlock *_new_lock; \
\
_new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
if (_new_lock != *_lockp) { \
if (*_lockp != NULL) \
rw_wunlock(*_lockp); \
*_lockp = _new_lock; \
rw_wlock(*_lockp); \
} \
} while (0)
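/*
* For example, code that iterates over mappings with different physical
* addresses drops the previously held pv list lock (if any) and takes
* the one hashed from the new address, so at most one pv list lock is
* held through *lockp at a time.
*/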
#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
#define RELEASE_PV_LIST_LOCK(lockp) do { \
struct rwlock **_lockp = (lockp); \
\
if (*_lockp != NULL) { \
rw_wunlock(*_lockp); \
*_lockp = NULL; \
} \
} while (0)
#define VM_PAGE_TO_PV_LIST_LOCK(m) \
PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
struct pmap kernel_pmap_store;
vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
/*
* Data for the pv entry allocation mechanism.
* Updates to pv_invl_gen are protected by the pv_list_locks[]
* elements, but reads are not.
*/
static struct md_page *pv_table;
static struct md_page pv_dummy;
vm_paddr_t dmap_phys_base; /* The start of the dmap region */
vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
#define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
extern pt_entry_t pagetable_dmap[];
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
"Are large page mappings enabled?");
/*
* Data for the pv entry allocation mechanism
*/
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static void free_pv_chunk(struct pv_chunk *pc);
static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
vm_offset_t va);
static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
vm_page_t m, struct rwlock **lockp);
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
/*
* These load the old table data and store the new value.
* They need to be atomic as the System MMU may write to the table at
* the same time as the CPU.
*/
#define pmap_load_store(table, entry) atomic_swap_64(table, entry)
#define pmap_set(table, mask) atomic_set_64(table, mask)
#define pmap_load_clear(table) atomic_swap_64(table, 0)
#define pmap_load(table) (*table)
/********************/
/* Inline functions */
/********************/
static __inline void
pagecopy(void *s, void *d)
{
memcpy(d, s, PAGE_SIZE);
}
static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{
return (&pmap->pm_l0[pmap_l0_index(va)]);
}
static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
pd_entry_t *l1;
l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
return (&l1[pmap_l1_index(va)]);
}
static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *l0;
l0 = pmap_l0(pmap, va);
if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
return (NULL);
return (pmap_l0_to_l1(l0, va));
}
static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
pd_entry_t *l2;
l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
return (&l2[pmap_l2_index(va)]);
}
static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *l1;
l1 = pmap_l1(pmap, va);
if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
return (NULL);
return (pmap_l1_to_l2(l1, va));
}
static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
pt_entry_t *l3;
l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
return (&l3[pmap_l3_index(va)]);
}
/*
* Returns the lowest valid pde for a given virtual address.
* The next level may or may not point to a valid page or block.
*/
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
pd_entry_t *l0, *l1, *l2, desc;
l0 = pmap_l0(pmap, va);
desc = pmap_load(l0) & ATTR_DESCR_MASK;
if (desc != L0_TABLE) {
*level = -1;
return (NULL);
}
l1 = pmap_l0_to_l1(l0, va);
desc = pmap_load(l1) & ATTR_DESCR_MASK;
if (desc != L1_TABLE) {
*level = 0;
return (l0);
}
l2 = pmap_l1_to_l2(l1, va);
desc = pmap_load(l2) & ATTR_DESCR_MASK;
if (desc != L2_TABLE) {
*level = 1;
return (l1);
}
*level = 2;
return (l2);
}
/*
* Returns the lowest valid pte block or table entry for a given virtual
* address. If there are no valid entries return NULL and set the level to
* the first invalid level.
*/
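/*
* For example (illustrative, with a 4K granule): pmap_pte() on a VA
* covered by a 2MB L2 block returns the L2 entry with *level set to 2,
* while a VA backed by a 4KB page returns the L3 entry with *level
* set to 3.
*/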
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
pd_entry_t *l1, *l2, desc;
pt_entry_t *l3;
l1 = pmap_l1(pmap, va);
if (l1 == NULL) {
*level = 0;
return (NULL);
}
desc = pmap_load(l1) & ATTR_DESCR_MASK;
if (desc == L1_BLOCK) {
*level = 1;
return (l1);
}
if (desc != L1_TABLE) {
*level = 1;
return (NULL);
}
l2 = pmap_l1_to_l2(l1, va);
desc = pmap_load(l2) & ATTR_DESCR_MASK;
if (desc == L2_BLOCK) {
*level = 2;
return (l2);
}
if (desc != L2_TABLE) {
*level = 2;
return (NULL);
}
*level = 3;
l3 = pmap_l2_to_l3(l2, va);
if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
return (NULL);
return (l3);
}
static inline bool
pmap_superpages_enabled(void)
{
return (superpages_enabled != 0);
}
bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
pd_entry_t **l2, pt_entry_t **l3)
{
pd_entry_t *l0p, *l1p, *l2p;
if (pmap->pm_l0 == NULL)
return (false);
l0p = pmap_l0(pmap, va);
*l0 = l0p;
if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
return (false);
l1p = pmap_l0_to_l1(l0p, va);
*l1 = l1p;
if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
*l2 = NULL;
*l3 = NULL;
return (true);
}
if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
return (false);
l2p = pmap_l1_to_l2(l1p, va);
*l2 = l2p;
if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
*l3 = NULL;
return (true);
}
*l3 = pmap_l2_to_l3(l2p, va);
return (true);
}
static __inline int
pmap_l3_valid(pt_entry_t l3)
{
return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}
CTASSERT(L1_BLOCK == L2_BLOCK);
/*
* Checks if the page is dirty. We currently lack proper tracking of this on
* arm64, so for now assume a page mapped rw is dirty once it has been accessed.
*/
static inline int
pmap_page_dirty(pt_entry_t pte)
{
return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
(ATTR_AF | ATTR_AP(ATTR_AP_RW)));
}
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
pmap->pm_stats.resident_count += count;
}
static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(pmap->pm_stats.resident_count >= count,
("pmap %p resident count underflow %ld %d", pmap,
pmap->pm_stats.resident_count, count));
pmap->pm_stats.resident_count -= count;
}
static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
u_int *l2_slot)
{
pt_entry_t *l2;
pd_entry_t *l1;
l1 = (pd_entry_t *)l1pt;
*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
/* Check that locore has used an L1 table mapping */
KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
("Invalid bootstrap L1 table"));
/* Find the address of the L2 table */
l2 = (pt_entry_t *)init_pt_va;
*l2_slot = pmap_l2_index(va);
return (l2);
}
static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
u_int l1_slot, l2_slot;
pt_entry_t *l2;
l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
}
static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
vm_offset_t va;
vm_paddr_t pa;
u_int l1_slot;
pa = dmap_phys_base = min_pa & ~L1_OFFSET;
va = DMAP_MIN_ADDRESS;
for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
pmap_load_store(&pagetable_dmap[l1_slot],
(pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
}
/* Set the upper limit of the DMAP region */
dmap_phys_max = pa;
dmap_max_addr = va;
cpu_tlb_flushID();
}
static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
{
vm_offset_t l2pt;
vm_paddr_t pa;
pd_entry_t *l1;
u_int l1_slot;
KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
l1 = (pd_entry_t *)l1pt;
l1_slot = pmap_l1_index(va);
l2pt = l2_start;
for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
pa = pmap_early_vtophys(l1pt, l2pt);
pmap_load_store(&l1[l1_slot],
(pa & ~Ln_TABLE_MASK) | L1_TABLE);
l2pt += PAGE_SIZE;
}
/* Clean the L2 page table */
memset((void *)l2_start, 0, l2pt - l2_start);
return (l2pt);
}
static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
- vm_offset_t l2pt, l3pt;
+ vm_offset_t l3pt;
vm_paddr_t pa;
pd_entry_t *l2;
u_int l2_slot;
KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
l2 = pmap_l2(kernel_pmap, va);
l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
- l2pt = (vm_offset_t)l2;
l2_slot = pmap_l2_index(va);
l3pt = l3_start;
for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
pa = pmap_early_vtophys(l1pt, l3pt);
pmap_load_store(&l2[l2_slot],
(pa & ~Ln_TABLE_MASK) | L2_TABLE);
l3pt += PAGE_SIZE;
}
/* Clean the L3 page table */
memset((void *)l3_start, 0, l3pt - l3_start);
return (l3pt);
}
/*
* Bootstrap the system enough to run with virtual memory.
*/
void
pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
vm_size_t kernlen)
{
u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot;
uint64_t kern_delta;
pt_entry_t *l2;
vm_offset_t va, freemempos;
vm_offset_t dpcpu, msgbufpv;
vm_paddr_t pa, max_pa, min_pa;
int i;
kern_delta = KERNBASE - kernstart;
physmem = 0;
printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
printf("%lx\n", l1pt);
printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
/* Set this early so we can use the pagetable walking functions */
kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
PMAP_LOCK_INIT(kernel_pmap);
/* Assume the address we were loaded to is a valid physical address */
min_pa = max_pa = KERNBASE - kern_delta;
/*
* Find the minimum physical address. physmap is sorted,
* but may contain empty ranges.
*/
for (i = 0; i < (physmap_idx * 2); i += 2) {
if (physmap[i] == physmap[i + 1])
continue;
if (physmap[i] <= min_pa)
min_pa = physmap[i];
if (physmap[i + 1] > max_pa)
max_pa = physmap[i + 1];
}
/* Create a direct map region early so we can use it for pa -> va */
pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
va = KERNBASE;
pa = KERNBASE - kern_delta;
/*
* Start to initialise phys_avail by copying from physmap
* up to the physical address KERNBASE points at.
*/
map_slot = avail_slot = 0;
for (; map_slot < (physmap_idx * 2) &&
avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) {
if (physmap[map_slot] == physmap[map_slot + 1])
continue;
if (physmap[map_slot] <= pa &&
physmap[map_slot + 1] > pa)
break;
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
/* Add the memory before the kernel */
if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) {
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = pa;
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
used_map_slot = map_slot;
/*
* Read the page table to find out what is already mapped.
* This assumes we have mapped a block of memory from KERNBASE
* using a single L1 entry.
*/
l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
/* Sanity check the index, KERNBASE should be the first VA */
KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
/* Find how many pages we have mapped */
for (; l2_slot < Ln_ENTRIES; l2_slot++) {
if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
break;
/* Check locore used L2 blocks */
KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
("Invalid bootstrap L2 table"));
KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
("Incorrect PA in L2 table"));
va += L2_SIZE;
pa += L2_SIZE;
}
va = roundup2(va, L1_SIZE);
freemempos = KERNBASE + kernlen;
freemempos = roundup2(freemempos, PAGE_SIZE);
/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
/* And the l3 tables for the early devmap */
freemempos = pmap_bootstrap_l3(l1pt,
VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
cpu_tlb_flushID();
#define alloc_pages(var, np) \
(var) = freemempos; \
freemempos += (np * PAGE_SIZE); \
memset((char *)(var), 0, ((np) * PAGE_SIZE));
/* Allocate dynamic per-cpu area. */
alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
dpcpu_init((void *)dpcpu, 0);
/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
msgbufp = (void *)msgbufpv;
virtual_avail = roundup2(freemempos, L1_SIZE);
virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
kernel_vm_end = virtual_avail;
pa = pmap_early_vtophys(l1pt, freemempos);
/* Finish initialising physmap */
map_slot = used_map_slot;
for (; avail_slot < (PHYS_AVAIL_SIZE - 2) &&
map_slot < (physmap_idx * 2); map_slot += 2) {
if (physmap[map_slot] == physmap[map_slot + 1])
continue;
/* Have we used the current range? */
if (physmap[map_slot + 1] <= pa)
continue;
/* Do we need to split the entry? */
if (physmap[map_slot] < pa) {
phys_avail[avail_slot] = pa;
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
} else {
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
}
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
phys_avail[avail_slot] = 0;
phys_avail[avail_slot + 1] = 0;
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage".
*/
Maxmem = atop(phys_avail[avail_slot - 1]);
cpu_tlb_flushID();
}
/*
* Initialize a vm_page's machine-dependent fields.
*/
void
pmap_page_init(vm_page_t m)
{
TAILQ_INIT(&m->md.pv_list);
m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}
/*
* Initialize the pmap module.
* Called by vm_init, to initialize any structures that the pmap
* system needs to map virtual memory.
*/
void
pmap_init(void)
{
vm_size_t s;
int i, pv_npg;
/*
* Are large page mappings enabled?
*/
TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
/*
* Initialize the pv chunk list mutex.
*/
mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
/*
* Initialize the pool of pv list locks.
*/
for (i = 0; i < NPV_LIST_LOCKS; i++)
rw_init(&pv_list_locks[i], "pmap pv list");
/*
* Calculate the size of the pv head table for superpages.
*/
pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
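/*
* e.g. (illustrative): with the last physical segment ending at 4GB and
* an L2_SIZE of 2MB, this gives pv_npg = 2048 head entries.
*/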
/*
* Allocate memory for the pv head table for superpages.
*/
s = (vm_size_t)(pv_npg * sizeof(struct md_page));
s = round_page(s);
pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
M_WAITOK | M_ZERO);
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
TAILQ_INIT(&pv_dummy.pv_list);
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
"2MB page mapping counters");
static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
&pmap_l2_demotions, 0, "2MB page demotions");
static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
&pmap_l2_p_failures, 0, "2MB page promotion failures");
static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
&pmap_l2_promotions, 0, "2MB page promotions");
/*
* Invalidate a single TLB entry.
*/
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
sched_pin();
__asm __volatile(
"dsb ishst \n"
"tlbi vaae1is, %0 \n"
"dsb ish \n"
"isb \n"
: : "r"(va >> PAGE_SHIFT));
sched_unpin();
}
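/*
 * Note on the sequence above (added commentary, not in the original source):
 * "dsb ishst" makes the preceding page table store visible before the
 * invalidate, "tlbi vaae1is" broadcasts an invalidate for the VA to the
 * inner-shareable domain across all ASIDs, "dsb ish" waits for the
 * invalidate to complete, and "isb" resynchronizes the instruction stream.
 */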
static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
vm_offset_t addr;
sched_pin();
dsb(ishst);
for (addr = sva; addr < eva; addr += PAGE_SIZE) {
__asm __volatile(
"tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
}
__asm __volatile(
"dsb ish \n"
"isb \n");
sched_unpin();
}
static __inline void
pmap_invalidate_all(pmap_t pmap)
{
sched_pin();
__asm __volatile(
"dsb ishst \n"
"tlbi vmalle1is \n"
"dsb ish \n"
"isb \n");
sched_unpin();
}
/*
* Routine: pmap_extract
* Function:
* Extract the physical page address associated
* with the given map/virtual_address pair.
*/
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
pt_entry_t *pte, tpte;
vm_paddr_t pa;
int lvl;
pa = 0;
PMAP_LOCK(pmap);
/*
* Find the block or page map for this virtual address. pmap_pte
* will return either a valid block/page entry, or NULL.
*/
pte = pmap_pte(pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
pa = tpte & ~ATTR_MASK;
switch(lvl) {
case 1:
KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_extract: Invalid L1 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L1_OFFSET);
break;
case 2:
KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_extract: Invalid L2 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L2_OFFSET);
break;
case 3:
KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
("pmap_extract: Invalid L3 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L3_OFFSET);
break;
}
}
PMAP_UNLOCK(pmap);
return (pa);
}
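/*
 * Illustrative aside, not part of the original source: a stand-alone sketch
 * of how pmap_extract() above rebuilds a physical address from a block or
 * page descriptor plus the VA offset.  The mask values are assumptions for
 * a 4 KB translation granule, not taken from this file's headers.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_L1_OFFSET	((1ULL << 30) - 1)	/* offset within a 1 GB block */
#define SK_L2_OFFSET	((1ULL << 21) - 1)	/* offset within a 2 MB block */
#define SK_L3_OFFSET	((1ULL << 12) - 1)	/* offset within a 4 KB page */
#define SK_ATTR_MASK	0xfff0000000000fffULL	/* upper/lower attribute bits */

int
main(void)
{
	uint64_t tpte = 0x0040000089400fd1ULL;	/* made-up L2 block entry */
	uint64_t va = 0xffff000000123456ULL;	/* made-up virtual address */
	uint64_t pa;

	/* The level-2 case from the switch above: pa |= (va & L2_OFFSET). */
	pa = (tpte & ~SK_ATTR_MASK) | (va & SK_L2_OFFSET);
	printf("pa = 0x%016llx\n", (unsigned long long)pa);
	return (0);
}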
/*
* Routine: pmap_extract_and_hold
* Function:
* Atomically extract and hold the physical page
* with the given pmap and virtual address pair
* if that mapping permits the given protection.
*/
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
pt_entry_t *pte, tpte;
vm_offset_t off;
vm_paddr_t pa;
vm_page_t m;
int lvl;
pa = 0;
m = NULL;
PMAP_LOCK(pmap);
retry:
pte = pmap_pte(pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
KASSERT(lvl > 0 && lvl <= 3,
("pmap_extract_and_hold: Invalid level %d", lvl));
CTASSERT(L1_BLOCK == L2_BLOCK);
KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
(lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
tpte & ATTR_DESCR_MASK));
if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
((prot & VM_PROT_WRITE) == 0)) {
switch(lvl) {
case 1:
off = va & L1_OFFSET;
break;
case 2:
off = va & L2_OFFSET;
break;
case 3:
default:
off = 0;
}
if (vm_page_pa_tryrelock(pmap,
(tpte & ~ATTR_MASK) | off, &pa))
goto retry;
m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
vm_page_hold(m);
}
}
PA_UNLOCK_COND(pa);
PMAP_UNLOCK(pmap);
return (m);
}
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
pt_entry_t *pte, tpte;
vm_paddr_t pa;
int lvl;
if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
pa = DMAP_TO_PHYS(va);
} else {
pa = 0;
pte = pmap_pte(kernel_pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
pa = tpte & ~ATTR_MASK;
switch(lvl) {
case 1:
KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_kextract: Invalid L1 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L1_OFFSET);
break;
case 2:
KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_kextract: Invalid L2 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L2_OFFSET);
break;
case 3:
KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
("pmap_kextract: Invalid L3 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L3_OFFSET);
break;
}
}
}
return (pa);
}
/***************************************************
* Low level mapping routines.....
***************************************************/
static void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
pd_entry_t *pde;
pt_entry_t *pte, attr;
vm_offset_t va;
int lvl;
KASSERT((pa & L3_OFFSET) == 0,
("pmap_kenter: Invalid physical address"));
KASSERT((sva & L3_OFFSET) == 0,
("pmap_kenter: Invalid virtual address"));
KASSERT((size & PAGE_MASK) == 0,
("pmap_kenter: Mapping is not page-sized"));
attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
if (mode == DEVICE_MEMORY)
attr |= ATTR_XN;
va = sva;
while (size != 0) {
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_kenter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
pte = pmap_l2_to_l3(pde, va);
pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
va += PAGE_SIZE;
pa += PAGE_SIZE;
size -= PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
pmap_kenter(sva, size, pa, DEVICE_MEMORY);
}
/*
* Remove a page from the kernel pagetables.
*/
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
pt_entry_t *pte;
int lvl;
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
pmap_load_clear(pte);
pmap_invalidate_page(kernel_pmap, va);
}
void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
pt_entry_t *pte;
vm_offset_t va;
int lvl;
KASSERT((sva & L3_OFFSET) == 0,
("pmap_kremove_device: Invalid virtual address"));
KASSERT((size & PAGE_MASK) == 0,
("pmap_kremove_device: Mapping is not page-sized"));
va = sva;
while (size != 0) {
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
KASSERT(lvl == 3,
("Invalid device pagetable level: %d != 3", lvl));
pmap_load_clear(pte);
va += PAGE_SIZE;
size -= PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/*
* Used to map a range of physical addresses into kernel
* virtual address space.
*
* The value passed in '*virt' is a suggested virtual address for
* the mapping. Architectures which can support a direct-mapped
* physical to virtual region can return the appropriate address
* within that region, leaving '*virt' unchanged. Other
* architectures should map the pages starting at '*virt' and
* update '*virt' with the first usable address after the mapped
* region.
*/
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
return PHYS_TO_DMAP(start);
}
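/*
 * Added commentary (not in the original source): all physical memory is
 * covered by the direct map on this architecture, so the implementation
 * simply returns the DMAP address for 'start' and leaves '*virt' untouched,
 * following the contract described above.
 */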
/*
* Add a list of wired pages to the kva
* this routine is only used for temporary
* kernel mappings that do not need to have
* page modification or references recorded.
* Note that old mappings are simply written
* over. The page *must* be wired.
* Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
pd_entry_t *pde;
pt_entry_t *pte, pa;
vm_offset_t va;
vm_page_t m;
int i, lvl;
va = sva;
for (i = 0; i < count; i++) {
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_qenter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2,
("pmap_qenter: Invalid level %d", lvl));
m = ma[i];
pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
if (m->md.pv_memattr == DEVICE_MEMORY)
pa |= ATTR_XN;
pte = pmap_l2_to_l3(pde, va);
pmap_load_store(pte, pa);
va += L3_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/*
* This routine tears out page mappings from the
* kernel -- it is meant only for temporary mappings.
*/
void
pmap_qremove(vm_offset_t sva, int count)
{
pt_entry_t *pte;
vm_offset_t va;
int lvl;
KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
va = sva;
while (count-- > 0) {
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(lvl == 3,
("Invalid device pagetable level: %d != 3", lvl));
if (pte != NULL) {
pmap_load_clear(pte);
}
va += PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/***************************************************
* Page table page management routines.....
***************************************************/
static __inline void
pmap_free_zero_pages(struct spglist *free)
{
vm_page_t m;
while ((m = SLIST_FIRST(free)) != NULL) {
SLIST_REMOVE_HEAD(free, plinks.s.ss);
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
}
/*
* Schedule the specified unused page table page to be freed. Specifically,
* add the page to the specified list of pages that will be released to the
* physical memory manager after the TLB has been updated.
*/
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
boolean_t set_PG_ZERO)
{
if (set_PG_ZERO)
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}
/*
* Decrements a page table page's wire count, which is used to record the
* number of valid page table entries within the page. If the wire count
* drops to zero, then the page table page is unmapped. Returns TRUE if the
* page table page was unmapped and FALSE otherwise.
*/
static inline boolean_t
pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
--m->wire_count;
if (m->wire_count == 0) {
_pmap_unwire_l3(pmap, va, m, free);
return (TRUE);
} else
return (FALSE);
}
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* unmap the page table page
*/
if (m->pindex >= (NUL2E + NUL1E)) {
/* l1 page */
pd_entry_t *l0;
l0 = pmap_l0(pmap, va);
pmap_load_clear(l0);
} else if (m->pindex >= NUL2E) {
/* l2 page */
pd_entry_t *l1;
l1 = pmap_l1(pmap, va);
pmap_load_clear(l1);
} else {
/* l3 page */
pd_entry_t *l2;
l2 = pmap_l2(pmap, va);
pmap_load_clear(l2);
}
pmap_resident_count_dec(pmap, 1);
if (m->pindex < NUL2E) {
/* We just released an l3, unhold the matching l2 */
pd_entry_t *l1, tl1;
vm_page_t l2pg;
l1 = pmap_l1(pmap, va);
tl1 = pmap_load(l1);
l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
pmap_unwire_l3(pmap, va, l2pg, free);
} else if (m->pindex < (NUL2E + NUL1E)) {
/* We just released an l2, unhold the matching l1 */
pd_entry_t *l0, tl0;
vm_page_t l1pg;
l0 = pmap_l0(pmap, va);
tl0 = pmap_load(l0);
l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
pmap_unwire_l3(pmap, va, l1pg, free);
}
pmap_invalidate_page(pmap, va);
/*
* This is a release store so that the ordinary store unmapping
* the page table page is globally performed before TLB shoot-
* down is begun.
*/
atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
/*
* Put page on a list so that it is released after
* *ALL* TLB shootdown is done
*/
pmap_add_delayed_free_list(m, free, TRUE);
}
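/*
 * Added commentary (not in the original source): page table pages are
 * distinguished by their pindex.  Indices in [0, NUL2E) are L3 tables,
 * indices in [NUL2E, NUL2E + NUL1E) are L2 tables, and anything above that
 * is an L1 table.  That is why _pmap_unwire_l3() above and _pmap_alloc_l3()
 * below compare the pindex against NUL2E and NUL2E + NUL1E to decide which
 * level of the parent table to update.
 */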
/*
* After removing a page table entry, this routine is used to
* conditionally free the page, and manage the hold/wire counts.
*/
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
struct spglist *free)
{
vm_page_t mpte;
if (va >= VM_MAXUSER_ADDRESS)
return (0);
KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
return (pmap_unwire_l3(pmap, va, mpte, free));
}
void
pmap_pinit0(pmap_t pmap)
{
PMAP_LOCK_INIT(pmap);
bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
pmap->pm_l0 = kernel_pmap->pm_l0;
pmap->pm_root.rt_root = 0;
}
int
pmap_pinit(pmap_t pmap)
{
vm_paddr_t l0phys;
vm_page_t l0pt;
/*
* allocate the l0 page
*/
while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
VM_WAIT;
l0phys = VM_PAGE_TO_PHYS(l0pt);
pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
if ((l0pt->flags & PG_ZERO) == 0)
pagezero(pmap->pm_l0);
pmap->pm_root.rt_root = 0;
bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
return (1);
}
/*
* This routine is called if the desired page table page does not exist.
*
* If page table page allocation fails, this routine may sleep before
* returning NULL. It sleeps only if a lock pointer was given.
*
* Note: If a page allocation fails at page table level two or three,
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
*/
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
vm_page_t m, l1pg, l2pg;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* Allocate a page table page.
*/
if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
if (lockp != NULL) {
RELEASE_PV_LIST_LOCK(lockp);
PMAP_UNLOCK(pmap);
VM_WAIT;
PMAP_LOCK(pmap);
}
/*
* Indicate the need to retry. While waiting, the page table
* page may have been allocated.
*/
return (NULL);
}
if ((m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
/*
* Map the pagetable page into the process address space, if
* it isn't already there.
*/
if (ptepindex >= (NUL2E + NUL1E)) {
pd_entry_t *l0;
vm_pindex_t l0index;
l0index = ptepindex - (NUL2E + NUL1E);
l0 = &pmap->pm_l0[l0index];
pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
} else if (ptepindex >= NUL2E) {
vm_pindex_t l0index, l1index;
pd_entry_t *l0, *l1;
pd_entry_t tl0;
l1index = ptepindex - NUL2E;
l0index = l1index >> L0_ENTRIES_SHIFT;
l0 = &pmap->pm_l0[l0index];
tl0 = pmap_load(l0);
if (tl0 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
lockp) == NULL) {
--m->wire_count;
/* XXX: release mem barrier? */
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
} else {
l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
l1pg->wire_count++;
}
l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
l1 = &l1[ptepindex & Ln_ADDR_MASK];
pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
} else {
vm_pindex_t l0index, l1index;
pd_entry_t *l0, *l1, *l2;
pd_entry_t tl0, tl1;
l1index = ptepindex >> Ln_ENTRIES_SHIFT;
l0index = l1index >> L0_ENTRIES_SHIFT;
l0 = &pmap->pm_l0[l0index];
tl0 = pmap_load(l0);
if (tl0 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + l1index,
lockp) == NULL) {
--m->wire_count;
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
tl0 = pmap_load(l0);
l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
l1 = &l1[l1index & Ln_ADDR_MASK];
} else {
l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
l1 = &l1[l1index & Ln_ADDR_MASK];
tl1 = pmap_load(l1);
if (tl1 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + l1index,
lockp) == NULL) {
--m->wire_count;
/* XXX: release mem barrier? */
atomic_subtract_int(
&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
} else {
l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
l2pg->wire_count++;
}
}
l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
l2 = &l2[ptepindex & Ln_ADDR_MASK];
pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
}
pmap_resident_count_inc(pmap, 1);
return (m);
}
static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t ptepindex;
pd_entry_t *pde, tpde;
#ifdef INVARIANTS
pt_entry_t *pte;
#endif
vm_page_t m;
int lvl;
/*
* Calculate pagetable page index
*/
ptepindex = pmap_l2_pindex(va);
retry:
/*
* Get the page directory entry
*/
pde = pmap_pde(pmap, va, &lvl);
/*
* If the page table page is mapped, we just increment the hold count,
* and activate it. If we get a level 2 pde it will point to a level 3
* table.
*/
switch (lvl) {
case -1:
break;
case 0:
#ifdef INVARIANTS
pte = pmap_l0_to_l1(pde, va);
KASSERT(pmap_load(pte) == 0,
("pmap_alloc_l3: TODO: l0 superpages"));
#endif
break;
case 1:
#ifdef INVARIANTS
pte = pmap_l1_to_l2(pde, va);
KASSERT(pmap_load(pte) == 0,
("pmap_alloc_l3: TODO: l1 superpages"));
#endif
break;
case 2:
tpde = pmap_load(pde);
if (tpde != 0) {
m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
m->wire_count++;
return (m);
}
break;
default:
panic("pmap_alloc_l3: Invalid level %d", lvl);
}
/*
* Here if the pte page isn't mapped, or if it has been deallocated.
*/
m = _pmap_alloc_l3(pmap, ptepindex, lockp);
if (m == NULL && lockp != NULL)
goto retry;
return (m);
}
/***************************************************
* Pmap allocation/deallocation routines.
***************************************************/
/*
* Release any resources held by the given physical map.
* Called when a pmap initialized by pmap_pinit is being released.
* Should only be called if the map contains no valid mappings.
*/
void
pmap_release(pmap_t pmap)
{
vm_page_t m;
KASSERT(pmap->pm_stats.resident_count == 0,
("pmap_release: pmap resident count %ld != 0",
pmap->pm_stats.resident_count));
KASSERT(vm_radix_is_empty(&pmap->pm_root),
("pmap_release: pmap has reserved page table page(s)"));
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
m->wire_count--;
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
}
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
0, 0, kvm_size, "LU", "Size of KVM");
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
0, 0, kvm_free, "LU", "Amount of KVM free");
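/*
 * Illustrative usage note (added commentary, not part of the original
 * source):
 *
 *   sysctl vm.kvm_size vm.kvm_free
 *
 * reports the size of the kernel virtual address range and how much of it
 * pmap_growkernel() has not yet grown into.
 */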
/*
* grow the number of kernel page table entries, if needed
*/
void
pmap_growkernel(vm_offset_t addr)
{
vm_paddr_t paddr;
vm_page_t nkpg;
pd_entry_t *l0, *l1, *l2;
mtx_assert(&kernel_map->system_mtx, MA_OWNED);
addr = roundup2(addr, L2_SIZE);
if (addr - 1 >= kernel_map->max_offset)
addr = kernel_map->max_offset;
while (kernel_vm_end < addr) {
l0 = pmap_l0(kernel_pmap, kernel_vm_end);
KASSERT(pmap_load(l0) != 0,
("pmap_growkernel: No level 0 kernel entry"));
l1 = pmap_l0_to_l1(l0, kernel_vm_end);
if (pmap_load(l1) == 0) {
/* We need a new PDP entry */
nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO);
if (nkpg == NULL)
panic("pmap_growkernel: no memory to grow kernel");
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
pmap_load_store(l1, paddr | L1_TABLE);
continue; /* try again */
}
l2 = pmap_l1_to_l2(l1, kernel_vm_end);
if ((pmap_load(l2) & ATTR_AF) != 0) {
kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
break;
}
continue;
}
nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (nkpg == NULL)
panic("pmap_growkernel: no memory to grow kernel");
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
pmap_load_store(l2, paddr | L2_TABLE);
pmap_invalidate_page(kernel_pmap, kernel_vm_end);
kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
break;
}
}
}
/***************************************************
* page management routines.
***************************************************/
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{
return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
#define PC_FREE0 0xfffffffffffffffful
#define PC_FREE1 0xfffffffffffffffful
#define PC_FREE2 0x000000fffffffffful
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
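/*
 * Illustrative aside, not part of the original source: a quick stand-alone
 * check that the three freemask words above describe exactly _NPCPV (168)
 * pv entries per chunk: two full 64-bit words plus 40 bits in the last.
 */
#include <stdint.h>
#include <stdio.h>

static int
sk_popcount64(uint64_t x)
{
	int n;

	for (n = 0; x != 0; x >>= 1)
		n += (int)(x & 1);
	return (n);
}

int
main(void)
{
	const uint64_t freemask[3] = {
		0xfffffffffffffffful,	/* PC_FREE0 */
		0xfffffffffffffffful,	/* PC_FREE1 */
		0x000000fffffffffful,	/* PC_FREE2 */
	};
	int i, total;

	for (i = 0, total = 0; i < 3; i++)
		total += sk_popcount64(freemask[i]);
	printf("pv entries per chunk: %d\n", total);	/* prints 168 */
	return (0);
}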
#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
"Number of times tried to get a chunk page but failed.");
static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
"Current number of spare pv entries");
#endif
#endif /* 0 */
/*
* We are in a serious low memory condition. Resort to
* drastic measures to free some pages so we can allocate
* another pv entry chunk.
*
* Returns NULL if PV entries were reclaimed from the specified pmap.
*
* We do not, however, unmap 2mpages because subsequent accesses will
* allocate per-page pv entries until repromotion occurs, thereby
* exacerbating the shortage of free pv entries.
*/
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
struct pch new_tail;
struct pv_chunk *pc;
struct md_page *pvh;
pd_entry_t *pde;
pmap_t pmap;
pt_entry_t *pte, tpte;
pv_entry_t pv;
vm_offset_t va;
vm_page_t m, m_pc;
struct spglist free;
uint64_t inuse;
int bit, field, freed, lvl;
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
pmap = NULL;
m_pc = NULL;
SLIST_INIT(&free);
TAILQ_INIT(&new_tail);
mtx_lock(&pv_chunks_mutex);
while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != pc->pc_pmap) {
if (pmap != NULL && pmap != locked_pmap)
PMAP_UNLOCK(pmap);
pmap = pc->pc_pmap;
/* Avoid deadlock and lock recursion. */
if (pmap > locked_pmap) {
RELEASE_PV_LIST_LOCK(lockp);
PMAP_LOCK(pmap);
} else if (pmap != locked_pmap &&
!PMAP_TRYLOCK(pmap)) {
pmap = NULL;
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
continue;
}
}
/*
* Destroy every non-wired, 4 KB page mapping in the chunk.
*/
freed = 0;
for (field = 0; field < _NPCM; field++) {
for (inuse = ~pc->pc_map[field] & pc_freemask[field];
inuse != 0; inuse &= ~(1UL << bit)) {
bit = ffsl(inuse) - 1;
pv = &pc->pc_pventry[field * 64 + bit];
va = pv->pv_va;
pde = pmap_pde(pmap, va, &lvl);
if (lvl != 2)
continue;
pte = pmap_l2_to_l3(pde, va);
tpte = pmap_load(pte);
if ((tpte & ATTR_SW_WIRED) != 0)
continue;
tpte = pmap_load_clear(pte);
pmap_invalidate_page(pmap, va);
m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
if ((tpte & ATTR_AF) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list)) {
vm_page_aflag_clear(m,
PGA_WRITEABLE);
}
}
pc->pc_map[field] |= 1UL << bit;
pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
freed++;
}
}
if (freed == 0) {
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
continue;
}
/* Every freed mapping is for a 4 KB page. */
pmap_resident_count_dec(pmap, freed);
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
pc->pc_map[2] == PC_FREE2) {
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
/* Entire chunk is free; return it. */
m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
dump_drop_page(m_pc->phys_addr);
mtx_lock(&pv_chunks_mutex);
break;
}
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
/* One freed pv entry in locked_pmap is sufficient. */
if (pmap == locked_pmap)
break;
}
TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != NULL && pmap != locked_pmap)
PMAP_UNLOCK(pmap);
if (m_pc == NULL && !SLIST_EMPTY(&free)) {
m_pc = SLIST_FIRST(&free);
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&vm_cnt.v_wire_count, 1);
}
pmap_free_zero_pages(&free);
return (m_pc);
}
/*
* free the pv_entry back to the free list
*/
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
struct pv_chunk *pc;
int idx, field, bit;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
PV_STAT(atomic_add_long(&pv_entry_frees, 1));
PV_STAT(atomic_add_int(&pv_entry_spare, 1));
PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
pc = pv_to_chunk(pv);
idx = pv - &pc->pc_pventry[0];
field = idx / 64;
bit = idx % 64;
pc->pc_map[field] |= 1ul << bit;
if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
pc->pc_map[2] != PC_FREE2) {
/* 98% of the time, pc is already at the head of the list. */
if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
}
return;
}
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
free_pv_chunk(pc);
}
static void
free_pv_chunk(struct pv_chunk *pc)
{
vm_page_t m;
mtx_lock(&pv_chunks_mutex);
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
/* entire chunk is free, return it */
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
dump_drop_page(m->phys_addr);
vm_page_unwire(m, PQ_NONE);
vm_page_free(m);
}
/*
* Returns a new PV entry, allocating a new PV chunk from the system when
* needed. If this PV chunk allocation fails and a PV list lock pointer was
* given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
* returned.
*
* The given PV list lock may be released.
*/
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
int bit, field;
pv_entry_t pv;
struct pv_chunk *pc;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
if (pc != NULL) {
for (field = 0; field < _NPCM; field++) {
if (pc->pc_map[field]) {
bit = ffsl(pc->pc_map[field]) - 1;
break;
}
}
if (field < _NPCM) {
pv = &pc->pc_pventry[field * 64 + bit];
pc->pc_map[field] &= ~(1ul << bit);
/* If this was the last item, move it to tail */
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
pc->pc_map[2] == 0) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
pc_list);
}
PV_STAT(atomic_add_long(&pv_entry_count, 1));
PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
return (pv);
}
}
/* No free items, allocate another chunk */
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED);
if (m == NULL) {
if (lockp == NULL) {
PV_STAT(pc_chunk_tryfail++);
return (NULL);
}
m = reclaim_pv_chunk(pmap, lockp);
if (m == NULL)
goto retry;
}
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
dump_add_page(m->phys_addr);
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
pc->pc_pmap = pmap;
pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
mtx_lock(&pv_chunks_mutex);
TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
pv = &pc->pc_pventry[0];
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
PV_STAT(atomic_add_long(&pv_entry_count, 1));
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
return (pv);
}
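/*
 * Illustrative aside, not part of the original source: the bitmap idiom
 * used by free_pv_entry() and get_pv_entry() above, reduced to a single
 * stand-alone 64-bit map in plain C.  A set bit means "entry is free".
 */
#include <stdint.h>

static int
sk_lowest_set_bit(uint64_t x)
{
	int bit;

	for (bit = 0; (x & 1) == 0; bit++)
		x >>= 1;
	return (bit);
}

/* Take the lowest free entry; returns its index, or -1 if the map is full. */
static int
sk_bitmap_alloc(uint64_t *map)
{
	int bit;

	if (*map == 0)
		return (-1);
	bit = sk_lowest_set_bit(*map);
	*map &= ~(1ULL << bit);
	return (bit);
}

/* Give an entry back by setting its bit again. */
static void
sk_bitmap_free(uint64_t *map, int bit)
{
	*map |= 1ULL << bit;
}

int
main(void)
{
	uint64_t map = 0xfffffffffffffffeULL;	/* bit 0 already in use */
	int idx;

	idx = sk_bitmap_alloc(&map);	/* takes bit 1, the lowest free one */
	sk_bitmap_free(&map, idx);
	return (idx == 1 ? 0 : 1);
}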
/*
* Ensure that the number of spare PV entries in the specified pmap meets or
* exceeds the given count, "needed".
*
* The given PV list lock may be released.
*/
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
struct pch new_tail;
struct pv_chunk *pc;
int avail, free;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
/*
* Newly allocated PV chunks must be stored in a private list until
* the required number of PV chunks have been allocated. Otherwise,
* reclaim_pv_chunk() could recycle one of these chunks. In
* contrast, these chunks must be added to the pmap upon allocation.
*/
TAILQ_INIT(&new_tail);
retry:
avail = 0;
TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
bit_count((bitstr_t *)pc->pc_map, 0,
sizeof(pc->pc_map) * NBBY, &free);
if (free == 0)
break;
avail += free;
if (avail >= needed)
break;
}
for (; avail < needed; avail += _NPCPV) {
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED);
if (m == NULL) {
m = reclaim_pv_chunk(pmap, lockp);
if (m == NULL)
goto retry;
}
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
dump_add_page(m->phys_addr);
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
pc->pc_pmap = pmap;
pc->pc_map[0] = PC_FREE0;
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
}
if (!TAILQ_EMPTY(&new_tail)) {
mtx_lock(&pv_chunks_mutex);
TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
mtx_unlock(&pv_chunks_mutex);
}
}
/*
* First find and then remove the pv entry for the specified pmap and virtual
* address from the specified pv list. Returns the pv entry if found and NULL
* otherwise. This operation can be performed on pv lists for either 4KB or
* 2MB page mappings.
*/
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
pv_entry_t pv;
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
break;
}
}
return (pv);
}
/*
* After demotion from a 2MB page mapping to 512 4KB page mappings,
* destroy the pv entry for the 2MB page mapping and reinstantiate the pv
* entries for each of the 4KB page mappings.
*/
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
struct rwlock **lockp)
{
struct md_page *pvh;
struct pv_chunk *pc;
pv_entry_t pv;
vm_offset_t va_last;
vm_page_t m;
int bit, field;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((pa & L2_OFFSET) == 0,
("pmap_pv_demote_l2: pa is not 2mpage aligned"));
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the 2mpage's pv entry for this mapping to the first
* page's pv list. Once this transfer begins, the pv list lock
* must not be released until the last pv entry is reinstantiated.
*/
pvh = pa_to_pvh(pa);
va = va & ~L2_OFFSET;
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
va_last = va + L2_SIZE - PAGE_SIZE;
for (;;) {
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
for (field = 0; field < _NPCM; field++) {
while (pc->pc_map[field]) {
bit = ffsl(pc->pc_map[field]) - 1;
pc->pc_map[field] &= ~(1ul << bit);
pv = &pc->pc_pventry[field * 64 + bit];
va += PAGE_SIZE;
pv->pv_va = va;
m++;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_pv_demote_l2: page %p is not managed", m));
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if (va == va_last)
goto out;
}
}
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
}
out:
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
}
PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}
/*
* First find and then destroy the pv entry for the specified pmap and virtual
* address. This operation can be performed on pv lists for either 4KB or 2MB
* page mappings.
*/
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
pv_entry_t pv;
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
free_pv_entry(pmap, pv);
}
/*
* Conditionally create the PV entry for a 4KB page mapping if the required
* memory can be allocated without resorting to reclamation.
*/
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct rwlock **lockp)
{
pv_entry_t pv;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/* Pass NULL instead of the lock pointer to disable reclamation. */
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
return (TRUE);
} else
return (FALSE);
}
/*
* pmap_remove_l2: do the things to unmap a level 2 superpage in a process
*/
static int
pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l2;
vm_offset_t eva, va;
vm_page_t m, ml3;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
old_l2 = pmap_load_clear(l2);
pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
if (old_l2 & ATTR_SW_WIRED)
pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
if (old_l2 & ATTR_SW_MANAGED) {
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
pmap_pvh_free(pvh, pmap, sva);
eva = sva + L2_SIZE;
for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
va < eva; va += PAGE_SIZE, m++) {
if (pmap_page_dirty(old_l2))
vm_page_dirty(m);
if (old_l2 & ATTR_AF)
vm_page_aflag_set(m, PGA_REFERENCED);
if (TAILQ_EMPTY(&m->md.pv_list) &&
TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
}
KASSERT(pmap != kernel_pmap,
("Attempting to remove an l2 kernel page"));
ml3 = pmap_remove_pt_page(pmap, sva);
if (ml3 != NULL) {
pmap_resident_count_dec(pmap, 1);
KASSERT(ml3->wire_count == NL3PG,
("pmap_remove_l2: l3 page wire count error"));
ml3->wire_count = 0;
pmap_add_delayed_free_list(ml3, free, FALSE);
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
}
return (pmap_unuse_pt(pmap, sva, l1e, free));
}
/*
* pmap_remove_l3: do the things to unmap a page in a process
*/
static int
pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l3;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
old_l3 = pmap_load_clear(l3);
pmap_invalidate_page(pmap, va);
if (old_l3 & ATTR_SW_WIRED)
pmap->pm_stats.wired_count -= 1;
pmap_resident_count_dec(pmap, 1);
if (old_l3 & ATTR_SW_MANAGED) {
m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
if (pmap_page_dirty(old_l3))
vm_page_dirty(m);
if (old_l3 & ATTR_AF)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
pmap_pvh_free(&m->md, pmap, va);
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
}
return (pmap_unuse_pt(pmap, va, l2e, free));
}
/*
* Remove the given range of addresses from the specified map.
*
* It is assumed that the start and end are properly
* rounded to the page size.
*/
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
vm_offset_t va, va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t l3_paddr, *l3;
struct spglist free;
/*
* Perform an unsynchronized read. This is, however, safe.
*/
if (pmap->pm_stats.resident_count == 0)
return;
SLIST_INIT(&free);
PMAP_LOCK(pmap);
lock = NULL;
for (; sva < eva; sva = va_next) {
if (pmap->pm_stats.resident_count == 0)
break;
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
/*
* Calculate index for next page table.
*/
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (l2 == NULL)
continue;
l3_paddr = pmap_load(l2);
if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
if (sva + L2_SIZE == va_next && eva >= va_next) {
pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
&free, &lock);
continue;
} else if (pmap_demote_l2_locked(pmap, l2,
sva & ~L2_OFFSET, &lock) == NULL)
continue;
l3_paddr = pmap_load(l2);
}
/*
* Weed out invalid mappings.
*/
if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
continue;
/*
* Limit our scan to either the end of the va represented
* by the current page table page, or to the end of the
* range being removed.
*/
if (va_next > eva)
va_next = eva;
va = va_next;
for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
sva += L3_SIZE) {
if (l3 == NULL)
panic("l3 == NULL");
if (pmap_load(l3) == 0) {
if (va != va_next) {
pmap_invalidate_range(pmap, va, sva);
va = va_next;
}
continue;
}
if (va == va_next)
va = sva;
if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
&lock)) {
sva += L3_SIZE;
break;
}
}
if (va != va_next)
pmap_invalidate_range(pmap, va, sva);
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
pmap_free_zero_pages(&free);
}
/*
* Routine: pmap_remove_all
* Function:
* Removes this physical page from
* all physical maps in which it resides.
* Reflects back modify bits to the pager.
*
* Notes:
* Original versions of this routine were very
* inefficient because they iteratively called
* pmap_remove (slow...)
*/
void
pmap_remove_all(vm_page_t m)
{
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
struct rwlock *lock;
pd_entry_t *pde, tpde;
pt_entry_t *pte, tpte;
vm_offset_t va;
struct spglist free;
int lvl, pvh_gen, md_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_all: page %p is not managed", m));
SLIST_INIT(&free);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry:
rw_wlock(lock);
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
}
va = pv->pv_va;
pte = pmap_pte(pmap, va, &lvl);
KASSERT(pte != NULL,
("pmap_remove_all: no page table entry found"));
KASSERT(lvl == 2,
("pmap_remove_all: invalid pte level %d", lvl));
pmap_demote_l2_locked(pmap, pte, va, &lock);
PMAP_UNLOCK(pmap);
}
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
}
pmap_resident_count_dec(pmap, 1);
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL,
("pmap_remove_all: no page directory entry found"));
KASSERT(lvl == 2,
("pmap_remove_all: invalid pde level %d", lvl));
tpde = pmap_load(pde);
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
pmap_load_clear(pte);
pmap_invalidate_page(pmap, pv->pv_va);
if (tpte & ATTR_SW_WIRED)
pmap->pm_stats.wired_count--;
if ((tpte & ATTR_AF) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
/*
* Update the vm_page_t clean and reference bits.
*/
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
vm_page_aflag_clear(m, PGA_WRITEABLE);
rw_wunlock(lock);
pmap_free_zero_pages(&free);
}
/*
* Set the physical protection on the
* specified range of this map as requested.
*/
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
vm_offset_t va, va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t *l3p, l3, nbits;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
if (prot == VM_PROT_NONE) {
pmap_remove(pmap, sva, eva);
return;
}
if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
(VM_PROT_WRITE | VM_PROT_EXECUTE))
return;
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (pmap_load(l2) == 0)
continue;
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
l3p = pmap_demote_l2(pmap, l2, sva);
if (l3p == NULL)
continue;
}
KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_protect: Invalid L2 entry after demotion"));
if (va_next > eva)
va_next = eva;
va = va_next;
for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
sva += L3_SIZE) {
l3 = pmap_load(l3p);
if (!pmap_l3_valid(l3))
continue;
nbits = 0;
if ((prot & VM_PROT_WRITE) == 0) {
if ((l3 & ATTR_SW_MANAGED) &&
pmap_page_dirty(l3)) {
vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
~ATTR_MASK));
}
nbits |= ATTR_AP(ATTR_AP_RO);
}
if ((prot & VM_PROT_EXECUTE) == 0)
nbits |= ATTR_XN;
pmap_set(l3p, nbits);
/* XXX: Use pmap_invalidate_range */
pmap_invalidate_page(pmap, sva);
}
}
PMAP_UNLOCK(pmap);
}
/*
* Inserts the specified page table page into the specified pmap's collection
* of idle page table pages. Each of a pmap's page table pages is responsible
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
return (vm_radix_insert(&pmap->pm_root, mpte));
}
/*
* Removes the page table page mapping the specified virtual address from the
* specified pmap's collection of idle page table pages, and returns it.
* Otherwise, returns NULL if there is no page table page corresponding to the
* specified virtual address.
*/
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}
/*
* Performs a break-before-make update of a pmap entry. This is needed when
* either promoting or demoting pages to ensure the TLB doesn't get into an
* inconsistent state.
*/
static void
pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
vm_offset_t va, vm_size_t size)
{
register_t intr;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* Ensure we don't get switched out with the page table in an
* inconsistent state. We also need to ensure no interrupts fire
* as they may make use of an address we are about to invalidate.
*/
intr = intr_disable();
critical_enter();
/* Clear the old mapping */
pmap_load_clear(pte);
pmap_invalidate_range(pmap, va, va + size);
/* Create the new mapping */
pmap_load_store(pte, newpte);
critical_exit();
intr_restore(intr);
}
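/*
 * Added commentary (not in the original source): the clear-invalidate-store
 * order above is the ARMv8 "break-before-make" requirement.  Replacing a
 * live translation table entry with one of a different size or with
 * different attributes, without an intervening invalidation, can lead to
 * TLB conflict aborts or the use of stale attributes, so the old entry is
 * torn down and flushed before the new one is written.
 */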
#if VM_NRESERVLEVEL > 0
/*
* After promotion from 512 4KB page mappings to a single 2MB page mapping,
* replace the many pv entries for the 4KB page mappings by a single pv entry
* for the 2MB page mapping.
*/
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
struct rwlock **lockp)
{
struct md_page *pvh;
pv_entry_t pv;
vm_offset_t va_last;
vm_page_t m;
KASSERT((pa & L2_OFFSET) == 0,
("pmap_pv_promote_l2: pa is not 2mpage aligned"));
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the first page's pv entry for this mapping to the 2mpage's
* pv list. Aside from avoiding the cost of a call to get_pv_entry(),
* a transfer avoids the possibility that get_pv_entry() calls
* reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
* mappings that is being promoted.
*/
m = PHYS_TO_VM_PAGE(pa);
va = va & ~L2_OFFSET;
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
pvh = pa_to_pvh(pa);
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + L2_SIZE - PAGE_SIZE;
do {
m++;
va += PAGE_SIZE;
pmap_pvh_free(&m->md, pmap, va);
} while (va < va_last);
}
/*
* Tries to promote the 512, contiguous 4KB page mappings that are within a
* single level 2 table entry to a single 2MB page mapping. For promotion
* to occur, two conditions must be met: (1) the 4KB page mappings must map
* aligned, contiguous physical memory and (2) the 4KB page mappings must have
* identical characteristics.
*/
static void
pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
struct rwlock **lockp)
{
pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
vm_page_t mpte;
vm_offset_t sva;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
sva = va & ~L2_OFFSET;
firstl3 = pmap_l2_to_l3(l2, sva);
newl2 = pmap_load(firstl3);
/* Check that the alignment is valid */
if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
return;
}
pa = newl2 + L2_SIZE - PAGE_SIZE;
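/*
 * Added commentary (not in the original source): walk the remaining
 * NL3PG - 1 entries backwards, requiring each to equal the expected
 * descriptor exactly: the same attributes as the first entry and physical
 * addresses decreasing by PAGE_SIZE, i.e. the 4KB mappings are contiguous
 * and share identical characteristics.
 */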
for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
oldl3 = pmap_load(l3);
if (oldl3 != pa) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
return;
}
pa -= PAGE_SIZE;
}
/*
* Save the page table page in its current state until the L2
* mapping the superpage is demoted by pmap_demote_l2() or
* destroyed by pmap_remove_l3().
*/
mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
KASSERT(mpte >= vm_page_array &&
mpte < &vm_page_array[vm_page_array_size],
("pmap_promote_l2: page table page is out of range"));
KASSERT(mpte->pindex == pmap_l2_pindex(va),
("pmap_promote_l2: page table page's pindex is wrong"));
if (pmap_insert_pt_page(pmap, mpte)) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP,
"pmap_promote_l2: failure for va %#lx in pmap %p", va,
pmap);
return;
}
if ((newl2 & ATTR_SW_MANAGED) != 0)
pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
newl2 &= ~ATTR_DESCR_MASK;
newl2 |= L2_BLOCK;
pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
atomic_add_long(&pmap_l2_promotions, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
pmap);
}
#endif /* VM_NRESERVLEVEL > 0 */
/*
* Insert the given physical page (p) at
* the specified virtual address (v) in the
* target physical map with the protection requested.
*
* If specified, the page will be wired down, meaning
* that the related pte can not be reclaimed.
*
* NB: This is the only routine which MAY NOT lazy-evaluate
* or lose information. That is, this routine must actually
* insert this page into the given map NOW.
*/
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
u_int flags, int8_t psind __unused)
{
struct rwlock *lock;
pd_entry_t *pde;
pt_entry_t new_l3, orig_l3;
pt_entry_t *l2, *l3;
pv_entry_t pv;
vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
vm_page_t mpte, om, l1_m, l2_m, l3_m;
boolean_t nosleep;
int lvl;
va = trunc_page(va);
if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
VM_OBJECT_ASSERT_LOCKED(m->object);
pa = VM_PAGE_TO_PHYS(m);
new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
L3_PAGE);
if ((prot & VM_PROT_WRITE) == 0)
new_l3 |= ATTR_AP(ATTR_AP_RO);
if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
new_l3 |= ATTR_XN;
if ((flags & PMAP_ENTER_WIRED) != 0)
new_l3 |= ATTR_SW_WIRED;
if (va < VM_MAXUSER_ADDRESS)
new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
mpte = NULL;
lock = NULL;
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, va, &lvl);
if (pde != NULL && lvl == 1) {
l2 = pmap_l1_to_l2(pde, va);
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
(l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
&lock)) != NULL) {
l3 = &l3[pmap_l3_index(va)];
if (va < VM_MAXUSER_ADDRESS) {
mpte = PHYS_TO_VM_PAGE(
pmap_load(l2) & ~ATTR_MASK);
mpte->wire_count++;
}
goto havel3;
}
}
if (va < VM_MAXUSER_ADDRESS) {
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
if (mpte == NULL && nosleep) {
CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
return (KERN_RESOURCE_SHORTAGE);
}
pde = pmap_pde(pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_enter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2,
("pmap_enter: Invalid level %d", lvl));
l3 = pmap_l2_to_l3(pde, va);
} else {
/*
* If we get a level 2 pde it must point to a level 3 entry
* otherwise we will need to create the intermediate tables
*/
if (lvl < 2) {
switch(lvl) {
default:
case -1:
/* Get the l0 pde to update */
pde = pmap_l0(pmap, va);
KASSERT(pde != NULL, ("..."));
l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l1_m == NULL)
panic("pmap_enter: l1 pte_m == NULL");
if ((l1_m->flags & PG_ZERO) == 0)
pmap_zero_page(l1_m);
l1_pa = VM_PAGE_TO_PHYS(l1_m);
pmap_load_store(pde, l1_pa | L0_TABLE);
/* FALLTHROUGH */
case 0:
/* Get the l1 pde to update */
pde = pmap_l1_to_l2(pde, va);
KASSERT(pde != NULL, ("..."));
l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l2_m == NULL)
panic("pmap_enter: l2 pte_m == NULL");
if ((l2_m->flags & PG_ZERO) == 0)
pmap_zero_page(l2_m);
l2_pa = VM_PAGE_TO_PHYS(l2_m);
pmap_load_store(pde, l2_pa | L1_TABLE);
/* FALLTHROUGH */
case 1:
/* Get the l2 pde to update */
pde = pmap_l1_to_l2(pde, va);
l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l3_m == NULL)
panic("pmap_enter: l3 pte_m == NULL");
if ((l3_m->flags & PG_ZERO) == 0)
pmap_zero_page(l3_m);
l3_pa = VM_PAGE_TO_PHYS(l3_m);
pmap_load_store(pde, l3_pa | L2_TABLE);
break;
}
}
l3 = pmap_l2_to_l3(pde, va);
pmap_invalidate_page(pmap, va);
}
havel3:
om = NULL;
orig_l3 = pmap_load(l3);
opa = orig_l3 & ~ATTR_MASK;
/*
* Is the specified virtual address already mapped?
*/
if (pmap_l3_valid(orig_l3)) {
/*
* Wiring change, just update stats. We don't worry about
* wiring PT pages as they remain resident as long as there
* are valid mappings in them. Hence, if a user page is wired,
* the PT page will be also.
*/
if ((flags & PMAP_ENTER_WIRED) != 0 &&
(orig_l3 & ATTR_SW_WIRED) == 0)
pmap->pm_stats.wired_count++;
else if ((flags & PMAP_ENTER_WIRED) == 0 &&
(orig_l3 & ATTR_SW_WIRED) != 0)
pmap->pm_stats.wired_count--;
/*
* Remove the extra PT page reference.
*/
if (mpte != NULL) {
mpte->wire_count--;
KASSERT(mpte->wire_count > 0,
("pmap_enter: missing reference to page table page,"
" va: 0x%lx", va));
}
/*
* Has the physical page changed?
*/
if (opa == pa) {
/*
* No, might be a protection or wiring change.
*/
if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
new_l3 |= ATTR_SW_MANAGED;
if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
ATTR_AP(ATTR_AP_RW)) {
vm_page_aflag_set(m, PGA_WRITEABLE);
}
}
goto validate;
}
} else {
/*
* Increment the counters.
*/
if ((new_l3 & ATTR_SW_WIRED) != 0)
pmap->pm_stats.wired_count++;
pmap_resident_count_inc(pmap, 1);
}
/*
* Enter on the PV list if part of our managed memory.
*/
if ((m->oflags & VPO_UNMANAGED) == 0) {
new_l3 |= ATTR_SW_MANAGED;
pv = get_pv_entry(pmap, &lock);
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
vm_page_aflag_set(m, PGA_WRITEABLE);
}
/*
* Update the L3 entry.
*/
if (orig_l3 != 0) {
validate:
orig_l3 = pmap_load(l3);
opa = orig_l3 & ~ATTR_MASK;
if (opa != pa) {
pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE);
if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
om = PHYS_TO_VM_PAGE(opa);
if (pmap_page_dirty(orig_l3))
vm_page_dirty(om);
if ((orig_l3 & ATTR_AF) != 0)
vm_page_aflag_set(om, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
pmap_pvh_free(&om->md, pmap, va);
if ((om->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&om->md.pv_list) &&
((om->flags & PG_FICTITIOUS) != 0 ||
TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
vm_page_aflag_clear(om, PGA_WRITEABLE);
}
} else {
pmap_load_store(l3, new_l3);
pmap_invalidate_page(pmap, va);
if (pmap_page_dirty(orig_l3) &&
(orig_l3 & ATTR_SW_MANAGED) != 0)
vm_page_dirty(m);
}
} else {
pmap_load_store(l3, new_l3);
}
pmap_invalidate_page(pmap, va);
if (pmap != pmap_kernel()) {
if (pmap == &curproc->p_vmspace->vm_pmap &&
(prot & VM_PROT_EXECUTE) != 0)
cpu_icache_sync_range(va, PAGE_SIZE);
#if VM_NRESERVLEVEL > 0
if ((mpte == NULL || mpte->wire_count == NL3PG) &&
pmap_superpages_enabled() &&
(m->flags & PG_FICTITIOUS) == 0 &&
vm_reserv_level_iffullpop(m) == 0) {
pmap_promote_l2(pmap, pde, va, &lock);
}
#endif
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
return (KERN_SUCCESS);
}
/*
* Maps a sequence of resident pages belonging to the same object.
* The sequence begins with the given page m_start. This page is
* mapped at the given virtual address start. Each subsequent page is
* mapped at a virtual address that is offset from start by the same
* amount as the page is offset from m_start within the object. The
* last page in the sequence is the page with the largest offset from
* m_start that can be mapped at a virtual address less than the given
* virtual address end. Not every virtual page between start and end
* is mapped; only those for which a resident page exists with the
* corresponding offset from m_start are mapped.
*/
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
vm_page_t m_start, vm_prot_t prot)
{
struct rwlock *lock;
vm_offset_t va;
vm_page_t m, mpte;
vm_pindex_t diff, psize;
VM_OBJECT_ASSERT_LOCKED(m_start->object);
psize = atop(end - start);
mpte = NULL;
m = m_start;
lock = NULL;
PMAP_LOCK(pmap);
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
va = start + ptoa(diff);
mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
m = TAILQ_NEXT(m, listq);
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
}
/*
* this code makes some *MAJOR* assumptions:
* 1. Current pmap & pmap exists.
* 2. Not wired.
* 3. Read access.
* 4. No page table pages.
* but is *MUCH* faster than pmap_enter...
*/
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
struct rwlock *lock;
lock = NULL;
PMAP_LOCK(pmap);
(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
}
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
struct spglist free;
pd_entry_t *pde;
pt_entry_t *l2, *l3;
vm_paddr_t pa;
int lvl;
KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
(m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
/*
* In the case that a page table page is not
* resident, we are creating it here.
*/
if (va < VM_MAXUSER_ADDRESS) {
vm_pindex_t l2pindex;
/*
* Calculate pagetable page index
*/
l2pindex = pmap_l2_pindex(va);
if (mpte && (mpte->pindex == l2pindex)) {
mpte->wire_count++;
} else {
/*
* Get the l2 entry
*/
pde = pmap_pde(pmap, va, &lvl);
/*
* If the page table page is mapped, we just increment
* the hold count, and activate it. Otherwise, we
* attempt to allocate a page table page. If this
* attempt fails, we don't retry. Instead, we give up.
*/
if (lvl == 1) {
l2 = pmap_l1_to_l2(pde, va);
if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
L2_BLOCK)
return (NULL);
}
if (lvl == 2 && pmap_load(pde) != 0) {
mpte =
PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
mpte->wire_count++;
} else {
/*
* Pass NULL instead of the PV list lock
* pointer, because we don't intend to sleep.
*/
mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
if (mpte == NULL)
return (mpte);
}
}
l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
l3 = &l3[pmap_l3_index(va)];
} else {
mpte = NULL;
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
va));
KASSERT(lvl == 2,
("pmap_enter_quick_locked: Invalid level %d", lvl));
l3 = pmap_l2_to_l3(pde, va);
}
if (pmap_load(l3) != 0) {
if (mpte != NULL) {
mpte->wire_count--;
mpte = NULL;
}
return (mpte);
}
/*
* Enter on the PV list if part of our managed memory.
*/
if ((m->oflags & VPO_UNMANAGED) == 0 &&
!pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
if (mpte != NULL) {
SLIST_INIT(&free);
if (pmap_unwire_l3(pmap, va, mpte, &free)) {
pmap_invalidate_page(pmap, va);
pmap_free_zero_pages(&free);
}
mpte = NULL;
}
return (mpte);
}
/*
* Increment counters
*/
pmap_resident_count_inc(pmap, 1);
pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
ATTR_AP(ATTR_AP_RO) | L3_PAGE;
if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
pa |= ATTR_XN;
else if (va < VM_MAXUSER_ADDRESS)
pa |= ATTR_PXN;
/*
* Now validate mapping with RO protection
*/
if ((m->oflags & VPO_UNMANAGED) == 0)
pa |= ATTR_SW_MANAGED;
pmap_load_store(l3, pa);
pmap_invalidate_page(pmap, va);
return (mpte);
}
/*
* This code maps large physical mmap regions into the
* processor address space. Note that some shortcuts
* are taken, but the code works.
*/
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
vm_pindex_t pindex, vm_size_t size)
{
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
("pmap_object_init_pt: non-device object"));
}
/*
* Clear the wired attribute from the mappings for the specified range of
* addresses in the given pmap. Every valid mapping within that range
* must have the wired attribute set. In contrast, invalid mappings
* cannot have the wired attribute set, so they are ignored.
*
* The wired attribute of the page table entry is not a hardware feature,
* so there is no need to invalidate any TLB entries.
*/
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
vm_offset_t va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t *l3;
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (pmap_load(l2) == 0)
continue;
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
l3 = pmap_demote_l2(pmap, l2, sva);
if (l3 == NULL)
continue;
}
KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_unwire: Invalid l2 entry after demotion"));
if (va_next > eva)
va_next = eva;
for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
sva += L3_SIZE) {
if (pmap_load(l3) == 0)
continue;
if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
panic("pmap_unwire: l3 %#jx is missing "
"ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
/*
* PG_W must be cleared atomically. Although the pmap
* lock synchronizes access to PG_W, another processor
* could be setting PG_M and/or PG_A concurrently.
*/
atomic_clear_long(l3, ATTR_SW_WIRED);
pmap->pm_stats.wired_count--;
}
}
PMAP_UNLOCK(pmap);
}
/*
* Copy the range specified by src_addr/len
* from the source map to the range dst_addr/len
* in the destination map.
*
* This routine is only advisory and need not do anything.
*/
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t src_addr)
{
}
/*
* pmap_zero_page zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*/
void
pmap_zero_page(vm_page_t m)
{
vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
pagezero((void *)va);
}
/*
* pmap_zero_page_area zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*
* off and size may not cover an area beyond a single hardware page.
*/
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
if (off == 0 && size == PAGE_SIZE)
pagezero((void *)va);
else
bzero((char *)va + off, size);
}
/*
* pmap_copy_page copies the specified (machine independent)
* page by mapping the page into virtual memory and using
* bcopy to copy the page, one machine dependent page at a
* time.
*/
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
pagecopy((void *)src, (void *)dst);
}
int unmapped_buf_allowed = 1;
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
{
void *a_cp, *b_cp;
vm_page_t m_a, m_b;
vm_paddr_t p_a, p_b;
vm_offset_t a_pg_offset, b_pg_offset;
int cnt;
while (xfersize > 0) {
a_pg_offset = a_offset & PAGE_MASK;
m_a = ma[a_offset >> PAGE_SHIFT];
p_a = m_a->phys_addr;
b_pg_offset = b_offset & PAGE_MASK;
m_b = mb[b_offset >> PAGE_SHIFT];
p_b = m_b->phys_addr;
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
if (__predict_false(!PHYS_IN_DMAP(p_a))) {
panic("!DMAP a %lx", p_a);
} else {
a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
}
if (__predict_false(!PHYS_IN_DMAP(p_b))) {
panic("!DMAP b %lx", p_b);
} else {
b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
}
bcopy(a_cp, b_cp, cnt);
a_offset += cnt;
b_offset += cnt;
xfersize -= cnt;
}
}
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
}
void
pmap_quick_remove_page(vm_offset_t addr)
{
}
/*
* Returns true if the pmap's pv is one of the first
* 16 pvs linked to from this page. This count may
* be changed upwards or downwards in the future; it
* is only necessary that true be returned for a small
* subset of pmaps for proper page aging.
*/
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
struct md_page *pvh;
struct rwlock *lock;
pv_entry_t pv;
int loops = 0;
boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_page_exists_quick: page %p is not managed", m));
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
}
loops++;
if (loops >= 16)
break;
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
}
loops++;
if (loops >= 16)
break;
}
}
rw_runlock(lock);
return (rv);
}
/*
* pmap_page_wired_mappings:
*
* Return the number of managed mappings to the given physical page
* that are wired.
*/
int
pmap_page_wired_mappings(vm_page_t m)
{
struct rwlock *lock;
struct md_page *pvh;
pmap_t pmap;
pt_entry_t *pte;
pv_entry_t pv;
int count, lvl, md_gen, pvh_gen;
if ((m->oflags & VPO_UNMANAGED) != 0)
return (0);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
restart:
count = 0;
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
count++;
PMAP_UNLOCK(pmap);
}
if ((m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if (pte != NULL &&
(pmap_load(pte) & ATTR_SW_WIRED) != 0)
count++;
PMAP_UNLOCK(pmap);
}
}
rw_runlock(lock);
return (count);
}
/*
* Destroy all managed, non-wired mappings in the given user-space
* pmap. This pmap cannot be active on any processor besides the
* caller.
*
* This function cannot be applied to the kernel pmap. Moreover, it
* is not intended for general use. It is only to be used during
* process termination. Consequently, it can be implemented in ways
* that make it faster than pmap_remove(). First, it can more quickly
* destroy mappings by iterating over the pmap's collection of PV
* entries, rather than searching the page table. Second, it doesn't
* have to test and clear the page table entries atomically, because
* no processor is currently accessing the user address space. In
* particular, a page table entry's dirty bit won't change state once
* this function starts.
*/
void
pmap_remove_pages(pmap_t pmap)
{
pd_entry_t *pde;
pt_entry_t *pte, tpte;
struct spglist free;
vm_page_t m, ml3, mt;
pv_entry_t pv;
struct md_page *pvh;
struct pv_chunk *pc, *npc;
struct rwlock *lock;
int64_t bit;
uint64_t inuse, bitmask;
int allfree, field, freed, idx, lvl;
vm_paddr_t pa;
lock = NULL;
SLIST_INIT(&free);
PMAP_LOCK(pmap);
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
allfree = 1;
freed = 0;
for (field = 0; field < _NPCM; field++) {
inuse = ~pc->pc_map[field] & pc_freemask[field];
while (inuse != 0) {
bit = ffsl(inuse) - 1;
bitmask = 1UL << bit;
idx = field * 64 + bit;
pv = &pc->pc_pventry[idx];
inuse &= ~bitmask;
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL,
("Attempting to remove an unmapped page"));
switch (lvl) {
case 1:
pte = pmap_l1_to_l2(pde, pv->pv_va);
tpte = pmap_load(pte);
KASSERT((tpte & ATTR_DESCR_MASK) ==
L2_BLOCK,
("Attempting to remove an invalid "
"block: %lx", tpte));
break;
case 2:
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
KASSERT((tpte & ATTR_DESCR_MASK) ==
L3_PAGE,
("Attempting to remove an invalid "
"page: %lx", tpte));
break;
default:
panic(
"Invalid page directory level: %d",
lvl);
}
/*
* We cannot remove wired pages from a process' mapping at this time
*/
if (tpte & ATTR_SW_WIRED) {
allfree = 0;
continue;
}
pa = tpte & ~ATTR_MASK;
m = PHYS_TO_VM_PAGE(pa);
KASSERT(m->phys_addr == pa,
("vm_page_t %p phys_addr mismatch %016jx %016jx",
m, (uintmax_t)m->phys_addr,
(uintmax_t)tpte));
KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
m < &vm_page_array[vm_page_array_size],
("pmap_remove_pages: bad pte %#jx",
(uintmax_t)tpte));
pmap_load_clear(pte);
/*
* Update the vm_page_t clean/reference bits.
*/
if ((tpte & ATTR_AP_RW_BIT) ==
ATTR_AP(ATTR_AP_RW)) {
switch (lvl) {
case 1:
for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
vm_page_dirty(mt);
break;
case 2:
vm_page_dirty(m);
break;
}
}
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
/* Mark free */
pc->pc_map[field] |= bitmask;
switch (lvl) {
case 1:
pmap_resident_count_dec(pmap,
L2_SIZE / PAGE_SIZE);
pvh = pa_to_pvh(tpte & ~ATTR_MASK);
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
if ((mt->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&mt->md.pv_list))
vm_page_aflag_clear(mt, PGA_WRITEABLE);
}
ml3 = pmap_remove_pt_page(pmap,
pv->pv_va);
if (ml3 != NULL) {
pmap_resident_count_dec(pmap, 1);
KASSERT(ml3->wire_count == NL3PG,
("pmap_remove_pages: l3 page wire count error"));
ml3->wire_count = 0;
pmap_add_delayed_free_list(ml3,
&free, FALSE);
atomic_subtract_int(
&vm_cnt.v_wire_count, 1);
}
break;
case 2:
pmap_resident_count_dec(pmap, 1);
TAILQ_REMOVE(&m->md.pv_list, pv,
pv_next);
m->md.pv_gen++;
if ((m->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(
VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m,
PGA_WRITEABLE);
}
break;
}
pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
&free);
freed++;
}
}
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
if (allfree) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
free_pv_chunk(pc);
}
}
pmap_invalidate_all(pmap);
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
pmap_free_zero_pages(&free);
}
/*
* This is used to check if a page has been accessed or modified. As we
* don't have a bit to see if it has been modified we have to assume it
* has been if the page is read/write.
*/
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
struct rwlock *lock;
pv_entry_t pv;
struct md_page *pvh;
pt_entry_t *pte, mask, value;
pmap_t pmap;
int lvl, md_gen, pvh_gen;
boolean_t rv;
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
restart:
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
KASSERT(lvl == 3,
("pmap_page_test_mappings: Invalid level %d", lvl));
mask = 0;
value = 0;
if (modified) {
mask |= ATTR_AP_RW_BIT;
value |= ATTR_AP(ATTR_AP_RW);
}
if (accessed) {
mask |= ATTR_AF | ATTR_DESCR_MASK;
value |= ATTR_AF | L3_PAGE;
}
rv = (pmap_load(pte) & mask) == value;
PMAP_UNLOCK(pmap);
if (rv)
goto out;
}
if ((m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
KASSERT(lvl == 2,
("pmap_page_test_mappings: Invalid level %d", lvl));
mask = 0;
value = 0;
if (modified) {
mask |= ATTR_AP_RW_BIT;
value |= ATTR_AP(ATTR_AP_RW);
}
if (accessed) {
mask |= ATTR_AF | ATTR_DESCR_MASK;
value |= ATTR_AF | L2_BLOCK;
}
rv = (pmap_load(pte) & mask) == value;
PMAP_UNLOCK(pmap);
if (rv)
goto out;
}
}
out:
rw_runlock(lock);
return (rv);
}
/*
* pmap_is_modified:
*
* Return whether or not the specified physical page was modified
* in any physical maps.
*/
boolean_t
pmap_is_modified(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_modified: page %p is not managed", m));
/*
* If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* concurrently set while the object is locked. Thus, if PGA_WRITEABLE
* is clear, no PTEs can have PG_M set.
*/
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return (FALSE);
return (pmap_page_test_mappings(m, FALSE, TRUE));
}
/*
* pmap_is_prefaultable:
*
* Return whether or not the specified virtual address is eligible
* for prefault.
*/
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
pt_entry_t *pte;
boolean_t rv;
int lvl;
rv = FALSE;
PMAP_LOCK(pmap);
pte = pmap_pte(pmap, addr, &lvl);
if (pte != NULL && pmap_load(pte) != 0) {
rv = TRUE;
}
PMAP_UNLOCK(pmap);
return (rv);
}
/*
* pmap_is_referenced:
*
* Return whether or not the specified physical page was referenced
* in any physical maps.
*/
boolean_t
pmap_is_referenced(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_referenced: page %p is not managed", m));
return (pmap_page_test_mappings(m, TRUE, FALSE));
}
/*
* Clear the write and modified bits in each of the given page's mappings.
*/
void
pmap_remove_write(vm_page_t m)
{
struct md_page *pvh;
pmap_t pmap;
struct rwlock *lock;
pv_entry_t next_pv, pv;
pt_entry_t oldpte, *pte;
vm_offset_t va;
int lvl, md_gen, pvh_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_write: page %p is not managed", m));
/*
* If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* set by another thread while the object is locked. Thus,
* if PGA_WRITEABLE is clear, no page table entries need updating.
*/
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry_pv_loop:
rw_wlock(lock);
TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
rw_wunlock(lock);
goto retry_pv_loop;
}
}
va = pv->pv_va;
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
&lock);
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
PMAP_UNLOCK(pmap);
}
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen ||
md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
rw_wunlock(lock);
goto retry_pv_loop;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
retry:
oldpte = pmap_load(pte);
if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
if (!atomic_cmpset_long(pte, oldpte,
oldpte | ATTR_AP(ATTR_AP_RO)))
goto retry;
if ((oldpte & ATTR_AF) != 0)
vm_page_dirty(m);
pmap_invalidate_page(pmap, pv->pv_va);
}
PMAP_UNLOCK(pmap);
}
rw_wunlock(lock);
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
static __inline boolean_t
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{
return (FALSE);
}
/*
* pmap_ts_referenced:
*
* Return a count of reference bits for a page, clearing those bits.
* It is not necessary for every reference bit to be cleared, but it
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
* to pmap_is_modified(). However, since this function stops after
* finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
* dirty pages. Those dirty pages will only be detected by a future call
* to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
{
struct md_page *pvh;
pv_entry_t pv, pvf;
pmap_t pmap;
struct rwlock *lock;
pd_entry_t *pde, tpde;
pt_entry_t *pte, tpte;
pt_entry_t *l3;
vm_offset_t va;
vm_paddr_t pa;
int cleared, md_gen, not_cleared, lvl, pvh_gen;
struct spglist free;
bool demoted;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_ts_referenced: page %p is not managed", m));
SLIST_INIT(&free);
cleared = 0;
pa = VM_PAGE_TO_PHYS(m);
lock = PHYS_TO_PV_LIST_LOCK(pa);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
rw_wlock(lock);
retry:
not_cleared = 0;
if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
goto small_mappings;
pv = pvf;
do {
if (pvf == NULL)
pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
}
}
va = pv->pv_va;
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
KASSERT(lvl == 1,
("pmap_ts_referenced: invalid pde level %d", lvl));
tpde = pmap_load(pde);
KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
("pmap_ts_referenced: found an invalid l1 table"));
pte = pmap_l1_to_l2(pde, pv->pv_va);
tpte = pmap_load(pte);
if (pmap_page_dirty(tpte)) {
/*
* Although "tpte" is mapping a 2MB page, because
* this function is called at a 4KB page granularity,
* we only update the 4KB page under test.
*/
vm_page_dirty(m);
}
if ((tpte & ATTR_AF) != 0) {
/*
* Since this reference bit is shared by 512 4KB
* pages, it should not be cleared every time it is
* tested. Apply a simple "hash" function on the
* physical page number, the virtual superpage number,
* and the pmap address to select one 4KB page out of
* the 512 on which testing the reference bit will
* result in clearing that reference bit. This
* function is designed to avoid the selection of the
* same 4KB page for every 2MB page mapping.
*
* On demotion, a mapping that hasn't been referenced
* is simply destroyed. To avoid the possibility of a
* subsequent page fault on a demoted wired mapping,
* always leave its reference bit set. Moreover,
* since the superpage is wired, the current state of
* its reference bit won't affect page replacement.
*/
if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
(uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
(tpte & ATTR_SW_WIRED) == 0) {
if (safe_to_clear_referenced(pmap, tpte)) {
/*
* TODO: We don't handle the access
* flag at all. We need to be able
* to set it in the exception handler.
*/
panic("ARM64TODO: "
"safe_to_clear_referenced\n");
} else if (pmap_demote_l2_locked(pmap, pte,
pv->pv_va, &lock) != NULL) {
demoted = true;
va += VM_PAGE_TO_PHYS(m) -
(tpte & ~ATTR_MASK);
l3 = pmap_l2_to_l3(pte, va);
pmap_remove_l3(pmap, l3, va,
pmap_load(pte), NULL, &lock);
} else
demoted = true;
if (demoted) {
/*
* The superpage mapping was removed
* entirely and therefore 'pv' is no
* longer valid.
*/
if (pvf == pv)
pvf = NULL;
pv = NULL;
}
cleared++;
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
} else
not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
}
if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
goto out;
} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
goto out;
pv = pvf;
do {
if (pvf == NULL)
pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
}
}
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
KASSERT(lvl == 2,
("pmap_ts_referenced: invalid pde level %d", lvl));
tpde = pmap_load(pde);
KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_ts_referenced: found an invalid l2 table"));
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
if ((tpte & ATTR_AF) != 0) {
if (safe_to_clear_referenced(pmap, tpte)) {
/*
* TODO: We don't handle the access flag
* at all. We need to be able to set it in
* the exception handler.
*/
panic("ARM64TODO: safe_to_clear_referenced\n");
} else if ((tpte & ATTR_SW_WIRED) == 0) {
/*
* Wired pages cannot be paged out so
* doing accessed bit emulation for
* them is wasted effort. We do the
* hard work for unwired pages only.
*/
pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
&free, &lock);
pmap_invalidate_page(pmap, pv->pv_va);
cleared++;
if (pvf == pv)
pvf = NULL;
pv = NULL;
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
} else
not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
}
} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
not_cleared < PMAP_TS_REFERENCED_MAX);
out:
rw_wunlock(lock);
pmap_free_zero_pages(&free);
return (cleared + not_cleared);
}
/*
* Apply the given advice to the specified range of addresses within the
* given pmap. Depending on the advice, clear the referenced and/or
* modified flags in each mapping and set the mapped page's dirty field.
*/
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
}
/*
* Clear the modify bits on the specified physical page.
*/
void
pmap_clear_modify(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_clear_modify: page %p is not managed", m));
VM_OBJECT_ASSERT_WLOCKED(m->object);
KASSERT(!vm_page_xbusied(m),
("pmap_clear_modify: page %p is exclusive busied", m));
/*
* If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
* If the object containing the page is locked and the page is not
* exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
*/
if ((m->aflags & PGA_WRITEABLE) == 0)
return;
/* ARM64TODO: We lack support for tracking if a page is modified */
}
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
return ((void *)PHYS_TO_DMAP(pa));
}
void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}
/*
* Sets the memory attribute for the specified page.
*/
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
m->md.pv_memattr = ma;
/*
* If "m" is a normal page, update its direct mapping. This update
* can be relied upon to perform any cache operations that are
* required for data coherence.
*/
if ((m->flags & PG_FICTITIOUS) == 0 &&
pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
m->md.pv_memattr) != 0)
panic("memory attribute change on the direct map failed");
}
/*
* Changes the specified virtual address range's memory type to that given by
* the parameter "mode". The specified virtual address range must be
* completely contained within either the direct map or the kernel map. If
* the virtual address range is contained within the kernel map, then the
* memory type for each of the corresponding ranges of the direct map is also
* changed. (The corresponding ranges of the direct map are those ranges that
* map the same physical pages as the specified virtual address range.) These
* changes to the direct map are necessary because Intel describes the
* behavior of their processors as "undefined" if two or more mappings to the
* same physical page have different memory types.
*
* Returns zero if the change completed successfully, and either EINVAL or
* ENOMEM if the change failed. Specifically, EINVAL is returned if some part
* of the virtual address range was not mapped, and ENOMEM is returned if
* there was insufficient memory available to complete the change. In the
* latter case, the memory type may have been changed on some part of the
* virtual address range or the direct map.
*/
static int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
int error;
PMAP_LOCK(kernel_pmap);
error = pmap_change_attr_locked(va, size, mode);
PMAP_UNLOCK(kernel_pmap);
return (error);
}
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
vm_offset_t base, offset, tmpva;
pt_entry_t l3, *pte, *newpte;
int lvl;
PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
base = trunc_page(va);
offset = va & PAGE_MASK;
size = round_page(offset + size);
if (!VIRT_IN_DMAP(base))
return (EINVAL);
for (tmpva = base; tmpva < base + size; ) {
pte = pmap_pte(kernel_pmap, tmpva, &lvl);
if (pte == NULL)
return (EINVAL);
if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
/*
* We already have the correct attribute,
* ignore this entry.
*/
switch (lvl) {
default:
panic("Invalid DMAP table level: %d\n", lvl);
case 1:
tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
break;
case 2:
tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
break;
case 3:
tmpva += PAGE_SIZE;
break;
}
} else {
/*
* Split the entry into a level 3 table, then
* set the new attribute.
*/
switch (lvl) {
default:
panic("Invalid DMAP table level: %d\n", lvl);
case 1:
newpte = pmap_demote_l1(kernel_pmap, pte,
tmpva & ~L1_OFFSET);
if (newpte == NULL)
return (EINVAL);
pte = pmap_l1_to_l2(pte, tmpva);
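/* FALLTHROUGH */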
case 2:
newpte = pmap_demote_l2(kernel_pmap, pte,
tmpva & ~L2_OFFSET);
if (newpte == NULL)
return (EINVAL);
pte = pmap_l2_to_l3(pte, tmpva);
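/* FALLTHROUGH */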
case 3:
/* Update the entry */
l3 = pmap_load(pte);
l3 &= ~ATTR_IDX_MASK;
l3 |= ATTR_IDX(mode);
if (mode == DEVICE_MEMORY)
l3 |= ATTR_XN;
pmap_update_entry(kernel_pmap, pte, l3, tmpva,
PAGE_SIZE);
/*
* If moving to a non-cacheable entry flush
* the cache.
*/
if (mode == VM_MEMATTR_UNCACHEABLE)
cpu_dcache_wbinv_range(tmpva, L3_SIZE);
break;
}
tmpva += PAGE_SIZE;
}
}
return (0);
}
/*
* Create an L2 table to map all addresses within an L1 mapping.
*/
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
pt_entry_t *l2, newl2, oldl1;
vm_offset_t tmpl1;
vm_paddr_t l2phys, phys;
vm_page_t ml2;
int i;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldl1 = pmap_load(l1);
KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_demote_l1: Demoting a non-block entry"));
KASSERT((va & L1_OFFSET) == 0,
("pmap_demote_l1: Invalid virtual address %#lx", va));
KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
("pmap_demote_l1: Level 1 table shouldn't be managed"));
tmpl1 = 0;
if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
tmpl1 = kva_alloc(PAGE_SIZE);
if (tmpl1 == 0)
return (NULL);
}
if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
" in pmap %p", va, pmap);
return (NULL);
}
l2phys = VM_PAGE_TO_PHYS(ml2);
l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
/* The physical address the range points at */
phys = oldl1 & ~ATTR_MASK;
/* The attributes from the old l1 table to be copied */
newl2 = oldl1 & ATTR_MASK;
/* Create the new entries */
for (i = 0; i < Ln_ENTRIES; i++) {
l2[i] = newl2 | phys;
phys += L2_SIZE;
}
KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
("Invalid l2 page (%lx != %lx)", l2[0],
(oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
if (tmpl1 != 0) {
pmap_kenter(tmpl1, PAGE_SIZE,
DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
}
pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
if (tmpl1 != 0) {
pmap_kremove(tmpl1);
kva_free(tmpl1, PAGE_SIZE);
}
return (l2);
}
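/*
 * Worked example (sizes assume the usual 4KB translation granule):
 * demoting a 1GB L1 block that maps physical address 0x40000000
 * fills the new table with Ln_ENTRIES (512) L2 block entries, the
 * i-th covering 0x40000000 + i * L2_SIZE (2MB), each carrying the
 * attribute bits copied from the old L1 entry; l2[0] is therefore
 * the old entry with only its descriptor type changed, which is
 * exactly what the KASSERT above verifies.
 */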
/*
* Create an L3 table to map all addresses within an L2 mapping.
*/
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
struct rwlock **lockp)
{
pt_entry_t *l3, newl3, oldl2;
vm_offset_t tmpl2;
vm_paddr_t l3phys, phys;
vm_page_t ml3;
int i;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
l3 = NULL;
oldl2 = pmap_load(l2);
KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_demote_l2: Demoting a non-block entry"));
KASSERT((va & L2_OFFSET) == 0,
("pmap_demote_l2: Invalid virtual address %#lx", va));
tmpl2 = 0;
if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
tmpl2 = kva_alloc(PAGE_SIZE);
if (tmpl2 == 0)
return (NULL);
}
if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
(VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (ml3 == NULL) {
CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
goto fail;
}
if (va < VM_MAXUSER_ADDRESS)
pmap_resident_count_inc(pmap, 1);
}
l3phys = VM_PAGE_TO_PHYS(ml3);
l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
/* The physical address the range points at */
phys = oldl2 & ~ATTR_MASK;
/* The attributes from the old l2 table to be copied */
newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE;
/*
* If the page table page is new, initialize it.
*/
if (ml3->wire_count == 1) {
for (i = 0; i < Ln_ENTRIES; i++) {
l3[i] = newl3 | phys;
phys += L3_SIZE;
}
}
KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE),
("Invalid l3 page (%lx != %lx)", l3[0],
(oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE));
/*
* Map the temporary page so we don't lose access to the l2 table.
*/
if (tmpl2 != 0) {
pmap_kenter(tmpl2, PAGE_SIZE,
DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
}
/*
* The spare PV entries must be reserved prior to demoting the
* mapping, that is, prior to changing the PDE. Otherwise, the state
* of the L2 and the PV lists will be inconsistent, which can result
* in reclaim_pv_chunk() attempting to remove a PV entry from the
* wrong PV list and pmap_pv_demote_l2() failing to find the expected
* PV entry for the 2MB page mapping that is being demoted.
*/
if ((oldl2 & ATTR_SW_MANAGED) != 0)
reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
/*
* Demote the PV entry.
*/
if ((oldl2 & ATTR_SW_MANAGED) != 0)
pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
atomic_add_long(&pmap_l2_demotions, 1);
CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
" in pmap %p %lx", va, pmap, l3[0]);
fail:
if (tmpl2 != 0) {
pmap_kremove(tmpl2);
kva_free(tmpl2, PAGE_SIZE);
}
return (l3);
}
static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
struct rwlock *lock;
pt_entry_t *l3;
lock = NULL;
l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
if (lock != NULL)
rw_wunlock(lock);
return (l3);
}
/*
* perform the pmap work for mincore
*/
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
pd_entry_t *l1p, l1;
pd_entry_t *l2p, l2;
pt_entry_t *l3p, l3;
vm_paddr_t pa;
bool managed;
int val;
PMAP_LOCK(pmap);
retry:
pa = 0;
val = 0;
managed = false;
l1p = pmap_l1(pmap, addr);
if (l1p == NULL) /* No l1 */
goto done;
l1 = pmap_load(l1p);
if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
goto done;
if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_SUPER | MINCORE_INCORE;
if (pmap_page_dirty(l1))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l1 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
goto done;
}
l2p = pmap_l1_to_l2(l1p, addr);
if (l2p == NULL) /* No l2 */
goto done;
l2 = pmap_load(l2p);
if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
goto done;
if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_SUPER | MINCORE_INCORE;
if (pmap_page_dirty(l2))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l2 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
goto done;
}
l3p = pmap_l2_to_l3(l2p, addr);
if (l3p == NULL) /* No l3 */
goto done;
l3 = pmap_load(l3p);
if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
goto done;
if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_INCORE;
if (pmap_page_dirty(l3))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l3 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
}
done:
if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
(MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
goto retry;
} else
PA_UNLOCK_COND(*locked_pa);
PMAP_UNLOCK(pmap);
return (val);
}
void
pmap_activate(struct thread *td)
{
pmap_t pmap;
critical_enter();
pmap = vmspace_pmap(td->td_proc->p_vmspace);
td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
__asm __volatile("msr ttbr0_el1, %0" : :
"r"(td->td_proc->p_md.md_l0addr));
pmap_invalidate_all(pmap);
critical_exit();
}
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
if (va >= VM_MIN_KERNEL_ADDRESS) {
cpu_icache_sync_range(va, sz);
} else {
u_int len, offset;
vm_paddr_t pa;
/* Find the length of data in this page to flush */
offset = va & PAGE_MASK;
len = imin(PAGE_SIZE - offset, sz);
while (sz != 0) {
/* Extract the physical address & find it in the DMAP */
pa = pmap_extract(pmap, va);
if (pa != 0)
cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
/* Move to the next page */
sz -= len;
va += len;
/* Set the length for the next iteration */
len = imin(PAGE_SIZE, sz);
}
}
}
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
#ifdef SMP
uint64_t par;
#endif
switch (ESR_ELx_EXCEPTION(esr)) {
case EXCP_DATA_ABORT_L:
case EXCP_DATA_ABORT:
break;
default:
return (KERN_FAILURE);
}
#ifdef SMP
PMAP_LOCK(pmap);
switch (esr & ISS_DATA_DFSC_MASK) {
case ISS_DATA_DFSC_TF_L0:
case ISS_DATA_DFSC_TF_L1:
case ISS_DATA_DFSC_TF_L2:
case ISS_DATA_DFSC_TF_L3:
/* Ask the MMU to check the address */
if (pmap == kernel_pmap)
par = arm64_address_translate_s1e1r(far);
else
par = arm64_address_translate_s1e0r(far);
/*
* If the translation succeeds, the fault was transient: the entry
* was only momentarily invalid while another CPU performed a
* break-before-make sequence. We can unlock and return success
* to the trap handler.
*/
if (PAR_SUCCESS(par)) {
PMAP_UNLOCK(pmap);
return (KERN_SUCCESS);
}
break;
default:
break;
}
PMAP_UNLOCK(pmap);
#endif
return (KERN_FAILURE);
}
/*
* Increase the starting virtual address of the given mapping if a
* different alignment might result in more superpage mappings.
*/
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, vm_size_t size)
{
vm_offset_t superpage_offset;
if (size < L2_SIZE)
return;
if (object != NULL && (object->flags & OBJ_COLORED) != 0)
offset += ptoa(object->pg_color);
superpage_offset = offset & L2_OFFSET;
if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
(*addr & L2_OFFSET) == superpage_offset)
return;
if ((*addr & L2_OFFSET) < superpage_offset)
*addr = (*addr & ~L2_OFFSET) + superpage_offset;
else
*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}
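/*
 * Worked example (assuming the usual 2MB L2_SIZE): for a 4MB mapping
 * of an object whose relevant offset is 0x123000, superpage_offset is
 * 0x123000, so a requested *addr of 0x20000000 is advanced to
 * 0x20123000. Virtual and physical addresses then agree modulo 2MB,
 * letting any fully populated, 2MB-aligned run be promoted to an L2
 * block mapping later.
 */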
/**
* Get the kernel virtual address of a set of physical pages. If there are
* physical addresses not covered by the DMAP perform a transient mapping
* that will be removed when calling pmap_unmap_io_transient.
*
* \param page The pages for which the caller wishes to obtain
* kernel virtual addresses.
* \param vaddr On return contains the kernel virtual memory address
* of the pages passed in the page parameter.
* \param count Number of pages passed in.
* \param can_fault TRUE if the thread using the mapped pages can take
* page faults, FALSE otherwise.
*
* \returns TRUE if the caller must call pmap_unmap_io_transient when
* finished or FALSE otherwise.
*
*/
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
boolean_t needs_mapping;
int error, i;
/*
* Allocate any KVA space that we need, this is done in a separate
* loop to prevent calling vmem_alloc while pinned.
*/
needs_mapping = FALSE;
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (__predict_false(!PHYS_IN_DMAP(paddr))) {
error = vmem_alloc(kernel_arena, PAGE_SIZE,
M_BESTFIT | M_WAITOK, &vaddr[i]);
KASSERT(error == 0, ("vmem_alloc failed: %d", error));
needs_mapping = TRUE;
} else {
vaddr[i] = PHYS_TO_DMAP(paddr);
}
}
/* Exit early if everything is covered by the DMAP */
if (!needs_mapping)
return (FALSE);
if (!can_fault)
sched_pin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (!PHYS_IN_DMAP(paddr)) {
panic(
"pmap_map_io_transient: TODO: Map out of DMAP data");
}
}
return (needs_mapping);
}
void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
int i;
if (!can_fault)
sched_unpin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (!PHYS_IN_DMAP(paddr)) {
panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
}
}
}
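/*
 * Hedged usage sketch (disabled; the helper name and destination
 * buffer are invented for illustration only): a caller that may be
 * handed pages outside the DMAP brackets its access with the pair
 * of functions above.
 */
#if 0
static void
example_copy_one_page(vm_page_t m, void *buf)
{
vm_offset_t va;
boolean_t mapped;

mapped = pmap_map_io_transient(&m, &va, 1, FALSE);
bcopy((void *)va, buf, PAGE_SIZE);
if (mapped)
pmap_unmap_io_transient(&m, &va, 1, FALSE);
}
#endif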
Index: head/sys/cam/ata/ata_da.c
===================================================================
--- head/sys/cam/ata/ata_da.c (revision 327172)
+++ head/sys/cam/ata/ata_da.c (revision 327173)
@@ -1,3587 +1,3584 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ada.h"
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/endian.h>
#include <sys/cons.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/ata/ata_all.h>
#include <machine/md_var.h> /* geometry translation */
#ifdef _KERNEL
#define ATA_MAX_28BIT_LBA 268435455UL
extern int iosched_debug;
typedef enum {
ADA_STATE_RAHEAD,
ADA_STATE_WCACHE,
ADA_STATE_LOGDIR,
ADA_STATE_IDDIR,
ADA_STATE_SUP_CAP,
ADA_STATE_ZONE,
ADA_STATE_NORMAL
} ada_state;
typedef enum {
ADA_FLAG_CAN_48BIT = 0x00000002,
ADA_FLAG_CAN_FLUSHCACHE = 0x00000004,
ADA_FLAG_CAN_NCQ = 0x00000008,
ADA_FLAG_CAN_DMA = 0x00000010,
ADA_FLAG_NEED_OTAG = 0x00000020,
ADA_FLAG_WAS_OTAG = 0x00000040,
ADA_FLAG_CAN_TRIM = 0x00000080,
ADA_FLAG_OPEN = 0x00000100,
ADA_FLAG_SCTX_INIT = 0x00000200,
ADA_FLAG_CAN_CFA = 0x00000400,
ADA_FLAG_CAN_POWERMGT = 0x00000800,
ADA_FLAG_CAN_DMA48 = 0x00001000,
ADA_FLAG_CAN_LOG = 0x00002000,
ADA_FLAG_CAN_IDLOG = 0x00004000,
ADA_FLAG_CAN_SUPCAP = 0x00008000,
ADA_FLAG_CAN_ZONE = 0x00010000,
ADA_FLAG_CAN_WCACHE = 0x00020000,
ADA_FLAG_CAN_RAHEAD = 0x00040000,
ADA_FLAG_PROBED = 0x00080000,
ADA_FLAG_ANNOUNCED = 0x00100000,
ADA_FLAG_DIRTY = 0x00200000,
ADA_FLAG_CAN_NCQ_TRIM = 0x00400000, /* CAN_TRIM also set */
ADA_FLAG_PIM_ATA_EXT = 0x00800000
} ada_flags;
typedef enum {
ADA_Q_NONE = 0x00,
ADA_Q_4K = 0x01,
ADA_Q_NCQ_TRIM_BROKEN = 0x02,
ADA_Q_LOG_BROKEN = 0x04,
ADA_Q_SMR_DM = 0x08
} ada_quirks;
#define ADA_Q_BIT_STRING \
"\020" \
"\0014K" \
"\002NCQ_TRIM_BROKEN" \
"\003LOG_BROKEN" \
"\004SMR_DM"
typedef enum {
ADA_CCB_RAHEAD = 0x01,
ADA_CCB_WCACHE = 0x02,
ADA_CCB_BUFFER_IO = 0x03,
ADA_CCB_DUMP = 0x05,
ADA_CCB_TRIM = 0x06,
ADA_CCB_LOGDIR = 0x07,
ADA_CCB_IDDIR = 0x08,
ADA_CCB_SUP_CAP = 0x09,
ADA_CCB_ZONE = 0x0a,
ADA_CCB_TYPE_MASK = 0x0F,
} ada_ccb_state;
typedef enum {
ADA_ZONE_NONE = 0x00,
ADA_ZONE_DRIVE_MANAGED = 0x01,
ADA_ZONE_HOST_AWARE = 0x02,
ADA_ZONE_HOST_MANAGED = 0x03
} ada_zone_mode;
typedef enum {
ADA_ZONE_FLAG_RZ_SUP = 0x0001,
ADA_ZONE_FLAG_OPEN_SUP = 0x0002,
ADA_ZONE_FLAG_CLOSE_SUP = 0x0004,
ADA_ZONE_FLAG_FINISH_SUP = 0x0008,
ADA_ZONE_FLAG_RWP_SUP = 0x0010,
ADA_ZONE_FLAG_SUP_MASK = (ADA_ZONE_FLAG_RZ_SUP |
ADA_ZONE_FLAG_OPEN_SUP |
ADA_ZONE_FLAG_CLOSE_SUP |
ADA_ZONE_FLAG_FINISH_SUP |
ADA_ZONE_FLAG_RWP_SUP),
ADA_ZONE_FLAG_URSWRZ = 0x0020,
ADA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
ADA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
ADA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
ADA_ZONE_FLAG_SET_MASK = (ADA_ZONE_FLAG_OPT_SEQ_SET |
ADA_ZONE_FLAG_OPT_NONSEQ_SET |
ADA_ZONE_FLAG_MAX_SEQ_SET)
} ada_zone_flags;
static struct ada_zone_desc {
ada_zone_flags value;
const char *desc;
} ada_zone_desc_table[] = {
{ADA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{ADA_ZONE_FLAG_OPEN_SUP, "Open" },
{ADA_ZONE_FLAG_CLOSE_SUP, "Close" },
{ADA_ZONE_FLAG_FINISH_SUP, "Finish" },
{ADA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
typedef enum {
ADA_DELETE_NONE,
ADA_DELETE_DISABLE,
ADA_DELETE_CFA_ERASE,
ADA_DELETE_DSM_TRIM,
ADA_DELETE_NCQ_DSM_TRIM,
ADA_DELETE_MIN = ADA_DELETE_CFA_ERASE,
ADA_DELETE_MAX = ADA_DELETE_NCQ_DSM_TRIM,
} ada_delete_methods;
static const char *ada_delete_method_names[] =
{ "NONE", "DISABLE", "CFA_ERASE", "DSM_TRIM", "NCQ_DSM_TRIM" };
#if 0
static const char *ada_delete_method_desc[] =
{ "NONE", "DISABLED", "CFA Erase", "DSM Trim", "DSM Trim via NCQ" };
#endif
struct disk_params {
u_int8_t heads;
u_int8_t secs_per_track;
u_int32_t cylinders;
u_int32_t secsize; /* Number of bytes/logical sector */
u_int64_t sectors; /* Total number of sectors */
};
#define TRIM_MAX_BLOCKS 8
#define TRIM_MAX_RANGES (TRIM_MAX_BLOCKS * ATA_DSM_BLK_RANGES)
struct trim_request {
uint8_t data[TRIM_MAX_RANGES * ATA_DSM_RANGE_SIZE];
TAILQ_HEAD(, bio) bps;
};
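/*
 * Sizing note (the ATA_DSM_* values are assumed from the ATA DSM
 * definitions rather than restated here): with ATA_DSM_BLK_RANGES == 64
 * and ATA_DSM_RANGE_SIZE == 8, a trim_request carries up to
 * TRIM_MAX_BLOCKS * 64 == 512 LBA ranges in a 4096-byte payload, and
 * each range can describe at most 65535 sectors.
 */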
struct ada_softc {
struct cam_iosched_softc *cam_iosched;
int outstanding_cmds; /* Number of active commands */
int refcount; /* Active xpt_action() calls */
ada_state state;
ada_flags flags;
ada_zone_mode zone_mode;
ada_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
ada_quirks quirks;
ada_delete_methods delete_method;
int trim_max_ranges;
int read_ahead;
int write_cache;
int unmappedio;
int rotating;
#ifdef ADA_TEST_FAILURE
int force_read_error;
int force_write_error;
int periodic_read_error;
int periodic_read_count;
#endif
struct disk_params params;
struct disk *disk;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
struct trim_request trim_req;
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int timeouts;
u_int errors;
u_int invalidations;
#endif
#define ADA_ANNOUNCETMP_SZ 80
char announce_temp[ADA_ANNOUNCETMP_SZ];
#define ADA_ANNOUNCE_SZ 400
char announce_buffer[ADA_ANNOUNCE_SZ];
};
struct ada_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
ada_quirks quirks;
};
static struct ada_quirk_entry ada_quirk_table[] =
{
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Hitachi H??????????E3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD155UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD204UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DL*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750420AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750422AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???LT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green/Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????EX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PVT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PVT*", "*" },
/*quirks*/ADA_Q_4K
},
/* SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair CSSD-F*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force 3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force G*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "M4-CT???M4SSD2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M500 SSDs MU07 firmware
* NCQ Trim works
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "MU07" },
/*quirks*/0
},
{
/*
* Crucial M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial M550 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial MX100 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/ADA_Q_4K
},
{
/*
* FCCT M500 SSDs
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FCCT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2CW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2CT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2MH*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2M*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SE100S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SH103S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MARVELL SD88SA02*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Micron M500 SSDs firmware MU07
* NCQ Trim works?
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "MU07" },
/*quirks*/0
},
{
/*
* Micron M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron M5[15]0 SSDs
* NCQ Trim doesn't work, but only MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron 5100 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "DENRSTE251M45*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ?VERTEX2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX4*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 750 SSDs
* 4k optimised, NCQ TRIM seems to work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 750*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 840 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 845 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 845*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 850 SSDs
* 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised, NCQ believed to be working
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7KM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* 4k optimised, NCQ believed to be broken since these appear
* to be built with the same controllers as the 840/850.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Same as for SAMSUNG MZ7*, but also enable the quirks for
* SSDs whose model names start with MZ7*.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung PM851 Series SSDs Dell OEM
* device model "SAMSUNG SSD PM851 mSATA 256GB"
* 4k optimised, NCQ broken
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD PM851*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FTM??CT25H*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SG9XCS2D*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD200HJ KF100-06
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD200*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD501LJ CR100-10
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD501*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST8000AS000[23]*", "*" },
/*quirks*/ADA_Q_SMR_DM
},
{
/* Default */
{
T_ANY, SIP_MEDIA_REMOVABLE|SIP_MEDIA_FIXED,
/*vendor*/"*", /*product*/"*", /*revision*/"*"
},
/*quirks*/0
},
};
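/*
 * Quirk entries are matched against the ATA identify data in adaregister()
 * via cam_quirkmatch()/ata_identify_match(); the first matching entry is
 * used, which is why the catch-all default entry must stay last.  The
 * selected quirk mask can also be overridden per unit with the
 * kern.cam.ada.%d.quirks tunable fetched in adaregister().
 */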
static disk_strategy_t adastrategy;
static dumper_t adadump;
static periph_init_t adainit;
static void adadiskgonecb(struct disk *dp);
static periph_oninv_t adaoninvalidate;
static periph_dtor_t adacleanup;
static void adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static int adazonemodesysctl(SYSCTL_HANDLER_ARGS);
static int adazonesupsysctl(SYSCTL_HANDLER_ARGS);
static void adasysctlinit(void *context, int pending);
static int adagetattr(struct bio *bp);
static void adasetflags(struct ada_softc *softc,
struct ccb_getdev *cgd);
static periph_ctor_t adaregister;
static void ada_dsmtrim(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static void ada_cfaerase(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static int ada_zone_bio_to_ata(int disk_zone_cmd);
static int ada_zone_cmd(struct cam_periph *periph, union ccb *ccb,
struct bio *bp, int *queue_ccb);
static periph_start_t adastart;
static void adaprobedone(struct cam_periph *periph, union ccb *ccb);
static void adazonedone(struct cam_periph *periph, union ccb *ccb);
static void adadone(struct cam_periph *periph,
union ccb *done_ccb);
static int adaerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void adagetparams(struct cam_periph *periph,
struct ccb_getdev *cgd);
static timeout_t adasendorderedtag;
static void adashutdown(void *arg, int howto);
static void adasuspend(void *arg);
static void adaresume(void *arg);
#ifndef ADA_DEFAULT_TIMEOUT
#define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */
#endif
#ifndef ADA_DEFAULT_RETRY
#define ADA_DEFAULT_RETRY 4
#endif
#ifndef ADA_DEFAULT_SEND_ORDERED
#define ADA_DEFAULT_SEND_ORDERED 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SHUTDOWN
#define ADA_DEFAULT_SPINDOWN_SHUTDOWN 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SUSPEND
#define ADA_DEFAULT_SPINDOWN_SUSPEND 1
#endif
#ifndef ADA_DEFAULT_READ_AHEAD
#define ADA_DEFAULT_READ_AHEAD 1
#endif
#ifndef ADA_DEFAULT_WRITE_CACHE
#define ADA_DEFAULT_WRITE_CACHE 1
#endif
#define ADA_RA (softc->read_ahead >= 0 ? \
softc->read_ahead : ada_read_ahead)
#define ADA_WC (softc->write_cache >= 0 ? \
softc->write_cache : ada_write_cache)
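/*
 * softc->read_ahead and softc->write_cache start out at -1 ("not set"), so
 * ADA_RA/ADA_WC fall back to the global ada_read_ahead/ada_write_cache
 * defaults unless the per-unit tunable or sysctl has been set to a
 * non-negative value.
 */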
/*
* Most platforms map firmware geometry to actual, but some don't. If
* not overridden, default to nothing.
*/
#ifndef ata_disk_firmware_geom_adjust
#define ata_disk_firmware_geom_adjust(disk)
#endif
static int ada_retry_count = ADA_DEFAULT_RETRY;
static int ada_default_timeout = ADA_DEFAULT_TIMEOUT;
static int ada_send_ordered = ADA_DEFAULT_SEND_ORDERED;
static int ada_spindown_shutdown = ADA_DEFAULT_SPINDOWN_SHUTDOWN;
static int ada_spindown_suspend = ADA_DEFAULT_SPINDOWN_SUSPEND;
static int ada_read_ahead = ADA_DEFAULT_READ_AHEAD;
static int ada_write_cache = ADA_DEFAULT_WRITE_CACHE;
static SYSCTL_NODE(_kern_cam, OID_AUTO, ada, CTLFLAG_RD, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&ada_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&ada_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&ada_send_ordered, 0, "Send Ordered Tags");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_shutdown, CTLFLAG_RWTUN,
&ada_spindown_shutdown, 0, "Spin down upon shutdown");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_suspend, CTLFLAG_RWTUN,
&ada_spindown_suspend, 0, "Spin down upon suspend");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, read_ahead, CTLFLAG_RWTUN,
&ada_read_ahead, 0, "Enable disk read-ahead");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, write_cache, CTLFLAG_RWTUN,
&ada_write_cache, 0, "Enable disk write cache");
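/*
 * The knobs above are CTLFLAG_RWTUN, so they can be set both as loader
 * tunables and at runtime, for example:
 *   sysctl kern.cam.ada.write_cache=0
 * Per-unit tunables such as kern.cam.ada.%d.read_ahead and
 * kern.cam.ada.%d.write_cache are fetched in adaregister() below.
 */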
/*
* ADA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* still passes the "don't send an ordered tag" test, so it can take
* us two intervals to determine that a tag must be sent.
*/
#ifndef ADA_ORDEREDTAG_INTERVAL
#define ADA_ORDEREDTAG_INTERVAL 4
#endif
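/*
 * Worked example: with the defaults above (ada_default_timeout = 30 seconds
 * and ADA_ORDEREDTAG_INTERVAL = 4), adaregister() arms the sendordered
 * callout every (ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL ticks,
 * i.e. one check roughly every 7.5 seconds.
 */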
static struct periph_driver adadriver =
{
adainit, "ada",
TAILQ_HEAD_INITIALIZER(adadriver.units), /* generation */ 0
};
static int adadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
PERIPHDRIVER_DECLARE(ada, adadriver);
static MALLOC_DEFINE(M_ATADA, "ata_da", "ata_da buffers");
static int
adaopen(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
return(ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaopen\n"));
softc = (struct ada_softc *)periph->softc;
softc->flags |= ADA_FLAG_OPEN;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
adaclose(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaclose\n"));
/* We only sync the cache if the drive is capable of it. */
if ((softc->flags & ADA_FLAG_DIRTY) != 0 &&
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) != 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0 &&
cam_periph_hold(periph, PRIBIO) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/0, softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
softc->flags &= ~ADA_FLAG_DIRTY;
xpt_release_ccb(ccb);
cam_periph_unhold(periph);
}
softc->flags &= ~ADA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "adaclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
adaschedule(struct cam_periph *periph)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
if (softc->state != ADA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
adastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastrategy(%p)\n", bp));
/*
* If the device has been made invalid, error out
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
adaschedule(periph);
cam_periph_unlock(periph);
return;
}
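/*
 * Kernel crash-dump hook (disk d_dump).  Writes "length" bytes of dump data
 * at byte offset "offset" with WRITE DMA (48-bit when the LBA or sector
 * count requires it); a zero-length call, as issued at the end of a dump,
 * flushes the drive's write cache when the drive supports it.
 */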
static int
adadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct ada_softc *softc;
u_int secsize;
struct ccb_ataio ataio;
struct disk *dp;
uint64_t lba;
uint16_t count;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
secsize = softc->params.secsize;
lba = offset / secsize;
count = length / secsize;
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
return (ENXIO);
}
memset(&ataio, 0, sizeof(ataio));
if (length > 0) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
adadone,
CAM_DIR_OUT,
0,
(u_int8_t *) virtual,
length,
ada_default_timeout*1000);
if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count >= 256)) {
ata_48bit_cmd(&ataio, ATA_WRITE_DMA48,
0, lba, count);
} else {
ata_28bit_cmd(&ataio, ATA_WRITE_DMA,
0, lba, count);
}
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
cam_periph_unlock(periph);
return (error);
}
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
/*
* Tell the drive to flush its internal cache. If we
* can't flush in 5s we have big problems. No need to
* wait the full default timeout to detect problems.
*/
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
5*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
cam_periph_unlock(periph);
return (error);
}
static void
adainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, adaasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("ada: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (ada_send_ordered) {
/* Register our event handlers */
if ((EVENTHANDLER_REGISTER(power_suspend, adasuspend,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(power_resume, adaresume,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, adashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("adainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
adadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
adaoninvalidate(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, adaasync, periph, periph->path);
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
disk_gone(softc->disk);
}
static void
adacleanup(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
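/*
 * Pick the BIO_DELETE implementation in order of preference: NCQ DSM TRIM,
 * then plain DSM TRIM, then CFA ERASE (only for CFA devices without 48-bit
 * support), otherwise no delete support.
 */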
static void
adasetdeletemethod(struct ada_softc *softc)
{
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM;
else if (softc->flags & ADA_FLAG_CAN_TRIM)
softc->delete_method = ADA_DELETE_DSM_TRIM;
else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))
softc->delete_method = ADA_DELETE_CFA_ERASE;
else
softc->delete_method = ADA_DELETE_NONE;
}
static void
adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct ccb_getdev cgd;
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_ATA)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(adaregister, adaoninvalidate,
adacleanup, adastart,
"ada", CAM_PERIPH_BIO,
path, adaasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("adaasync: Unable to attach to new device "
"due to status 0x%x\n", status);
break;
}
case AC_GETDEV_CHANGED:
{
softc = (struct ada_softc *)periph->softc;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
/*
* Set/clear support flags based on the new Identify data.
*/
adasetflags(softc, &cgd);
cam_periph_async(periph, code, path, arg);
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct ada_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_SENT_BDR:
case AC_BUS_RESET:
{
softc = (struct ada_softc *)periph->softc;
cam_periph_async(periph, code, path, arg);
if (softc->state != ADA_STATE_NORMAL)
break;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD)
softc->state = ADA_STATE_RAHEAD;
else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE)
softc->state = ADA_STATE_WCACHE;
else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE))
softc->state = ADA_STATE_LOGDIR;
else
break;
if (cam_periph_acquire(periph) != CAM_REQ_CMP)
softc->state = ADA_STATE_NORMAL;
else
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static int
adazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct ada_softc *softc;
int error;
softc = (struct ada_softc *)arg1;
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case ADA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case ADA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case ADA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
adazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct ada_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct ada_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(ada_zone_desc_table) /
sizeof(ada_zone_desc_table[0]); i++) {
if (softc->zone_flags & ada_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, ada_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
static void
adasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct ada_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
/* periph was held for us when this task was enqueued */
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_release(periph);
return;
}
softc = (struct ada_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM ADA unit %d",periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= ADA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_ada), OID_AUTO, tmpstr2,
CTLFLAG_RD, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("adasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RW,
softc, 0, adadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "read_ahead", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->read_ahead, 0, "Enable disk read ahead.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->write_cache, 0, "Enable disk write cache.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->unmappedio, 0, "Unmapped I/O leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->rotating, 0, "Rotating media");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, adazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, adazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
#ifdef ADA_TEST_FAILURE
/*
* Add a 'door bell' sysctl which allows one to set it from userland
* and cause something bad to happen. For the moment, we only allow
* whacking the next read or write.
*/
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_read_error, 0,
"Force a read error for the next N reads.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_write_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_write_error, 0,
"Force a write error for the next N writes.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "periodic_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->periodic_read_error, 0,
"Force a read error every N reads (don't set too low).");
#endif
#ifdef CAM_IO_STATS
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->timeouts, 0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->errors, 0,
"Transport errors reported by the SIM.");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->invalidations, 0,
"Device pack invalidations.");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
adagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
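/*
 * sysctl handler for kern.cam.ada.%d.delete_method: reports the current
 * BIO_DELETE method by name and only accepts methods the drive actually
 * supports (plus the disable method).
 */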
static int
adadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct ada_softc *softc;
int i, error, value, methods;
softc = (struct ada_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > ADA_DELETE_MAX)
p = "UNKNOWN";
else
p = ada_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
methods = 1 << ADA_DELETE_DISABLE;
if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT))
methods |= 1 << ADA_DELETE_CFA_ERASE;
if (softc->flags & ADA_FLAG_CAN_TRIM)
methods |= 1 << ADA_DELETE_DSM_TRIM;
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
methods |= 1 << ADA_DELETE_NCQ_DSM_TRIM;
for (i = 0; i <= ADA_DELETE_MAX; i++) {
if (!(methods & (1 << i)) ||
strcmp(buf, ada_delete_method_names[i]) != 0)
continue;
softc->delete_method = i;
return (0);
}
return (EINVAL);
}
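/*
 * Derive the ADA_FLAG_* capability flags and the zone mode from the ATA
 * identify data (and the quirks), then re-pick the delete method.  Called
 * at registration time and again on AC_GETDEV_CHANGED.
 */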
static void
adasetflags(struct ada_softc *softc, struct ccb_getdev *cgd)
{
if ((cgd->ident_data.capabilities1 & ATA_SUPPORT_DMA) &&
(cgd->inq_flags & SID_DMA))
softc->flags |= ADA_FLAG_CAN_DMA;
else
softc->flags &= ~ADA_FLAG_CAN_DMA;
if (cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) {
softc->flags |= ADA_FLAG_CAN_48BIT;
if (cgd->inq_flags & SID_DMA48)
softc->flags |= ADA_FLAG_CAN_DMA48;
else
softc->flags &= ~ADA_FLAG_CAN_DMA48;
} else
softc->flags &= ~(ADA_FLAG_CAN_48BIT | ADA_FLAG_CAN_DMA48);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_FLUSHCACHE)
softc->flags |= ADA_FLAG_CAN_FLUSHCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_FLUSHCACHE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_POWERMGT)
softc->flags |= ADA_FLAG_CAN_POWERMGT;
else
softc->flags &= ~ADA_FLAG_CAN_POWERMGT;
if ((cgd->ident_data.satacapabilities & ATA_SUPPORT_NCQ) &&
(cgd->inq_flags & SID_DMA) && (cgd->inq_flags & SID_CmdQue))
softc->flags |= ADA_FLAG_CAN_NCQ;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ;
if ((cgd->ident_data.support_dsm & ATA_SUPPORT_DSM_TRIM) &&
(cgd->inq_flags & SID_DMA)) {
softc->flags |= ADA_FLAG_CAN_TRIM;
softc->trim_max_ranges = TRIM_MAX_RANGES;
if (cgd->ident_data.max_dsm_blocks != 0) {
softc->trim_max_ranges =
min(cgd->ident_data.max_dsm_blocks *
ATA_DSM_BLK_RANGES, softc->trim_max_ranges);
}
/*
* If we can do RCVSND_FPDMA_QUEUED commands, we may be able
* to do NCQ trims, if we support trims at all. We also need
* support from the SIM to do things properly. Perhaps we
* should also check whether log 13 dword 0 bit 0 and dword 1
* bit 0 are set...
*/
if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
(softc->flags & ADA_FLAG_PIM_ATA_EXT) != 0 &&
(cgd->ident_data.satacapabilities2 &
ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
(softc->flags & ADA_FLAG_CAN_TRIM) != 0)
softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
} else
softc->flags &= ~(ADA_FLAG_CAN_TRIM | ADA_FLAG_CAN_NCQ_TRIM);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_CFA)
softc->flags |= ADA_FLAG_CAN_CFA;
else
softc->flags &= ~ADA_FLAG_CAN_CFA;
/*
* Now that we've set the appropriate flags, setup the delete
* method.
*/
adasetdeletemethod(softc);
if ((cgd->ident_data.support.extension & ATA_SUPPORT_GENLOG)
&& ((softc->quirks & ADA_Q_LOG_BROKEN) == 0))
softc->flags |= ADA_FLAG_CAN_LOG;
else
softc->flags &= ~ADA_FLAG_CAN_LOG;
if ((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode = ADA_ZONE_HOST_AWARE;
else if (((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
|| (softc->quirks & ADA_Q_SMR_DM))
softc->zone_mode = ADA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = ADA_ZONE_NONE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_LOOKAHEAD)
softc->flags |= ADA_FLAG_CAN_RAHEAD;
else
softc->flags &= ~ADA_FLAG_CAN_RAHEAD;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_WRITECACHE)
softc->flags |= ADA_FLAG_CAN_WCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_WCACHE;
}
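/*
 * Periph constructor: allocate the softc and I/O scheduler, apply quirks
 * and per-unit tunables, size and create the disk(9) provider from the
 * identify data, and kick off the optional RAHEAD/WCACHE/LOGDIR probe
 * states.
 */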
static cam_status
adaregister(struct cam_periph *periph, void *arg)
{
struct ada_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
struct disk_params *dp;
struct sbuf sb;
char *announce_buf;
caddr_t match;
u_int maxio;
int quirks;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("adaregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct ada_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
announce_buf = softc->announce_temp;
bzero(announce_buf, ADA_ANNOUNCETMP_SZ);
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->ident_data,
(caddr_t)ada_quirk_table,
nitems(ada_quirk_table),
sizeof(*ada_quirk_table), ata_identify_match);
if (match != NULL)
softc->quirks = ((struct ada_quirk_entry *)match)->quirks;
else
softc->quirks = ADA_Q_NONE;
xpt_path_inq(&cpi, periph->path);
TASK_INIT(&softc->sysctl_task, 0, adasysctlinit, periph);
/*
* Register this media as a disk
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.quirks", periph->unit_number);
quirks = softc->quirks;
TUNABLE_INT_FETCH(announce_buf, &quirks);
softc->quirks = quirks;
softc->read_ahead = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.read_ahead", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->read_ahead);
softc->write_cache = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.write_cache", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
/*
* Set support flags based on the Identify data and quirks.
*/
adasetflags(softc, cgd);
/* Disable queue sorting for non-rotational media by default. */
if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) {
softc->rotating = 0;
} else {
softc->rotating = 1;
}
cam_iosched_set_sort_queue(softc->cam_iosched, softc->rotating ? -1 : 0);
adagetparams(periph, cgd);
softc->disk = disk_alloc();
softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, softc->params.secsize,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = adaopen;
softc->disk->d_close = adaclose;
softc->disk->d_strategy = adastrategy;
softc->disk->d_getattr = adagetattr;
softc->disk->d_dump = adadump;
softc->disk->d_gone = adadiskgonecb;
softc->disk->d_name = "ada";
softc->disk->d_drv1 = periph;
maxio = cpi.maxio; /* Honor max I/O size of SIM */
if (maxio == 0)
maxio = DFLTPHYS; /* traditional default */
else if (maxio > MAXPHYS)
maxio = MAXPHYS; /* for safety */
if (softc->flags & ADA_FLAG_CAN_48BIT)
maxio = min(maxio, 65536 * softc->params.secsize);
else /* 28bit ATA command limit */
maxio = min(maxio, 256 * softc->params.secsize);
softc->disk->d_maxsize = maxio;
softc->disk->d_unit = periph->unit_number;
softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE)
softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if (softc->flags & ADA_FLAG_CAN_TRIM) {
softc->disk->d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = softc->params.secsize *
ATA_DSM_RANGE_MAX *
softc->trim_max_ranges;
} else if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT)) {
softc->disk->d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = 256 * softc->params.secsize;
} else
softc->disk->d_delmaxsize = maxio;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
softc->unmappedio = 1;
}
if (cpi.hba_misc & PIM_ATA_EXT)
softc->flags |= ADA_FLAG_PIM_ATA_EXT;
strlcpy(softc->disk->d_descr, cgd->ident_data.model,
MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model)));
strlcpy(softc->disk->d_ident, cgd->ident_data.serial,
MIN(sizeof(softc->disk->d_ident), sizeof(cgd->ident_data.serial)));
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = (off_t)softc->params.sectors *
softc->params.secsize;
if (ata_physical_sector_size(&cgd->ident_data) !=
softc->params.secsize) {
softc->disk->d_stripesize =
ata_physical_sector_size(&cgd->ident_data);
softc->disk->d_stripeoffset = (softc->disk->d_stripesize -
ata_logical_sector_offset(&cgd->ident_data)) %
softc->disk->d_stripesize;
} else if (softc->quirks & ADA_Q_4K) {
softc->disk->d_stripesize = 4096;
softc->disk->d_stripeoffset = 0;
}
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
ata_disk_firmware_geom_adjust(softc->disk);
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* adadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
dp = &softc->params;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
sbuf_new(&sb, softc->announce_buffer, ADA_ANNOUNCE_SZ, SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, ADA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
if (cam_periph_acquire(periph) == CAM_REQ_CMP)
taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
/*
* Add async callbacks for bus reset and
* bus device reset calls. I don't bother
* checking if this fails as, in most cases,
* the system will function just fine without
* them and the only alternative would be to
* not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_GETDEV_CHANGED | AC_ADVINFO_CHANGED,
adaasync, periph, periph->path);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD) {
softc->state = ADA_STATE_RAHEAD;
} else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE) {
softc->state = ADA_STATE_WCACHE;
} else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
softc->state = ADA_STATE_LOGDIR;
} else {
/*
* Nothing to probe, so we can just transition to the
* normal state.
*/
adaprobedone(periph, NULL);
return(CAM_REQ_CMP);
}
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
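/*
 * Build a DATA SET MANAGEMENT (TRIM) payload from the current BIO_DELETE
 * and any further delete requests the I/O scheduler hands us, as long as
 * they fit within trim_max_ranges.  Each 8-byte range entry is laid out as
 * bytes 0-5: 48-bit starting LBA (little endian), bytes 6-7: sector count
 * (at most ATA_DSM_RANGE_MAX); a request that starts exactly where the
 * previous one ended extends the previous range when possible.  Returns
 * the number of ranges filled in.
 */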
static int
ada_dsmtrim_req_create(struct ada_softc *softc, struct bio *bp, struct trim_request *req)
{
uint64_t lastlba = (uint64_t)-1;
int c, lastcount = 0, off, ranges = 0;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
do {
uint64_t lba = bp->bio_pblkno;
int count = bp->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = min(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * ATA_DSM_RANGE_SIZE;
req->data[off + 6] = lastcount & 0xff;
req->data[off + 7] =
(lastcount >> 8) & 0xff;
count -= c;
lba += c;
}
while (count > 0) {
c = min(count, ATA_DSM_RANGE_MAX);
off = ranges * ATA_DSM_RANGE_SIZE;
req->data[off + 0] = lba & 0xff;
req->data[off + 1] = (lba >> 8) & 0xff;
req->data[off + 2] = (lba >> 16) & 0xff;
req->data[off + 3] = (lba >> 24) & 0xff;
req->data[off + 4] = (lba >> 32) & 0xff;
req->data[off + 5] = (lba >> 40) & 0xff;
req->data[off + 6] = c & 0xff;
req->data[off + 7] = (c >> 8) & 0xff;
lba += c;
count -= c;
lastcount = c;
ranges++;
/*
* It's the caller's responsibility to ensure the
* request will fit, so we don't need to check for
* overrun here.
*/
}
lastlba = lba;
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
bp = cam_iosched_next_trim(softc->cam_iosched);
if (bp == NULL)
break;
if (bp->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp);
break;
}
} while (1);
return (ranges);
}
static void
ada_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_48bit_cmd(ataio, ATA_DATA_SET_MANAGEMENT,
ATA_DSM_TRIM, 0, howmany(ranges, ATA_DSM_BLK_RANGES));
}
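/*
 * Same as ada_dsmtrim(), but queue the payload as an NCQ command
 * (SEND FPDMA QUEUED with the DSM subcommand) so the trim can be
 * dispatched alongside other queued I/O.
 */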
static void
ada_ncq_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_ncq_cmd(ataio,
ATA_SEND_FPDMA_QUEUED,
0,
howmany(ranges, ATA_DSM_BLK_RANGES));
ataio->cmd.sector_count_exp = ATA_SFPDMA_DSM;
ataio->ata_flags |= ATA_FLAG_AUX;
ataio->aux = 1;
}
static void
ada_cfaerase(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
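/*
 * In a 28-bit command a sector count of 0 means 256 sectors; CFA erase
 * requests are already capped at 256 sectors via d_delmaxsize in
 * adaregister().
 */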
if (count >= 256)
count = 0;
ata_28bit_cmd(ataio, ATA_CFA_ERASE, 0, lba, count);
}
static int
ada_zone_bio_to_ata(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ATA_ZM_OPEN_ZONE;
case DISK_ZONE_CLOSE:
return ATA_ZM_CLOSE_ZONE;
case DISK_ZONE_FINISH:
return ATA_ZM_FINISH_ZONE;
case DISK_ZONE_RWP:
return ATA_ZM_RWP;
}
return -1;
}
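/*
 * Translate a BIO_ZONE request into the corresponding ZAC management CCB
 * (open/close/finish/reset write pointer, or REPORT ZONES), or fill in the
 * zone parameters directly for DISK_ZONE_GET_PARAMS.  Sets *queue_ccb when
 * the CCB actually needs to be sent to the drive.
 */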
static int
ada_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct ada_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = ada_zone_bio_to_ata(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to ATA\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
ata_zac_mgmt_out(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*sector_count*/ 0,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*timeout*/ ada_default_timeout * 1000);
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_ATADA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
ata_zac_mgmt_in(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*timeout*/ ada_default_timeout * 1000);
/*
* For BIO_ZONE, bio_bcount isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case ADA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case ADA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case ADA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & ADA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
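/*
 * Periph start routine, invoked with a CCB after xpt_schedule().
 * ADA_STATE_NORMAL issues buffered I/O, trims, flushes and zone commands
 * pulled from the I/O scheduler; the RAHEAD/WCACHE states apply the
 * read-ahead/write-cache settings; the LOGDIR/IDDIR/SUP_CAP/ZONE states
 * read the ATA logs used to probe zoned-device capabilities.
 */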
static void
adastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
struct ccb_ataio *ataio = &start_ccb->ataio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastart\n"));
switch (softc->state) {
case ADA_STATE_NORMAL:
{
struct bio *bp;
u_int8_t tag_code;
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
xpt_release_ccb(start_ccb);
break;
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
softc->flags &= ~ADA_FLAG_NEED_OTAG;
softc->flags |= ADA_FLAG_WAS_OTAG;
tag_code = 0;
} else {
tag_code = 1;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
void *data_ptr;
int rw_op;
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= ADA_FLAG_DIRTY;
rw_op = CAM_DIR_OUT;
} else {
rw_op = CAM_DIR_IN;
}
data_ptr = bp->bio_data;
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= CAM_DATA_BIO;
data_ptr = bp;
}
#ifdef ADA_TEST_FAILURE
int fail = 0;
/*
* Support the failure sysctls. If the command is a
* read and there are pending forced read errors, or if
* it is a write and there are pending forced write
* errors, then fail this operation with EIO. This is
* useful for testing
* purposes. Also, support having every Nth read fail.
*
* This is a rather blunt tool.
*/
if (bp->bio_cmd == BIO_READ) {
if (softc->force_read_error) {
softc->force_read_error--;
fail = 1;
}
if (softc->periodic_read_error > 0) {
if (++softc->periodic_read_count >=
softc->periodic_read_error) {
softc->periodic_read_count = 0;
fail = 1;
}
}
} else {
if (softc->force_write_error) {
softc->force_write_error--;
fail = 1;
}
}
if (fail) {
biofinish(bp, NULL, EIO);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
#endif
KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
round_page(bp->bio_bcount + bp->bio_ma_offset) /
PAGE_SIZE == bp->bio_ma_n,
("Short bio %p", bp));
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
rw_op,
0,
data_ptr,
bp->bio_bcount,
ada_default_timeout*1000);
if ((softc->flags & ADA_FLAG_CAN_NCQ) && tag_code) {
if (bp->bio_cmd == BIO_READ) {
ata_ncq_cmd(ataio, ATA_READ_FPDMA_QUEUED,
lba, count);
} else {
ata_ncq_cmd(ataio, ATA_WRITE_FPDMA_QUEUED,
lba, count);
}
} else if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count > 256)) {
if (softc->flags & ADA_FLAG_CAN_DMA48) {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_DMA48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_DMA48,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_MUL48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_MUL48,
0, lba, count);
}
}
} else {
if (count == 256)
count = 0;
if (softc->flags & ADA_FLAG_CAN_DMA) {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_DMA,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_DMA,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_MUL,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_MUL,
0, lba, count);
}
}
}
break;
}
case BIO_DELETE:
switch (softc->delete_method) {
case ADA_DELETE_NCQ_DSM_TRIM:
ada_ncq_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_DSM_TRIM:
ada_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_CFA_ERASE:
ada_cfaerase(softc, bp, ataio);
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
goto out;
case BIO_FLUSH:
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(ataio, ATA_FLUSHCACHE, 0, 0, 0);
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = ada_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
}
start_ccb->ccb_h.ccb_state = ADA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
out:
start_ccb->ccb_h.ccb_bp = bp;
softc->outstanding_cmds++;
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
softc->refcount--;
/* May have more work to do, so ensure we stay scheduled */
adaschedule(periph);
break;
}
case ADA_STATE_RAHEAD:
case ADA_STATE_WCACHE:
{
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->state == ADA_STATE_RAHEAD) {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_RA ?
ATA_SF_ENAB_RCACHE : ATA_SF_DIS_RCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_RAHEAD;
} else {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_WC ?
ATA_SF_ENAB_WCACHE : ATA_SF_DIS_WCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_WCACHE;
}
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
xpt_action(start_ccb);
break;
}
case ADA_STATE_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
if ((softc->flags & ADA_FLAG_CAN_LOG) == 0) {
adaprobedone(periph, start_ccb);
break;
}
log_dir = malloc(sizeof(*log_dir), M_ATADA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
softc->state = ADA_STATE_NORMAL;
xpt_release_ccb(start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/sizeof(*log_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_LOGDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_IDDIR:
{
struct ata_identify_log_pages *id_dir;
id_dir = malloc(sizeof(*id_dir), M_ATADA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_IDDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_SUP_CAP:
{
struct ata_identify_log_sup_cap *sup_cap;
sup_cap = malloc(sizeof(*sup_cap), M_ATADA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_SUP_CAP;
xpt_action(start_ccb);
break;
}
case ADA_STATE_ZONE:
{
struct ata_zoned_info_log *ata_zone;
ata_zone = malloc(sizeof(*ata_zone), M_ATADA, M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_ZONE;
xpt_action(start_ccb);
break;
}
}
}
static void
adaprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
if (ccb != NULL)
xpt_release_ccb(ccb);
softc->state = ADA_STATE_NORMAL;
softc->flags |= ADA_FLAG_PROBED;
adaschedule(periph);
if ((softc->flags & ADA_FLAG_ANNOUNCED) == 0) {
softc->flags |= ADA_FLAG_ANNOUNCED;
cam_periph_unhold(periph);
} else {
cam_periph_release_locked(periph);
}
}
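/*
 * Completion handling for zone CCBs.  For REPORT ZONES, convert the
 * ATA/SCSI report header and descriptors into the disk_zone_rep_entry
 * structures the upper layers expect and free the report buffer; the other
 * zone commands need no translation.
 */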
static void
adazonedone(struct cam_periph *periph, union ccb *ccb)
{
- struct ada_softc *softc;
struct bio *bp;
- softc = periph->softc;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
- uint32_t num_alloced, hdr_len, num_avail;
+ uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->ataio.dxfer_len - ccb->ataio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
- num_alloced = rep->entries_allocated;
hdr = (struct scsi_report_zones_hdr *)ccb->ataio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
hdr_len = le32dec(hdr->length);
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = le64dec(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
* definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length = le64dec(desc->zone_length);
entry->zone_start_lba = le64dec(desc->zone_start_lba);
entry->write_pointer_lba =
le64dec(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
/*
* Note that this residual is accurate from the user's
* standpoint, but the amount transferred isn't accurate
* from the standpoint of what actually came back from the
* drive.
*/
bp->bio_resid = bp->bio_bcount - (num_to_fill * sizeof(*entry));
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->ataio.data_ptr, M_ATADA);
}
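/*
 * Main CCB completion callback.  Dispatches on the ADA_CCB_* state stored
 * in ccb_state: I/O and trim completions are handed back to the I/O
 * scheduler and biodone()d, while the probe states (RAHEAD, WCACHE, LOGDIR,
 * IDDIR, ...) parse their results and advance the probe state machine.
 */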
static void
adadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct ada_softc *softc;
struct ccb_ataio *ataio;
struct cam_path *path;
uint32_t priority;
int state;
softc = (struct ada_softc *)periph->softc;
ataio = &done_ccb->ataio;
path = done_ccb->ccb_h.path;
priority = done_ccb->ccb_h.pinfo.priority;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("adadone\n"));
state = ataio->ccb_h.ccb_state & ADA_CCB_TYPE_MASK;
switch (state) {
case ADA_CCB_BUFFER_IO:
case ADA_CCB_TRIM:
{
struct bio *bp;
int error;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = adaerror(done_ccb, 0, 0);
if (error == ERESTART) {
/* A retry was scheduled, so just return. */
cam_periph_unlock(periph);
return;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
/*
* If we get an error on an NCQ DSM TRIM, fall back
* to a non-NCQ DSM TRIM forever. Please note that if
* CAN_NCQ_TRIM is set, CAN_TRIM is necessarily set too.
* However, for this one trim, we treat it as advisory
* and return success up the stack.
*/
if (state == ADA_CCB_TRIM &&
error != 0 &&
(softc->flags & ADA_FLAG_CAN_NCQ_TRIM) != 0) {
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
error = 0;
adasetdeletemethod(softc);
}
} else {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
error = 0;
}
bp->bio_error = error;
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
} else {
if (bp->bio_cmd == BIO_ZONE)
adazonedone(periph, done_ccb);
else if (state == ADA_CCB_TRIM)
bp->bio_resid = 0;
else
bp->bio_resid = ataio->resid;
if ((bp->bio_resid > 0)
&& (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
}
softc->outstanding_cmds--;
if (softc->outstanding_cmds == 0)
softc->flags |= ADA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
if (state == ADA_CCB_TRIM) {
TAILQ_HEAD(, bio) queue;
struct bio *bp1;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue);
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* trim_running set to 0 before the call above to allow
* other I/O to progress when many BIO_DELETE requests
* are pushed down. We set trim_running to 0 and call
* adaschedule again so that we don't stall if there are
* no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
adaschedule(periph);
cam_periph_unlock(periph);
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = error;
if (error != 0) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
adaschedule(periph);
cam_periph_unlock(periph);
biodone(bp);
}
return;
}
case ADA_CCB_RAHEAD:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the reference on the peripheral.
* The peripheral will only go away once the last reference
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_WCACHE;
xpt_schedule(periph, priority);
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
}
case ADA_CCB_WCACHE:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_LOGDIR;
xpt_schedule(periph, priority);
} else {
adaprobedone(periph, done_ccb);
}
return;
}
case ADA_CCB_LOGDIR:
{
int error;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_logdir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= ADA_FLAG_CAN_IDLOG;
} else {
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(ADA_FLAG_CAN_LOG |
ADA_FLAG_CAN_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_IDLOG)) {
softc->state = ADA_STATE_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_IDDIR: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(ADA_FLAG_CAN_SUPCAP |
ADA_FLAG_CAN_ZONE);
softc->valid_iddir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_iddir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages,entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) ==
ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries &&
i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |=
ADA_FLAG_CAN_SUPCAP;
else if (softc->ata_iddir.entries[i]==
ATA_IDL_ZDI)
softc->flags |=
ADA_FLAG_CAN_ZONE;
if ((softc->flags &
ADA_FLAG_CAN_SUPCAP)
&& (softc->flags &
ADA_FLAG_CAN_ZONE))
break;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory
* lists a non-zero number of pages present for
* this log.
*/
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_SUPCAP)) {
softc->state = ADA_STATE_SUP_CAP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_SUP_CAP: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)
ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size =
__offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode =
ADA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
ADA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |=
ADA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~ADA_FLAG_CAN_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~ADA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_ZONE)) {
softc->state = ADA_STATE_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_ZONE: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
ADA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~ADA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar = le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~ADA_FLAG_CAN_ZONE;
softc->flags &= ~ADA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_DUMP:
/* No-op. We're polling */
return;
default:
break;
}
xpt_release_ccb(done_ccb);
}
static int
adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
#ifdef CAM_IO_STATS
struct ada_softc *softc;
struct cam_periph *periph;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct ada_softc *)periph->softc;
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
case CAM_ATA_STATUS_ERROR:
softc->errors++;
break;
default:
break;
}
#endif
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
adagetparams(struct cam_periph *periph, struct ccb_getdev *cgd)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
struct disk_params *dp = &softc->params;
u_int64_t lbasize48;
u_int32_t lbasize;
dp->secsize = ata_logical_sector_size(&cgd->ident_data);
if ((cgd->ident_data.atavalid & ATA_FLAG_54_58) &&
cgd->ident_data.current_heads && cgd->ident_data.current_sectors) {
dp->heads = cgd->ident_data.current_heads;
dp->secs_per_track = cgd->ident_data.current_sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = (u_int32_t)cgd->ident_data.current_size_1 |
((u_int32_t)cgd->ident_data.current_size_2 << 16);
} else {
dp->heads = cgd->ident_data.heads;
dp->secs_per_track = cgd->ident_data.sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = cgd->ident_data.cylinders *
(u_int32_t)(dp->heads * dp->secs_per_track);
}
lbasize = (u_int32_t)cgd->ident_data.lba_size_1 |
((u_int32_t)cgd->ident_data.lba_size_2 << 16);
/* use the 28bit LBA size if valid or bigger than the CHS mapping */
if (cgd->ident_data.cylinders == 16383 || dp->sectors < lbasize)
dp->sectors = lbasize;
/* use the 48bit LBA size if valid */
lbasize48 = ((u_int64_t)cgd->ident_data.lba_size48_1) |
((u_int64_t)cgd->ident_data.lba_size48_2 << 16) |
((u_int64_t)cgd->ident_data.lba_size48_3 << 32) |
((u_int64_t)cgd->ident_data.lba_size48_4 << 48);
if ((cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) &&
lbasize48 > ATA_MAX_28BIT_LBA)
dp->sectors = lbasize48;
}
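/*
 * Worked example of the 48-bit size assembly above (hypothetical drive, not
 * taken from any real identify data): a 2 TiB disk with 512-byte sectors
 * reports 4294967296 (0x100000000) sectors, so the identify words would be
 *
 *	lba_size48_1 = 0x0000, lba_size48_2 = 0x0000,
 *	lba_size48_3 = 0x0001, lba_size48_4 = 0x0000
 *	lbasize48    = (uint64_t)0x0001 << 32 = 4294967296
 *
 * Since that exceeds ATA_MAX_28BIT_LBA and the drive advertises
 * ATA_SUPPORT_ADDRESS48, adagetparams() takes the 48-bit count for
 * dp->sectors.
 */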
static void
adasendorderedtag(void *arg)
{
struct ada_softc *softc = arg;
if (ada_send_ordered) {
if (softc->outstanding_cmds > 0) {
if ((softc->flags & ADA_FLAG_WAS_OTAG) == 0)
softc->flags |= ADA_FLAG_NEED_OTAG;
softc->flags &= ~ADA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
}
/*
* Step through all ADA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
adaflush(void)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
softc = (struct ada_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & ADA_FLAG_OPEN)) {
adadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & ADA_FLAG_OPEN) == 0) ||
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) == 0) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
0,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
static void
adaspindown(uint8_t cmd, int flags)
{
struct cam_periph *periph;
struct ada_softc *softc;
struct ccb_ataio local_ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
/* If we panicked with the lock held, do not recurse here. */
if (cam_periph_owned(periph))
continue;
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "spin-down\n");
memset(&local_ccb, 0, sizeof(local_ccb));
xpt_setup_ccb(&local_ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
local_ccb.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&local_ccb,
0,
adadone,
CAM_DIR_NONE | flags,
0,
NULL,
0,
ada_default_timeout*1000);
ata_28bit_cmd(&local_ccb, cmd, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&local_ccb, adaerror,
/*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Spin-down disk failed\n");
cam_periph_unlock(periph);
}
}
static void
adashutdown(void *arg, int howto)
{
int how;
adaflush();
/*
* STANDBY IMMEDIATE saves any volatile data to the drive. It also spins
* down hard drives. IDLE IMMEDIATE also saves the volatile data without
* a spindown. We send the former when we expect to lose power soon. For
* a warm boot, we send the latter to avoid a thundering herd of spinups
* just after the kernel loads while probing. We have to do something to
* flush the data because the BIOS in many systems resets the HBA
* causing a COMINIT/COMRESET negotiation, which some drives interpret
* as license to toss the volatile data, and which others count as an
* unclean shutdown in their SMART attributes when in the Active PM state.
*
* adaspindown will ensure that we don't send this to a drive that
* doesn't support it.
*/
if (ada_spindown_shutdown != 0) {
how = (howto & (RB_HALT | RB_POWEROFF | RB_POWERCYCLE)) ?
ATA_STANDBY_IMMEDIATE : ATA_IDLE_IMMEDIATE;
adaspindown(how, 0);
}
}
static void
adasuspend(void *arg)
{
adaflush();
/*
* SLEEP also flushes any volatile data, like STANDBY IMMEDIATE,
* so we don't need to send it as well.
*/
if (ada_spindown_suspend != 0)
adaspindown(ATA_SLEEP, CAM_DEV_QFREEZE);
}
static void
adaresume(void *arg)
{
struct cam_periph *periph;
struct ada_softc *softc;
if (ada_spindown_suspend == 0)
return;
CAM_PERIPH_FOREACH(periph, &adadriver) {
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "resume\n");
/*
* Drop freeze taken due to CAM_DEV_QFREEZE flag set on
* sleep request.
*/
cam_release_devq(periph->path,
/*relsim_flags*/0,
/*openings*/0,
/*timeout*/0,
/*getcount_only*/0);
cam_periph_unlock(periph);
}
}
#endif /* _KERNEL */
Index: head/sys/cam/scsi/scsi_da.c
===================================================================
--- head/sys/cam/scsi/scsi_da.c (revision 327172)
+++ head/sys/cam/scsi/scsi_da.c (revision 327173)
@@ -1,6056 +1,6053 @@
/*-
* Implementation of SCSI Direct Access Peripheral driver for CAM.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 Justin T. Gibbs.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
#include <sys/endian.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_da.h>
#ifdef _KERNEL
/*
* Note that there are probe ordering dependencies here. The order isn't
* controlled by this enumeration, but by explicit state transitions in
* dastart() and dadone(). Here are some of the dependencies:
*
* 1. RC should come first, before RC16, unless there is evidence that RC16
* is supported.
* 2. BDC needs to come before any of the ATA probes, or the ZONE probe.
* 3. The ATA probes should go in this order:
* ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE
*/
typedef enum {
DA_STATE_PROBE_RC,
DA_STATE_PROBE_RC16,
DA_STATE_PROBE_LBP,
DA_STATE_PROBE_BLK_LIMITS,
DA_STATE_PROBE_BDC,
DA_STATE_PROBE_ATA,
DA_STATE_PROBE_ATA_LOGDIR,
DA_STATE_PROBE_ATA_IDDIR,
DA_STATE_PROBE_ATA_SUP,
DA_STATE_PROBE_ATA_ZONE,
DA_STATE_PROBE_ZONE,
DA_STATE_NORMAL
} da_state;
typedef enum {
DA_FLAG_PACK_INVALID = 0x000001,
DA_FLAG_NEW_PACK = 0x000002,
DA_FLAG_PACK_LOCKED = 0x000004,
DA_FLAG_PACK_REMOVABLE = 0x000008,
DA_FLAG_NEED_OTAG = 0x000020,
DA_FLAG_WAS_OTAG = 0x000040,
DA_FLAG_RETRY_UA = 0x000080,
DA_FLAG_OPEN = 0x000100,
DA_FLAG_SCTX_INIT = 0x000200,
DA_FLAG_CAN_RC16 = 0x000400,
DA_FLAG_PROBED = 0x000800,
DA_FLAG_DIRTY = 0x001000,
DA_FLAG_ANNOUNCED = 0x002000,
DA_FLAG_CAN_ATA_DMA = 0x004000,
DA_FLAG_CAN_ATA_LOG = 0x008000,
DA_FLAG_CAN_ATA_IDLOG = 0x010000,
DA_FLAG_CAN_ATA_SUPCAP = 0x020000,
DA_FLAG_CAN_ATA_ZONE = 0x040000
} da_flags;
typedef enum {
DA_Q_NONE = 0x00,
DA_Q_NO_SYNC_CACHE = 0x01,
DA_Q_NO_6_BYTE = 0x02,
DA_Q_NO_PREVENT = 0x04,
DA_Q_4K = 0x08,
DA_Q_NO_RC16 = 0x10,
DA_Q_NO_UNMAP = 0x20,
DA_Q_RETRY_BUSY = 0x40,
DA_Q_SMR_DM = 0x80,
DA_Q_STRICT_UNMAP = 0x100
} da_quirks;
#define DA_Q_BIT_STRING \
"\020" \
"\001NO_SYNC_CACHE" \
"\002NO_6_BYTE" \
"\003NO_PREVENT" \
"\0044K" \
"\005NO_RC16" \
"\006NO_UNMAP" \
"\007RETRY_BUSY" \
"\010SMR_DM" \
"\011STRICT_UNMAP"
typedef enum {
DA_CCB_PROBE_RC = 0x01,
DA_CCB_PROBE_RC16 = 0x02,
DA_CCB_PROBE_LBP = 0x03,
DA_CCB_PROBE_BLK_LIMITS = 0x04,
DA_CCB_PROBE_BDC = 0x05,
DA_CCB_PROBE_ATA = 0x06,
DA_CCB_BUFFER_IO = 0x07,
DA_CCB_DUMP = 0x0A,
DA_CCB_DELETE = 0x0B,
DA_CCB_TUR = 0x0C,
DA_CCB_PROBE_ZONE = 0x0D,
DA_CCB_PROBE_ATA_LOGDIR = 0x0E,
DA_CCB_PROBE_ATA_IDDIR = 0x0F,
DA_CCB_PROBE_ATA_SUP = 0x10,
DA_CCB_PROBE_ATA_ZONE = 0x11,
DA_CCB_TYPE_MASK = 0x1F,
DA_CCB_RETRY_UA = 0x20
} da_ccb_state;
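/*
 * Minimal sketch of how this encoding is meant to be used (hypothetical
 * lines, not copied from dastart()/dadone()): the low bits carry one
 * DA_CCB_* type and DA_CCB_RETRY_UA is OR'd in as a flag, e.g.
 *
 *	ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO | DA_CCB_RETRY_UA;
 *	state = ccb->ccb_h.ccb_state & DA_CCB_TYPE_MASK;    -> DA_CCB_BUFFER_IO
 *	retry_ua = (ccb->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0;
 */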
/*
* Order here is important for method choice
*
* We prefer ATA_TRIM as tests run against a Sandforce 2281 SSD attached to an
* LSI 2008 (mps) controller (FW: v12, Drv: v14) resulted in 20% quicker deletes
* using ATA_TRIM than the corresponding UNMAP results for a real-world MySQL
* import taking 5 minutes.
*
*/
typedef enum {
DA_DELETE_NONE,
DA_DELETE_DISABLE,
DA_DELETE_ATA_TRIM,
DA_DELETE_UNMAP,
DA_DELETE_WS16,
DA_DELETE_WS10,
DA_DELETE_ZERO,
DA_DELETE_MIN = DA_DELETE_ATA_TRIM,
DA_DELETE_MAX = DA_DELETE_ZERO
} da_delete_methods;
/*
* For SCSI, host managed drives show up as a separate device type. For
* ATA, host managed drives also have a different device signature.
* XXX KDM figure out the ATA host managed signature.
*/
typedef enum {
DA_ZONE_NONE = 0x00,
DA_ZONE_DRIVE_MANAGED = 0x01,
DA_ZONE_HOST_AWARE = 0x02,
DA_ZONE_HOST_MANAGED = 0x03
} da_zone_mode;
/*
* We distinguish between these interface cases in addition to the drive type:
* o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC
* o ATA drive behind a SCSI translation layer that does not know about
* ZBC/ZAC, and so needs to be managed via ATA passthrough. In this
* case, we would need to share the ATA code with the ada(4) driver.
* o SCSI drive.
*/
typedef enum {
DA_ZONE_IF_SCSI,
DA_ZONE_IF_ATA_PASS,
DA_ZONE_IF_ATA_SAT,
} da_zone_interface;
typedef enum {
DA_ZONE_FLAG_RZ_SUP = 0x0001,
DA_ZONE_FLAG_OPEN_SUP = 0x0002,
DA_ZONE_FLAG_CLOSE_SUP = 0x0004,
DA_ZONE_FLAG_FINISH_SUP = 0x0008,
DA_ZONE_FLAG_RWP_SUP = 0x0010,
DA_ZONE_FLAG_SUP_MASK = (DA_ZONE_FLAG_RZ_SUP |
DA_ZONE_FLAG_OPEN_SUP |
DA_ZONE_FLAG_CLOSE_SUP |
DA_ZONE_FLAG_FINISH_SUP |
DA_ZONE_FLAG_RWP_SUP),
DA_ZONE_FLAG_URSWRZ = 0x0020,
DA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
DA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
DA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
DA_ZONE_FLAG_SET_MASK = (DA_ZONE_FLAG_OPT_SEQ_SET |
DA_ZONE_FLAG_OPT_NONSEQ_SET |
DA_ZONE_FLAG_MAX_SEQ_SET)
} da_zone_flags;
static struct da_zone_desc {
da_zone_flags value;
const char *desc;
} da_zone_desc_table[] = {
{DA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{DA_ZONE_FLAG_OPEN_SUP, "Open" },
{DA_ZONE_FLAG_CLOSE_SUP, "Close" },
{DA_ZONE_FLAG_FINISH_SUP, "Finish" },
{DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb,
struct bio *bp);
static da_delete_func_t da_delete_trim;
static da_delete_func_t da_delete_unmap;
static da_delete_func_t da_delete_ws;
static const void * da_delete_functions[] = {
NULL,
NULL,
da_delete_trim,
da_delete_unmap,
da_delete_ws,
da_delete_ws,
da_delete_ws
};
static const char *da_delete_method_names[] =
{ "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" };
static const char *da_delete_method_desc[] =
{ "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP",
"WRITE SAME(10) with UNMAP", "ZERO" };
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
struct disk_params {
u_int8_t heads;
u_int32_t cylinders;
u_int8_t secs_per_track;
u_int32_t secsize; /* Number of bytes/sector */
u_int64_t sectors; /* total number sectors */
u_int stripesize;
u_int stripeoffset;
};
#define UNMAP_RANGE_MAX 0xffffffff
#define UNMAP_HEAD_SIZE 8
#define UNMAP_RANGE_SIZE 16
#define UNMAP_MAX_RANGES 2048 /* Protocol Max is 4095 */
#define UNMAP_BUF_SIZE ((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \
UNMAP_HEAD_SIZE)
#define WS10_MAX_BLKS 0xffff
#define WS16_MAX_BLKS 0xffffffff
#define ATA_TRIM_MAX_RANGES ((UNMAP_BUF_SIZE / \
(ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE)
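/*
 * Worked example of the sizing above, assuming the usual sys/ata.h values of
 * ATA_DSM_RANGE_SIZE = 8 and ATA_DSM_BLK_SIZE = 512 (check the headers in
 * your tree):
 *
 *	UNMAP_BUF_SIZE      = (2048 * 16) + 8 = 32776 bytes
 *	ATA_TRIM_MAX_RANGES = (32776 / (8 * 512)) * 512 = 8 * 512 = 4096
 *
 * i.e. one unmap_buf holds up to 2048 UNMAP descriptors, or 4096 eight-byte
 * DSM TRIM range entries packed 64 per 512-byte DSM block.
 */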
#define DA_WORK_TUR (1 << 16)
struct da_softc {
struct cam_iosched_softc *cam_iosched;
struct bio_queue_head delete_run_queue;
LIST_HEAD(, ccb_hdr) pending_ccbs;
int refcount; /* Active xpt_action() calls */
da_state state;
da_flags flags;
da_quirks quirks;
int minimum_cmd_size;
int error_inject;
int trim_max_ranges;
int delete_available; /* Delete methods possibly available */
da_zone_mode zone_mode;
da_zone_interface zone_interface;
da_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
u_int maxio;
uint32_t unmap_max_ranges;
uint32_t unmap_max_lba; /* Max LBAs in UNMAP req */
uint32_t unmap_gran;
uint32_t unmap_gran_align;
uint64_t ws_max_blks;
da_delete_methods delete_method_pref;
da_delete_methods delete_method;
da_delete_func_t *delete_func;
int unmappedio;
int rotating;
struct disk_params params;
struct disk *disk;
union ccb saved_ccb;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
uint64_t wwpn;
uint8_t unmap_buf[UNMAP_BUF_SIZE];
struct scsi_read_capacity_data_long rcaplong;
struct callout mediapoll_c;
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int errors;
u_int timeouts;
u_int invalidations;
#endif
#define DA_ANNOUNCETMP_SZ 80
char announce_temp[DA_ANNOUNCETMP_SZ];
#define DA_ANNOUNCE_SZ 400
char announcebuf[DA_ANNOUNCE_SZ];
};
#define dadeleteflag(softc, delete_method, enable) \
if (enable) { \
softc->delete_available |= (1 << delete_method); \
} else { \
softc->delete_available &= ~(1 << delete_method); \
}
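/*
 * Usage sketch (hypothetical call sites): dadeleteflag() keeps one bit per
 * delete method in softc->delete_available, e.g.
 *
 *	dadeleteflag(softc, DA_DELETE_UNMAP, 1);  sets   (1 << DA_DELETE_UNMAP)
 *	dadeleteflag(softc, DA_DELETE_WS16, 0);   clears (1 << DA_DELETE_WS16)
 */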
struct da_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
da_quirks quirks;
};
static const char quantum[] = "QUANTUM";
static const char microp[] = "MICROP";
static struct da_quirk_entry da_quirk_table[] =
{
/* SPI, FC devices */
{
/*
* Fujitsu M2513A MO drives.
* Tested devices: M2513A2 firmware versions 1200 & 1300.
* (dip switch selects whether T_DIRECT or T_OPTICAL device)
* Reported by: W.Scholten <whs@xs4all.nl>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/* See above. */
{T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This particular Fujitsu drive doesn't like the
* synchronize cache command.
* Reported by: Tom Jackson <toj@gorilla.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Matthew Jacob <mjacob@feral.com>
* in NetBSD PR kern/6027, August 24, 1998.
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Hellmuth Michaelis (hm@kts.org)
* (PR 8882).
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't work correctly with 6 byte reads/writes.
* Returns illegal request, and points to byte 9 of the
* 6-byte CDB.
* Reported by: Adam McDougall <bsdx@spawnet.com>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/* See above. */
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The CISS RAID controllers do not support SYNC_CACHE
*/
{T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The STEC SSDs sometimes hang on UNMAP.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"},
/*quirks*/ DA_Q_NO_UNMAP
},
{
/*
* VMware returns BUSY status when storage has transient
* connectivity problems, so better wait.
* Also VMware returns odd errors on misaligned UNMAPs.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"},
/*quirks*/ DA_Q_RETRY_BUSY | DA_Q_STRICT_UNMAP
},
/* USB mass storage devices supported by umass(4) */
{
/*
* EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player
* PR: kern/51675
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Power Quotient Int. (PQI) USB flash key
* PR: kern/53067
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative Nomad MUVO mp3 player (USB)
* PR: kern/53094
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Jungsoft NEXDISK USB flash key
* PR: kern/54737
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* FreeDik USB Mini Data Drive
* PR: kern/54786
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sigmatel USB Flash MP3 Player
* PR: kern/57046
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Neuros USB Digital Audio Computer
* PR: kern/63645
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SEAGRAND NP-900 MP3 Player
* PR: kern/64563
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* iRiver iFP MP3 player (with UMS Firmware)
* PR: kern/54881, i386/63941, kern/66124
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01
* PR: kern/70158
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ZICPlay USB MP3 Player with FM
* PR: kern/75057
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TEAC USB floppy mechanisms
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler II+ USB Pen-Drive.
* Reported by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* USB DISK Pro PMAP
* Reported by: jhs
* PR: usb/96381
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Motorola E398 Mobile Phone (TransFlash memory card).
* Reported by: Wojciech A. Koszek <dunstan@FreeBSD.czest.pl>
* PR: usb/89889
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Qware BeatZkey! Pro
* PR: usb/79164
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Time DPA20B 1GB MP3 Player
* PR: usb/81846
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung USB key 128Mb
* PR: usb/90081
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler 2.0 USB Flash memory.
* PR: usb/89196
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative MUVO Slim mp3 player (USB)
* PR: usb/86131
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3)
* PR: usb/80487
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SanDisk Micro Cruzer 128MB
* PR: usb/75970
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TOSHIBA TransMemory USB sticks
* PR: kern/94660
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* PNY USB 3.0 Flash Drives
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16
},
{
/*
* PNY USB Flash keys
* PR: usb/75578, usb/72344, usb/65436
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Genesys GL3224
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"120?"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_4K | DA_Q_NO_RC16
},
{
/*
* Genesys 6-in-1 Card Reader
* PR: usb/94647
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Rekam Digital CAMERA
* PR: usb/98713
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver H10 MP3 player
* PR: usb/102547
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver U10 MP3 player
* PR: usb/92306
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* X-Micro Flash Disk
* PR: usb/96901
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* EasyMP3 EM732X USB 2.0 Flash MP3 Player
* PR: usb/96546
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*",
"1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Denver MP3 player
* PR: usb/107101
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Philips USB Key Audio KEY013
* PR: usb/68412
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
/*
* JNC MP3 Player
* PR: usb/94439
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SAMSUNG MP0402H
* PR: usb/108427
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* I/O Magic USB flash - Giga Bank
* PR: usb/108810
*/
{T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* JoyFly 128mb USB Flash Drive
* PR: 96133
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ChipsBnk usb stick
* PR: 103702
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A
* PR: 129858
*/
{T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung YP-U3 mp3-player
* PR: 125398
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*",
"2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sony Cyber-Shot DSC cameras
* PR: usb/137035
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3",
"1.00"}, /*quirks*/ DA_Q_NO_PREVENT
},
{
/* At least several Transcend USB sticks lie about RC16. */
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
{
/*
* I-O Data USB Flash Disk
* PR: usb/211716
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
/* ATA/SATA devices over SAS/USB/... */
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Micron Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Olympus FE-210 camera
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* LG UP3S MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Laser MP3-2GA13 MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* LaCie external 250GB hard drive, design by Porsche
* Submitted by: Ben Stuyts <ben@altesco.nl>
* PR: 121474
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
/* SATA SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/DA_Q_4K
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 750 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 750*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 840 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 845 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 845*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 850 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Same as for SAMSUNG MZ7*, but also enable the quirks for SSDs
* whose model name starts with MZ7*
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Hama Innostor USB-Stick
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" },
/*quirks*/DA_Q_NO_RC16
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS000[23]*", "*" },
/*quirks*/DA_Q_SMR_DM
},
{
/*
* MX-ES USB Drive by Mach Xtreme
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"},
/*quirks*/DA_Q_NO_RC16
},
};
static disk_strategy_t dastrategy;
static dumper_t dadump;
static periph_init_t dainit;
static void daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void dasysctlinit(void *context, int pending);
static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS);
static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
static int dazonemodesysctl(SYSCTL_HANDLER_ARGS);
static int dazonesupsysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS);
static void dadeletemethodset(struct da_softc *softc,
da_delete_methods delete_method);
static off_t dadeletemaxsize(struct da_softc *softc,
da_delete_methods delete_method);
static void dadeletemethodchoose(struct da_softc *softc,
da_delete_methods default_method);
static void daprobedone(struct cam_periph *periph, union ccb *ccb);
static periph_ctor_t daregister;
static periph_dtor_t dacleanup;
static periph_start_t dastart;
static periph_oninv_t daoninvalidate;
static void dazonedone(struct cam_periph *periph, union ccb *ccb);
static void dadone(struct cam_periph *periph,
union ccb *done_ccb);
static int daerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void daprevent(struct cam_periph *periph, int action);
static void dareprobe(struct cam_periph *periph);
static void dasetgeom(struct cam_periph *periph, uint32_t block_len,
uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong,
size_t rcap_size);
static timeout_t dasendorderedtag;
static void dashutdown(void *arg, int howto);
static timeout_t damediapoll;
#ifndef DA_DEFAULT_POLL_PERIOD
#define DA_DEFAULT_POLL_PERIOD 3
#endif
#ifndef DA_DEFAULT_TIMEOUT
#define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */
#endif
#ifndef DA_DEFAULT_SOFTTIMEOUT
#define DA_DEFAULT_SOFTTIMEOUT 0
#endif
#ifndef DA_DEFAULT_RETRY
#define DA_DEFAULT_RETRY 4
#endif
#ifndef DA_DEFAULT_SEND_ORDERED
#define DA_DEFAULT_SEND_ORDERED 1
#endif
static int da_poll_period = DA_DEFAULT_POLL_PERIOD;
static int da_retry_count = DA_DEFAULT_RETRY;
static int da_default_timeout = DA_DEFAULT_TIMEOUT;
static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT;
static int da_send_ordered = DA_DEFAULT_SEND_ORDERED;
static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN,
&da_poll_period, 0, "Media polling period in seconds");
SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&da_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&da_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&da_send_ordered, 0, "Send Ordered Tags");
SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout,
CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, dasysctlsofttimeout, "I",
"Soft I/O timeout (ms)");
TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout);
/*
* DA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* passes the "don't send an ordered tag" test, so it takes
* us two intervals to determine that a tag must be sent.
*/
#ifndef DA_ORDEREDTAG_INTERVAL
#define DA_ORDEREDTAG_INTERVAL 4
#endif
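/*
 * Worked example with the defaults in this file (da_default_timeout = 60
 * seconds, DA_ORDEREDTAG_INTERVAL = 4), assuming dasendorderedtag() re-arms
 * its callout the same way adasendorderedtag() does in ata_da.c:
 *
 *	(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL = 15 * hz ticks
 *
 * i.e. the ordered-tag check runs every 15 seconds, so a transaction that is
 * starved for two consecutive intervals still has roughly half of the
 * 60-second timeout left once the ordered tag is sent.
 */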
static struct periph_driver dadriver =
{
dainit, "da",
TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(da, dadriver);
static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");
static int
daopen(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
return (ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daopen\n"));
softc = (struct da_softc *)periph->softc;
dareprobe(periph);
/* Wait for the disk size update. */
error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO,
"dareprobe", 0);
if (error != 0)
xpt_print(periph->path, "unable to retrieve capacity data\n");
if (periph->flags & CAM_PERIPH_INVALID)
error = ENXIO;
if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_PREVENT);
if (error == 0) {
softc->flags &= ~DA_FLAG_PACK_INVALID;
softc->flags |= DA_FLAG_OPEN;
}
cam_periph_unhold(periph);
cam_periph_unlock(periph);
if (error != 0)
cam_periph_release(periph);
return (error);
}
static int
daclose(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
- int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daclose\n"));
if (cam_periph_hold(periph, PRIBIO) == 0) {
/* Flush disk cache. */
if ((softc->flags & DA_FLAG_DIRTY) != 0 &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 &&
(softc->flags & DA_FLAG_PACK_INVALID) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio, /*retries*/1,
/*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE,
5 * 60 * 1000);
- error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
+ cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
softc->disk->d_devstat);
softc->flags &= ~DA_FLAG_DIRTY;
xpt_release_ccb(ccb);
}
/* Allow medium removal. */
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_ALLOW);
cam_periph_unhold(periph);
}
/*
* If we've got removable media, mark the blocksize as
* unavailable, since it could change when new media is
* inserted.
*/
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0)
softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
softc->flags &= ~DA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
daschedule(struct cam_periph *periph)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
if (softc->state != DA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
dastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
/*
* If the device has been made invalid, error out
*/
if ((softc->flags & DA_FLAG_PACK_INVALID)) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp));
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
daschedule(periph);
cam_periph_unlock(periph);
return;
}
static int
dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct da_softc *softc;
u_int secsize;
struct ccb_scsiio csio;
struct disk *dp;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
secsize = softc->params.secsize;
if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
cam_periph_unlock(periph);
return (ENXIO);
}
memset(&csio, 0, sizeof(csio));
if (length > 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_read_write(&csio,
/*retries*/0,
dadone,
MSG_ORDERED_Q_TAG,
/*read*/SCSI_RW_WRITE,
/*byte2*/0,
/*minimum_cmd_size*/ softc->minimum_cmd_size,
offset / secsize,
length / secsize,
/*data_ptr*/(u_int8_t *) virtual,
/*dxfer_len*/length,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
cam_periph_unlock(periph);
return (error);
}
/*
* Sync the disk cache contents to the physical media.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_synchronize_cache(&csio,
/*retries*/0,
/*cbfcnp*/dadone,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0,/* Cover the whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
5 * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
cam_periph_unlock(periph);
return (error);
}
static int
dagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
static void
dainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("da: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (da_send_ordered) {
/* Register our shutdown event handler */
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("dainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
dadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
daoninvalidate(struct cam_periph *periph)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, daasync, periph, periph->path);
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
/*
* Tell GEOM that we've gone away, we'll get a callback when it is
* done cleaning up its resources.
*/
disk_gone(softc->disk);
}
static void
dacleanup(struct cam_periph *periph)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
callout_drain(&softc->mediapoll_c);
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
break;
if (SID_TYPE(&cgd->inq_data) != T_DIRECT
&& SID_TYPE(&cgd->inq_data) != T_RBC
&& SID_TYPE(&cgd->inq_data) != T_OPTICAL
&& SID_TYPE(&cgd->inq_data) != T_ZBC_HM)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(daregister, daoninvalidate,
dacleanup, dastart,
"da", CAM_PERIPH_BIO,
path, daasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("daasync: Unable to attach to new device "
"due to status 0x%x\n", status);
return;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct da_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_UNIT_ATTENTION:
{
union ccb *ccb;
int error_code, sense_key, asc, ascq;
softc = (struct da_softc *)periph->softc;
ccb = (union ccb *)arg;
/*
* Handle all UNIT ATTENTIONs except our own,
* as they will be handled by daerror().
*/
if (xpt_path_periph(ccb->ccb_h.path) != periph &&
scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (asc == 0x2A && ascq == 0x09) {
xpt_print(ccb->ccb_h.path,
"Capacity data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
} else if (asc == 0x28 && ascq == 0x00) {
softc->flags &= ~DA_FLAG_PROBED;
disk_media_changed(softc->disk, M_NOWAIT);
} else if (asc == 0x3F && ascq == 0x03) {
xpt_print(ccb->ccb_h.path,
"INQUIRY data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
}
}
break;
}
case AC_SCSI_AEN:
softc = (struct da_softc *)periph->softc;
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* FALLTHROUGH */
case AC_SENT_BDR:
case AC_BUS_RESET:
{
struct ccb_hdr *ccbh;
softc = (struct da_softc *)periph->softc;
/*
* Don't fail on the expected unit attention
* that will occur.
*/
softc->flags |= DA_FLAG_RETRY_UA;
LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
ccbh->ccb_state |= DA_CCB_RETRY_UA;
break;
}
case AC_INQ_CHANGED:
softc = (struct da_softc *)periph->softc;
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
break;
default:
break;
}
cam_periph_async(periph, code, path, arg);
}
static void
dasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct da_softc *softc;
char tmpstr[32], tmpstr2[16];
struct ccb_trans_settings cts;
periph = (struct cam_periph *)context;
/*
* periph was held for us when this task was enqueued
*/
if (periph->flags & CAM_PERIPH_INVALID) {
cam_periph_release(periph);
return;
}
softc = (struct da_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= DA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2,
CTLFLAG_RD, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("dasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
/*
* Now register the sysctl handler, so the user can change the value on
* the fly.
*/
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RWTUN,
softc, 0, dadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_max", CTLTYPE_U64 | CTLFLAG_RW,
softc, 0, dadeletemaxsysctl, "Q",
"Maximum BIO_DELETE size");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW,
&softc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
"Minimum CDB size");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, dazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, dazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"error_inject",
CTLFLAG_RW,
&softc->error_inject,
0,
"error_inject leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"unmapped_io",
CTLFLAG_RD,
&softc->unmappedio,
0,
"Unmapped I/O leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"rotating",
CTLFLAG_RD,
&softc->rotating,
0,
"Rotating media");
/*
* Add some addressing info.
*/
memset(&cts, 0, sizeof (cts));
xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
cts.type = CTS_TYPE_CURRENT_SETTINGS;
cam_periph_lock(periph);
xpt_action((union ccb *)&cts);
cam_periph_unlock(periph);
if (cts.ccb_h.status != CAM_REQ_CMP) {
cam_periph_release(periph);
return;
}
if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) {
struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc;
if (fc->valid & CTS_FC_VALID_WWPN) {
softc->wwpn = fc->wwpn;
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "wwpn", CTLFLAG_RD,
&softc->wwpn, "World Wide Port Name");
}
}
#ifdef CAM_IO_STATS
/*
* Now add some useful stats.
* XXX These should live in cam_periph and be common to all periphs
*/
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"errors",
CTLFLAG_RD,
&softc->errors,
0,
"Transport errors reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"timeouts",
CTLFLAG_RD,
&softc->timeouts,
0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"pack_invalidations",
CTLFLAG_RD,
&softc->invalidations,
0,
"Device pack invalidations");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
dadeletemaxsysctl(SYSCTL_HANDLER_ARGS)
{
int error;
uint64_t value;
struct da_softc *softc;
softc = (struct da_softc *)arg1;
value = softc->disk->d_delmaxsize;
error = sysctl_handle_64(oidp, &value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* Only accept values no larger than the calculated maximum. */
if (value > dadeletemaxsize(softc, softc->delete_method)) {
return (EINVAL);
}
softc->disk->d_delmaxsize = value;
return (0);
}
static int
dacmdsizesysctl(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if ((error != 0)
|| (req->newptr == NULL))
return (error);
/*
* Acceptable values here are 6, 10, 12 or 16.
*/
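/*
 * Any other value is rounded up to the next permissible size,
 * e.g. a write of 8 is rounded to 10 and 13 becomes 16.
 */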
if (value < 6)
value = 6;
else if ((value > 6)
&& (value <= 10))
value = 10;
else if ((value > 10)
&& (value <= 12))
value = 12;
else if (value > 12)
value = 16;
*(int *)arg1 = value;
return (0);
}
static int
dasysctlsofttimeout(SYSCTL_HANDLER_ARGS)
{
sbintime_t value;
int error;
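/* The sysctl value is in milliseconds; convert to/from sbintime_t. */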
value = da_default_softtimeout / SBT_1MS;
error = sysctl_handle_int(oidp, (int *)&value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* XXX Should clip this to a reasonable level */
if (value > da_default_timeout * 1000)
return (EINVAL);
da_default_softtimeout = value * SBT_1MS;
return (0);
}
static void
dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method)
{
softc->delete_method = delete_method;
softc->disk->d_delmaxsize = dadeletemaxsize(softc, delete_method);
softc->delete_func = da_delete_functions[delete_method];
if (softc->delete_method > DA_DELETE_DISABLE)
softc->disk->d_flags |= DISKFLAG_CANDELETE;
else
softc->disk->d_flags &= ~DISKFLAG_CANDELETE;
}
static off_t
dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method)
{
off_t sectors;
switch(delete_method) {
case DA_DELETE_UNMAP:
sectors = (off_t)softc->unmap_max_lba;
break;
case DA_DELETE_ATA_TRIM:
sectors = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges;
break;
case DA_DELETE_WS16:
sectors = omin(softc->ws_max_blks, WS16_MAX_BLKS);
break;
case DA_DELETE_ZERO:
case DA_DELETE_WS10:
sectors = omin(softc->ws_max_blks, WS10_MAX_BLKS);
break;
default:
return 0;
}
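/*
 * Scale the per-method LBA limit to bytes, capped at the media size;
 * e.g. with 512-byte sectors an unmap_max_lba of 0x400000 yields a
 * 2 GiB d_delmaxsize.
 */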
return (off_t)softc->params.secsize *
omin(sectors, softc->params.sectors);
}
static void
daprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
dadeletemethodchoose(softc, DA_DELETE_NONE);
if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) {
char buf[80];
int i, sep;
snprintf(buf, sizeof(buf), "Delete methods: <");
sep = 0;
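/*
 * Builds a line like "Delete methods: <UNMAP(*),WS16>", listing the
 * available methods and marking the active one with "(*)".
 */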
for (i = 0; i <= DA_DELETE_MAX; i++) {
if ((softc->delete_available & (1 << i)) == 0 &&
i != softc->delete_method)
continue;
if (sep)
strlcat(buf, ",", sizeof(buf));
strlcat(buf, da_delete_method_names[i],
sizeof(buf));
if (i == softc->delete_method)
strlcat(buf, "(*)", sizeof(buf));
sep = 1;
}
strlcat(buf, ">", sizeof(buf));
printf("%s%d: %s\n", periph->periph_name,
periph->unit_number, buf);
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the probe lock on the peripheral.
* The peripheral will only go away once the last lock
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(ccb);
softc->state = DA_STATE_NORMAL;
softc->flags |= DA_FLAG_PROBED;
daschedule(periph);
wakeup(&softc->disk->d_mediasize);
if ((softc->flags & DA_FLAG_ANNOUNCED) == 0) {
softc->flags |= DA_FLAG_ANNOUNCED;
cam_periph_unhold(periph);
} else
cam_periph_release_locked(periph);
}
static void
dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method)
{
int i, methods;
/* If available, prefer the method requested by user. */
i = softc->delete_method_pref;
methods = softc->delete_available | (1 << DA_DELETE_DISABLE);
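/* DA_DELETE_DISABLE is always selectable so deletes can be turned off. */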
if (methods & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
/* Use the pre-defined order to choose the best performing delete. */
for (i = DA_DELETE_MIN; i <= DA_DELETE_MAX; i++) {
if (i == DA_DELETE_ZERO)
continue;
if (softc->delete_available & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
}
/* Fallback to default. */
dadeletemethodset(softc, default_method);
}
static int
dadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct da_softc *softc;
- int i, error, methods, value;
+ int i, error, value;
softc = (struct da_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > DA_DELETE_MAX)
p = "UNKNOWN";
else
p = da_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
- methods = softc->delete_available | (1 << DA_DELETE_DISABLE);
for (i = 0; i <= DA_DELETE_MAX; i++) {
if (strcmp(buf, da_delete_method_names[i]) == 0)
break;
}
if (i > DA_DELETE_MAX)
return (EINVAL);
softc->delete_method_pref = i;
dadeletemethodchoose(softc, DA_DELETE_NONE);
return (0);
}
static int
dazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct da_softc *softc;
int error;
softc = (struct da_softc *)arg1;
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case DA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case DA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case DA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
dazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct da_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct da_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(da_zone_desc_table) /
sizeof(da_zone_desc_table[0]); i++) {
if (softc->zone_flags & da_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, da_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
static cam_status
daregister(struct cam_periph *periph, void *arg)
{
struct da_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
char tmpstr[80];
caddr_t match;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("daregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("daregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("daregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
LIST_INIT(&softc->pending_ccbs);
softc->state = DA_STATE_PROBE_RC;
bioq_init(&softc->delete_run_queue);
if (SID_IS_REMOVABLE(&cgd->inq_data))
softc->flags |= DA_FLAG_PACK_REMOVABLE;
softc->unmap_max_ranges = UNMAP_MAX_RANGES;
softc->unmap_max_lba = UNMAP_RANGE_MAX;
softc->unmap_gran = 0;
softc->unmap_gran_align = 0;
softc->ws_max_blks = WS16_MAX_BLKS;
softc->trim_max_ranges = ATA_TRIM_MAX_RANGES;
softc->rotating = 1;
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->inq_data,
(caddr_t)da_quirk_table,
nitems(da_quirk_table),
sizeof(*da_quirk_table), scsi_inquiry_match);
if (match != NULL)
softc->quirks = ((struct da_quirk_entry *)match)->quirks;
else
softc->quirks = DA_Q_NONE;
/* Check if the SIM does not want 6 byte commands */
xpt_path_inq(&cpi, periph->path);
if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
softc->quirks |= DA_Q_NO_6_BYTE;
if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM)
softc->zone_mode = DA_ZONE_HOST_MANAGED;
else if (softc->quirks & DA_Q_SMR_DM)
softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = DA_ZONE_NONE;
if (softc->zone_mode != DA_ZONE_NONE) {
if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC))
softc->zone_interface = DA_ZONE_IF_ATA_SAT;
else
softc->zone_interface = DA_ZONE_IF_ATA_PASS;
} else
softc->zone_interface = DA_ZONE_IF_SCSI;
}
TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph);
/*
* Take an exclusive refcount on the periph while dastart is called
* to finish the probe. The reference will be dropped in dadone at
* the end of probe.
*/
(void)cam_periph_hold(periph, PRIBIO);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, softc);
cam_periph_unlock(periph);
/*
* RBC devices don't have to support READ(6), only READ(10).
*/
if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/*
* Load the user's default, if any.
*/
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size",
periph->unit_number);
TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size);
/*
* 6, 10, 12 and 16 are the currently permissible values.
*/
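/* Map any other tunable value onto one of the permissible sizes. */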
if (softc->minimum_cmd_size > 12)
softc->minimum_cmd_size = 16;
else if (softc->minimum_cmd_size > 10)
softc->minimum_cmd_size = 12;
else if (softc->minimum_cmd_size > 6)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/* Predict whether device may support READ CAPACITY(16). */
if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 &&
(softc->quirks & DA_Q_NO_RC16) == 0) {
softc->flags |= DA_FLAG_CAN_RC16;
softc->state = DA_STATE_PROBE_RC16;
}
/*
* Register this media as a disk.
*/
softc->disk = disk_alloc();
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, 0,
DEVSTAT_BS_UNAVAILABLE,
SID_TYPE(&cgd->inq_data) |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = daopen;
softc->disk->d_close = daclose;
softc->disk->d_strategy = dastrategy;
softc->disk->d_dump = dadump;
softc->disk->d_getattr = dagetattr;
softc->disk->d_gone = dadiskgonecb;
softc->disk->d_name = "da";
softc->disk->d_drv1 = periph;
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
else if (cpi.maxio > MAXPHYS)
softc->maxio = MAXPHYS; /* for safety */
else
softc->maxio = cpi.maxio;
softc->disk->d_maxsize = softc->maxio;
softc->disk->d_unit = periph->unit_number;
softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0)
softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->unmappedio = 1;
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
}
cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
cgd->inq_data.product, sizeof(cgd->inq_data.product),
sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* dadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
/*
* Add async callbacks for events of interest.
* I don't bother checking if this fails as,
* in most cases, the system will function just
* fine without them and the only alternative
* would be to not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION |
AC_INQ_CHANGED, daasync, periph, periph->path);
/*
* Emit an attribute changed notification just in case
* physical path information arrived before our async
* event handler was registered, but after anyone attaching
* to our disk device polled it.
*/
disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT);
/*
* Schedule periodic media polling events.
*/
callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0);
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) &&
(cgd->inq_flags & SID_AEN) == 0 &&
da_poll_period != 0)
callout_reset(&softc->mediapoll_c, da_poll_period * hz,
damediapoll, periph);
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
static int
da_zone_bio_to_scsi(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ZBC_OUT_SA_OPEN;
case DISK_ZONE_CLOSE:
return ZBC_OUT_SA_CLOSE;
case DISK_ZONE_FINISH:
return ZBC_OUT_SA_FINISH;
case DISK_ZONE_RWP:
return ZBC_OUT_SA_RWP;
}
return -1;
}
static int
da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct da_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to SCSI\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_out() returned an "
"error!");
goto bailout;
}
}
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ ZBC_IN_SA_REPORT_ZONES,
/*zone_start_lba*/ rep->starting_id,
/*zone_options*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_in() returned an "
"error!");
goto bailout;
}
}
/*
* For BIO_ZONE, this isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under- or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case DA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case DA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case DA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
static void
dastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n"));
skipstate:
switch (softc->state) {
case DA_STATE_NORMAL:
{
struct bio *bp;
uint8_t tag_code;
more:
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
scsi_test_unit_ready(&start_ccb->csio,
/*retries*/ da_retry_count,
dadone,
MSG_SIMPLE_Q_TAG,
SSD_FULL_SIZE,
da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_TUR;
xpt_action(start_ccb);
} else
xpt_release_ccb(start_ccb);
break;
}
if (bp->bio_cmd == BIO_DELETE) {
if (softc->delete_func != NULL) {
softc->delete_func(periph, start_ccb, bp);
goto out;
} else {
/* Not sure this is possible, but failsafe by lying and saying "sure, done." */
biofinish(bp, NULL, 0);
goto more;
}
}
if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
cam_periph_release_locked(periph); /* XXX is this still valid? I think so but unverified */
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(softc->flags & DA_FLAG_NEED_OTAG) != 0) {
softc->flags &= ~DA_FLAG_NEED_OTAG;
softc->flags |= DA_FLAG_WAS_OTAG;
tag_code = MSG_ORDERED_Q_TAG;
} else {
tag_code = MSG_SIMPLE_Q_TAG;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
void *data_ptr;
int rw_op;
biotrack(bp, __func__);
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= DA_FLAG_DIRTY;
rw_op = SCSI_RW_WRITE;
} else {
rw_op = SCSI_RW_READ;
}
data_ptr = bp->bio_data;
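/*
 * Unmapped and vector-list bios are handed to the SIM as the bio
 * itself (SCSI_RW_BIO) rather than a mapped kernel buffer.
 */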
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= SCSI_RW_BIO;
data_ptr = bp;
}
scsi_read_write(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/tag_code,
rw_op,
/*byte2*/0,
softc->minimum_cmd_size,
/*lba*/bp->bio_pblkno,
/*block_count*/bp->bio_bcount /
softc->params.secsize,
data_ptr,
/*dxfer_len*/ bp->bio_bcount,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
start_ccb->csio.bio = bp;
#endif
break;
}
case BIO_FLUSH:
/*
* If we don't support sync cache, or the disk
* isn't dirty, FLUSH is a no-op. Use the
* allocated CCB for the next bio if one is
* available.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) != 0 ||
(softc->flags & DA_FLAG_DIRTY) == 0) {
biodone(bp);
goto skipstate;
}
/*
* BIO_FLUSH doesn't currently communicate
* range data, so we synchronize the cache
* over the whole disk. We also force
* ordered tag semantics so the flush applies
* to all previously queued I/O.
*/
scsi_synchronize_cache(&start_ccb->csio,
/*retries*/1,
/*cbfcnp*/dadone,
MSG_ORDERED_Q_TAG,
/*begin_lba*/0,
/*lb_count*/0,
SSD_FULL_SIZE,
da_default_timeout*1000);
/*
* Clear the dirty flag before sending the command.
* Either this sync cache will be successful, or it
* will fail after a retry. If it fails, it is
* unlikely to be successful if retried later, so
* we'll save ourselves time by just marking the
* device clean.
*/
softc->flags &= ~DA_FLAG_DIRTY;
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = da_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
}
start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout);
out:
LIST_INSERT_HEAD(&softc->pending_ccbs,
&start_ccb->ccb_h, periph_links.le);
/* We expect a unit attention from this device */
if ((softc->flags & DA_FLAG_RETRY_UA) != 0) {
start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA;
softc->flags &= ~DA_FLAG_RETRY_UA;
}
start_ccb->ccb_h.ccb_bp = bp;
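/*
 * Hold a reference across the unlocked xpt_action() call; daclose()
 * sleeps until refcount drains back to zero before tearing down.
 */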
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
softc->refcount--;
/* May have more work to do, so ensure we stay scheduled */
daschedule(periph);
break;
}
case DA_STATE_PROBE_RC:
{
struct scsi_read_capacity_data *rcap;
rcap = (struct scsi_read_capacity_data *)
malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcap == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity(&start_ccb->csio,
/*retries*/da_retry_count,
dadone,
MSG_SIMPLE_Q_TAG,
rcap,
SSD_FULL_SIZE,
/*timeout*/5000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_RC16:
{
struct scsi_read_capacity_data_long *rcaplong;
rcaplong = (struct scsi_read_capacity_data_long *)
malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcaplong == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity_16(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*lba*/ 0,
/*reladr*/ 0,
/*pmi*/ 0,
/*rcap_buf*/ (uint8_t *)rcaplong,
/*rcap_buf_len*/ sizeof(*rcaplong),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_LBP:
{
struct scsi_vpd_logical_block_prov *lbp;
if (!scsi_vpd_supported_page(periph, SVPD_LBP)) {
/*
* If we get here we don't support any SBC-3 delete
* methods with UNMAP as the Logical Block Provisioning
* VPD page support is required for devices which
* support it according to T10/1799-D Revision 31
* however older revisions of the spec don't mandate
* this so we currently don't remove these methods
* from the available set.
*/
softc->state = DA_STATE_PROBE_BLK_LIMITS;
goto skipstate;
}
lbp = (struct scsi_vpd_logical_block_prov *)
malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO);
if (lbp == NULL) {
printf("dastart: Couldn't malloc lbp data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)lbp,
/*inq_len*/sizeof(*lbp),
/*evpd*/TRUE,
/*page_code*/SVPD_LBP,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BLK_LIMITS:
{
struct scsi_vpd_block_limits *block_limits;
if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) {
/* Not supported, skip to next probe */
softc->state = DA_STATE_PROBE_BDC;
goto skipstate;
}
block_limits = (struct scsi_vpd_block_limits *)
malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO);
if (block_limits == NULL) {
printf("dastart: Couldn't malloc block_limits data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)block_limits,
/*inq_len*/sizeof(*block_limits),
/*evpd*/TRUE,
/*page_code*/SVPD_BLOCK_LIMITS,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BDC:
{
struct scsi_vpd_block_characteristics *bdc;
if (!scsi_vpd_supported_page(periph, SVPD_BDC)) {
softc->state = DA_STATE_PROBE_ATA;
goto skipstate;
}
bdc = (struct scsi_vpd_block_characteristics *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
printf("dastart: Couldn't malloc bdc data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_BDC,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA:
{
struct ata_params *ata_params;
if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* Note that if the ATA VPD page isn't
* supported, we aren't talking to an ATA
* device anyway. Support for that VPD
* page is mandatory for SCSI to ATA (SAT)
* translation layers.
*/
softc->state = DA_STATE_PROBE_ZONE;
goto skipstate;
}
daprobedone(periph, start_ccb);
break;
}
ata_params = (struct ata_params*)
malloc(sizeof(*ata_params), M_SCSIDA, M_NOWAIT|M_ZERO);
if (ata_params == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_params "
"data\n");
/* da_free_periph??? */
break;
}
scsi_ata_identify(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*data_ptr*/(u_int8_t *)ata_params,
/*dxfer_len*/sizeof(*ata_params),
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
int retval;
retval = 0;
if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) {
/*
* If we don't have log support, not much point in
* trying to probe zone support.
*/
daprobedone(periph, start_ccb);
break;
}
/*
* If we have an ATA device (the SCSI ATA Information VPD
* page should be present and the ATA identify should have
* succeeded) and it supports logs, ask for the log directory.
*/
log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/ sizeof(*log_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(log_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_IDDIR:
{
struct ata_identify_log_pages *id_dir;
int retval;
retval = 0;
/*
* Check here to see whether the Identify Device log is
* supported in the directory of logs. If so, continue
* with requesting the log of identify device pages.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) {
daprobedone(periph, start_ccb);
break;
}
id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(id_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_SUP:
{
struct ata_identify_log_sup_cap *sup_cap;
int retval;
retval = 0;
/*
* Check here to see whether the Supported Capabilities log
* is in the list of Identify Device logs.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) {
daprobedone(periph, start_ccb);
break;
}
sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(sup_cap, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_ZONE:
{
struct ata_zoned_info_log *ata_zone;
int retval;
retval = 0;
/*
* Check here to see whether the zoned device information
* page is supported. If so, continue on to request it.
* If not, skip to DA_STATE_PROBE_LOG or done.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) {
daprobedone(periph, start_ccb);
break;
}
ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA,
M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(ata_zone, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ZONE:
{
struct scsi_vpd_zoned_bdc *bdc;
/*
* Note that this page will be supported for SCSI protocol
* devices that support ZBC (SMR devices), as well as ATA
* protocol devices that are behind a SAT (SCSI to ATA
* Translation) layer that supports converting ZBC commands
* to their ZAC equivalents.
*/
if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) {
daprobedone(periph, start_ccb);
break;
}
bdc = (struct scsi_vpd_zoned_bdc *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
xpt_release_ccb(start_ccb);
xpt_print(periph->path, "Couldn't malloc zone VPD "
"data\n");
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_ZONED_BDC,
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE;
xpt_action(start_ccb);
break;
}
}
}
/*
* In each of the methods below, while it's the caller's
* responsibility to ensure the request will fit into a
* single device request, we might have changed the delete
* method due to the device incorrectly advertising either
* its supported methods or limits.
*
* To prevent this causing further issues, we validate the
* request against the method's limits and warn, which would
* otherwise be unnecessary.
*/
static void
da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
struct scsi_unmap_desc *d = (void *)&buf[UNMAP_HEAD_SIZE];
uint64_t lba, lastlba = (uint64_t)-1;
uint64_t totalcount = 0;
uint64_t count;
uint32_t c, lastcount = 0, ranges = 0;
/*
* Currently this doesn't take the UNMAP
* Granularity and Granularity Alignment
* fields into account.
*
* This could result in both suboptimal unmap
* requests as well as UNMAP calls unmapping
* fewer LBAs than requested.
*/
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
bp1 = bp;
do {
/*
* Note: ada and da are different in how they store the
* pending bp's in a trim. ada stores all of them in the
* trim_req.bps. da stores all but the first one in the
* delete_run_queue. ada then completes all the bps in
* its adadone() loop. da completes all the bps in the
* delete_run_queue in dadone, and relies on the biodone
* after to complete. This should be reconciled since there's
* no real reason to do it differently. XXX
*/
if (bp1 != bp)
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, UNMAP_RANGE_MAX - lastcount);
lastlba += c;
lastcount += c;
scsi_ulto4b(lastcount, d[ranges - 1].length);
count -= c;
lba += c;
totalcount += c;
} else if ((softc->quirks & DA_Q_STRICT_UNMAP) &&
softc->unmap_gran != 0) {
/* Align length of the previous range. */
if ((c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c) {
totalcount -= lastcount;
lastlba = (uint64_t)-1;
lastcount = 0;
ranges--;
} else {
totalcount -= c;
lastlba -= c;
lastcount -= c;
scsi_ulto4b(lastcount, d[ranges - 1].length);
}
}
/* Align beginning of the new range. */
c = (lba - softc->unmap_gran_align) % softc->unmap_gran;
if (c != 0) {
c = softc->unmap_gran - c;
if (count <= c) {
count = 0;
} else {
lba += c;
count -= c;
}
}
}
while (count > 0) {
c = omin(count, UNMAP_RANGE_MAX);
if (totalcount + c > softc->unmap_max_lba ||
ranges >= softc->unmap_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld"
"|| %d >= %d",
da_delete_method_desc[softc->delete_method],
totalcount + c, softc->unmap_max_lba,
ranges, softc->unmap_max_ranges);
break;
}
scsi_u64to8b(lba, d[ranges].lba);
scsi_ulto4b(c, d[ranges].length);
lba += c;
totalcount += c;
ranges++;
count -= c;
lastlba = lba;
lastcount = c;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (ranges >= softc->unmap_max_ranges ||
totalcount + bp1->bio_bcount /
softc->params.secsize > softc->unmap_max_lba) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
/* Align length of the last range. */
if ((softc->quirks & DA_Q_STRICT_UNMAP) && softc->unmap_gran != 0 &&
(c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c)
ranges--;
else
scsi_ulto4b(lastcount - c, d[ranges - 1].length);
}
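/*
 * Fill in the 8-byte UNMAP parameter list header: bytes 0-1 hold the
 * length of the data that follows them, bytes 2-3 the total block
 * descriptor length. Each of the 'ranges' descriptors is 16 bytes:
 * an 8-byte LBA, a 4-byte block count, and 4 reserved bytes.
 */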
scsi_ulto2b(ranges * 16 + 6, &buf[0]);
scsi_ulto2b(ranges * 16, &buf[2]);
scsi_unmap(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/0,
/*data_ptr*/ buf,
/*dxfer_len*/ ranges * 16 + 8,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
static void
da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
uint64_t lastlba = (uint64_t)-1;
uint64_t count;
uint64_t lba;
uint32_t lastcount = 0, c, requestcount;
int ranges = 0, off, block_count;
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
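/*
 * Each ATA DSM TRIM range entry is 8 bytes: a 48-bit LBA in
 * little-endian order followed by a 16-bit block count, packed
 * ATA_DSM_BLK_RANGES entries per ATA_DSM_BLK_SIZE-byte block.
 */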
bp1 = bp;
do {
if (bp1 != bp) /* XXX imp XXX */
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
requestcount = count;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * 8;
buf[off + 6] = lastcount & 0xff;
buf[off + 7] = (lastcount >> 8) & 0xff;
count -= c;
lba += c;
}
while (count > 0) {
c = omin(count, ATA_DSM_RANGE_MAX);
off = ranges * 8;
buf[off + 0] = lba & 0xff;
buf[off + 1] = (lba >> 8) & 0xff;
buf[off + 2] = (lba >> 16) & 0xff;
buf[off + 3] = (lba >> 24) & 0xff;
buf[off + 4] = (lba >> 32) & 0xff;
buf[off + 5] = (lba >> 40) & 0xff;
buf[off + 6] = c & 0xff;
buf[off + 7] = (c >> 8) & 0xff;
lba += c;
ranges++;
count -= c;
lastcount = c;
if (count != 0 && ranges == softc->trim_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
requestcount,
(softc->trim_max_ranges - ranges) *
ATA_DSM_RANGE_MAX);
break;
}
}
lastlba = lba;
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (bp1->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
block_count = howmany(ranges, ATA_DSM_BLK_RANGES);
scsi_ata_trim(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
block_count,
/*data_ptr*/buf,
/*dxfer_len*/block_count * ATA_DSM_BLK_SIZE,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
/*
* We calculate ws_max_blks here based off d_delmaxsize instead
* of using softc->ws_max_blks as it is absolute max for the
* device not the protocol max which may well be lower.
*/
static void
da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc;
struct bio *bp1;
uint64_t ws_max_blks;
uint64_t lba;
uint64_t count; /* forward compat with WS32 */
softc = (struct da_softc *)periph->softc;
ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize;
lba = bp->bio_pblkno;
count = 0;
bp1 = bp;
do {
if (bp1 != bp) /* XXX imp XXX */
bioq_insert_tail(&softc->delete_run_queue, bp1);
count += bp1->bio_bcount / softc->params.secsize;
if (count > ws_max_blks) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
count, ws_max_blks);
count = omin(count, ws_max_blks);
break;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (lba + count != bp1->bio_pblkno ||
count + bp1->bio_bcount /
softc->params.secsize > ws_max_blks) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
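/*
 * Issue a single-sector WRITE SAME covering the accumulated range;
 * only one sector of zeroes (zero_region) is transferred and the
 * device replicates it, or unmaps with SWS_UNMAP, across 'count'
 * blocks starting at 'lba'.
 */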
scsi_write_same(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/softc->delete_method ==
DA_DELETE_ZERO ? 0 : SWS_UNMAP,
softc->delete_method == DA_DELETE_WS16 ? 16 : 10,
/*lba*/lba,
/*block_count*/count,
/*data_ptr*/ __DECONST(void *, zero_region),
/*dxfer_len*/ softc->params.secsize,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
static int
cmd6workaround(union ccb *ccb)
{
struct scsi_rw_6 cmd6;
struct scsi_rw_10 *cmd10;
struct da_softc *softc;
u_int8_t *cdb;
struct bio *bp;
int frozen;
cdb = ccb->csio.cdb_io.cdb_bytes;
softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
da_delete_methods old_method = softc->delete_method;
/*
* Typically there are two reasons for failure here:
* 1. Delete method was detected as supported but isn't
* 2. Delete failed due to invalid params, e.g. too big
*
* While we will attempt to choose an alternative delete method,
* this may result in short deletes if the existing delete
* requests from GEOM are too big for the new method chosen.
*
* This method assumes that the error which triggered this
* will not retry the I/O; otherwise a panic will occur.
*/
dadeleteflag(softc, old_method, 0);
dadeletemethodchoose(softc, DA_DELETE_DISABLE);
if (softc->delete_method == DA_DELETE_DISABLE)
xpt_print(ccb->ccb_h.path,
"%s failed, disabling BIO_DELETE\n",
da_delete_method_desc[old_method]);
else
xpt_print(ccb->ccb_h.path,
"%s failed, switching to %s BIO_DELETE\n",
da_delete_method_desc[old_method],
da_delete_method_desc[softc->delete_method]);
while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL)
cam_iosched_queue_work(softc->cam_iosched, bp);
cam_iosched_queue_work(softc->cam_iosched,
(struct bio *)ccb->ccb_h.ccb_bp);
ccb->ccb_h.ccb_bp = NULL;
return (0);
}
/* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == PREVENT_ALLOW) &&
(softc->quirks & DA_Q_NO_PREVENT) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"PREVENT ALLOW MEDIUM REMOVAL not supported.\n");
softc->quirks |= DA_Q_NO_PREVENT;
return (0);
}
/* Detect unsupported SYNCHRONIZE CACHE(10). */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == SYNCHRONIZE_CACHE) &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"SYNCHRONIZE CACHE(10) not supported.\n");
softc->quirks |= DA_Q_NO_SYNC_CACHE;
softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE;
return (0);
}
/* Translation only possible if CDB is an array and cmd is R/W6 */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
(*cdb != READ_6 && *cdb != WRITE_6))
return 0;
xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
"increasing minimum_cmd_size to 10.\n");
softc->minimum_cmd_size = 10;
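/*
 * Rewrite the 6-byte CDB in place as its 10-byte equivalent; e.g. a
 * READ(6) of 8 blocks at LBA 0x12345 becomes a READ(10) with the
 * 3-byte LBA widened to 4 bytes (00 01 23 45) and the 1-byte transfer
 * length widened to 2 bytes (00 08).
 */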
bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
cmd10 = (struct scsi_rw_10 *)cdb;
cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
cmd10->byte2 = 0;
scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
cmd10->reserved = 0;
scsi_ulto2b(cmd6.length, cmd10->length);
cmd10->control = cmd6.control;
ccb->csio.cdb_len = sizeof(*cmd10);
/* Requeue request, unfreezing queue if necessary */
frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_action(ccb);
if (frozen) {
cam_release_devq(ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
return (ERESTART);
}
static void
dazonedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
struct bio *bp;
softc = periph->softc;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
- uint32_t num_alloced, hdr_len, num_avail;
+ uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
int ata;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->csio.dxfer_len - ccb->csio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
bp->bio_resid = ccb->csio.resid;
- num_alloced = rep->entries_allocated;
hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
if (softc->zone_interface == DA_ZONE_IF_ATA_PASS)
ata = 1;
else
ata = 0;
hdr_len = ata ? le32dec(hdr->length) :
scsi_4btoul(hdr->length);
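/*
* The header's length field counts descriptor bytes only
* (it does not include the header itself), so dividing by
* the descriptor size gives the number of zones the drive
* could report.
*/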
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = ata ? le64dec(hdr->maximum_lba) :
scsi_8btou64(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
break;
}
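/*
* Clamp the descriptor count to what actually fit in our
* data buffer; the header may describe more zones than
* were transferred.
*/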
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length =
ata ? le64dec(desc->zone_length) :
scsi_8btou64(desc->zone_length);
entry->zone_start_lba =
ata ? le64dec(desc->zone_start_lba) :
scsi_8btou64(desc->zone_start_lba);
entry->write_pointer_lba =
ata ? le64dec(desc->write_pointer_lba) :
scsi_8btou64(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->csio.data_ptr, M_SCSIDA);
}
static void
dadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
da_ccb_state state;
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n"));
csio = &done_ccb->csio;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (csio->bio != NULL)
biotrack(csio->bio, __func__);
#endif
state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
switch (state) {
case DA_CCB_BUFFER_IO:
case DA_CCB_DELETE:
{
struct bio *bp, *bp1;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
int error;
int sf;
if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0)
sf = SF_RETRY_UA;
else
sf = 0;
error = daerror(done_ccb, CAM_RETRY_SELTO, sf);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
cam_periph_unlock(periph);
return;
}
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if (error != 0) {
int queued_error;
/*
* return all queued I/O with EIO, so that
* the client can retry these I/Os in the
* proper order should it attempt to recover.
*/
queued_error = EIO;
if (error == ENXIO
&& (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
/*
* Catastrophic error. Mark our pack as
* invalid.
*/
/*
* XXX See if this is really a media
* XXX change first?
*/
xpt_print(periph->path,
"Invalidating pack\n");
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
queued_error = ENXIO;
}
cam_iosched_flush(softc->cam_iosched, NULL,
queued_error);
if (bp != NULL) {
bp->bio_error = error;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
}
} else if (bp != NULL) {
if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
bp->bio_error = 0;
if (bp->bio_resid != 0)
bp->bio_flags |= BIO_ERROR;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
} else if (bp != NULL) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
if (bp->bio_cmd == BIO_ZONE)
dazonedone(periph, done_ccb);
else if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
if ((csio->resid > 0)
&& (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
if (softc->error_inject != 0) {
bp->bio_error = softc->error_inject;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
softc->error_inject = 0;
}
}
if (bp != NULL)
biotrack(bp, __func__);
LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
if (LIST_EMPTY(&softc->pending_ccbs))
softc->flags |= DA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
if (state == DA_CCB_DELETE) {
TAILQ_HEAD(, bio) queue;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue);
softc->delete_run_queue.insert_point = NULL;
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* delete_running set to 0 before the call above to
* allow other I/O to progress when many BIO_DELETE
* requests are pushed down. We set delete_running to 0
* and call daschedule again so that we don't stall if
* there are no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
daschedule(periph);
cam_periph_unlock(periph);
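/*
* Complete every bio that was collapsed into this trim
* request with the same status as the trim itself.
*/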
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = bp->bio_error;
if (bp->bio_flags & BIO_ERROR) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
daschedule(periph);
cam_periph_unlock(periph);
}
if (bp != NULL)
biodone(bp);
return;
}
case DA_CCB_PROBE_RC:
case DA_CCB_PROBE_RC16:
{
struct scsi_read_capacity_data *rdcap;
struct scsi_read_capacity_data_long *rcaplong;
char *announce_buf;
int lbp;
lbp = 0;
rdcap = NULL;
rcaplong = NULL;
/* XXX TODO: can this be a malloc? */
announce_buf = softc->announce_temp;
bzero(announce_buf, DA_ANNOUNCETMP_SZ);
if (state == DA_CCB_PROBE_RC)
rdcap =(struct scsi_read_capacity_data *)csio->data_ptr;
else
rcaplong = (struct scsi_read_capacity_data_long *)
csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct disk_params *dp;
uint32_t block_size;
uint64_t maxsector;
u_int lalba; /* Lowest aligned LBA. */
if (state == DA_CCB_PROBE_RC) {
block_size = scsi_4btoul(rdcap->length);
maxsector = scsi_4btoul(rdcap->addr);
lalba = 0;
/*
* According to SBC-2, if the standard 10
* byte READ CAPACITY command returns 2^32,
* we should issue the 16 byte version of
* the command, since the device in question
* has more sectors than can be represented
* with the short version of the command.
*/
if (maxsector == 0xffffffff) {
free(rdcap, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_RC16;
xpt_schedule(periph, priority);
return;
}
} else {
block_size = scsi_4btoul(rcaplong->length);
maxsector = scsi_8btou64(rcaplong->addr);
lalba = scsi_2btoul(rcaplong->lalba_lbp);
}
/*
* Because the GEOM code will just panic us if we
* give it an 'illegal' value, we'll avoid that
* here.
*/
if (block_size == 0) {
block_size = 512;
if (maxsector == 0)
maxsector = -1;
}
if (block_size >= MAXPHYS) {
xpt_print(periph->path,
"unsupportable block size %ju\n",
(uintmax_t) block_size);
announce_buf = NULL;
cam_periph_invalidate(periph);
} else {
/*
* We pass rcaplong into dasetgeom(),
* because it will only use it if it is
* non-NULL.
*/
dasetgeom(periph, block_size, maxsector,
rcaplong, sizeof(*rcaplong));
lbp = (lalba & SRC16_LBPME_A);
dp = &softc->params;
snprintf(announce_buf, DA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)dp->secsize * dp->sectors) /
(1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
}
} else {
int error;
/*
* Retry any UNIT ATTENTION type errors. They
* are expected at boot.
*/
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
} else if (error != 0) {
int asc, ascq;
int sense_key, error_code;
int have_sense;
cam_status status;
struct ccb_getdev cgd;
/* Don't wedge this device's queue */
status = done_ccb->ccb_h.status;
if ((status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
xpt_setup_ccb(&cgd.ccb_h,
done_ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
have_sense = TRUE;
else
have_sense = FALSE;
/*
* If we tried READ CAPACITY(16) and failed,
* fallback to READ CAPACITY(10).
*/
if ((state == DA_CCB_PROBE_RC16) &&
(softc->flags & DA_FLAG_CAN_RC16) &&
(((csio->ccb_h.status & CAM_STATUS_MASK) ==
CAM_REQ_INVALID) ||
((have_sense) &&
(error_code == SSD_CURRENT_ERROR) &&
(sense_key == SSD_KEY_ILLEGAL_REQUEST)))) {
softc->flags &= ~DA_FLAG_CAN_RC16;
free(rdcap, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_RC;
xpt_schedule(periph, priority);
return;
}
/*
* Attach to anything that claims to be a
* direct access or optical disk device,
* as long as it doesn't return a "Logical
* unit not supported" (0x25) error.
* "Internal Target Failure" (0x44) is also
* special and typically means that the
* device is a SATA drive behind a SATL
* translation that's fallen into a
* terminally fatal state.
*/
if ((have_sense)
&& (asc != 0x25) && (asc != 0x44)
&& (error_code == SSD_CURRENT_ERROR)) {
const char *sense_key_desc;
const char *asc_desc;
dasetgeom(periph, 512, -1, NULL, 0);
scsi_sense_desc(sense_key, asc, ascq,
&cgd.inq_data,
&sense_key_desc,
&asc_desc);
snprintf(announce_buf,
DA_ANNOUNCETMP_SZ,
"Attempt to query device "
"size failed: %s, %s",
sense_key_desc, asc_desc);
} else {
if (have_sense)
scsi_sense_print(
&done_ccb->csio);
else {
xpt_print(periph->path,
"got CAM status %#x\n",
done_ccb->ccb_h.status);
}
xpt_print(periph->path, "fatal error, "
"failed to attach to device\n");
announce_buf = NULL;
/*
* Free up resources.
*/
cam_periph_invalidate(periph);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if (announce_buf != NULL &&
((softc->flags & DA_FLAG_ANNOUNCED) == 0)) {
struct sbuf sb;
sbuf_new(&sb, softc->announcebuf, DA_ANNOUNCE_SZ,
SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks,
DA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
/* increase the refcount */
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
taskqueue_enqueue(taskqueue_thread,
&softc->sysctl_task);
} else {
/* XXX This message is useless! */
xpt_print(periph->path, "fatal error, "
"could not acquire reference count\n");
}
}
/* We already probed the device. */
if (softc->flags & DA_FLAG_PROBED) {
daprobedone(periph, done_ccb);
return;
}
/* Ensure re-probe doesn't see old delete. */
softc->delete_available = 0;
dadeleteflag(softc, DA_DELETE_ZERO, 1);
if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) {
/*
* Based on older SBC-3 spec revisions,
* any of the UNMAP methods "may" be
* available via LBP given this flag, so
* we flag all of them as available and
* then remove those which further
* probes confirm aren't available
* later.
*
* We could also check the readcap(16)
* p_type flag to exclude one or more
* invalid write same (X) types here.
*/
dadeleteflag(softc, DA_DELETE_WS16, 1);
dadeleteflag(softc, DA_DELETE_WS10, 1);
dadeleteflag(softc, DA_DELETE_UNMAP, 1);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_LBP;
xpt_schedule(periph, priority);
return;
}
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BDC;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_LBP:
{
struct scsi_vpd_logical_block_prov *lbp;
lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
/*
* T10/1799-D Revision 31 states at least one of these
* must be supported but we don't currently enforce this.
*/
dadeleteflag(softc, DA_DELETE_WS16,
(lbp->flags & SVPD_LBP_WS16));
dadeleteflag(softc, DA_DELETE_WS10,
(lbp->flags & SVPD_LBP_WS10));
dadeleteflag(softc, DA_DELETE_UNMAP,
(lbp->flags & SVPD_LBP_UNMAP));
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure indicates we don't support any SBC-3
* delete methods with UNMAP
*/
}
}
free(lbp, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BLK_LIMITS;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_BLK_LIMITS:
{
struct scsi_vpd_block_limits *block_limits;
block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t max_txfer_len = scsi_4btoul(
block_limits->max_txfer_len);
uint32_t max_unmap_lba_cnt = scsi_4btoul(
block_limits->max_unmap_lba_cnt);
uint32_t max_unmap_blk_cnt = scsi_4btoul(
block_limits->max_unmap_blk_cnt);
uint32_t unmap_gran = scsi_4btoul(
block_limits->opt_unmap_grain);
uint32_t unmap_gran_align = scsi_4btoul(
block_limits->unmap_grain_align);
uint64_t ws_max_blks = scsi_8btou64(
block_limits->max_write_same_length);
if (max_txfer_len != 0) {
softc->disk->d_maxsize = MIN(softc->maxio,
(off_t)max_txfer_len * softc->params.secsize);
}
/*
* We should already support UNMAP but we check lba
* and block count to be sure
*/
if (max_unmap_lba_cnt != 0x00L &&
max_unmap_blk_cnt != 0x00L) {
softc->unmap_max_lba = max_unmap_lba_cnt;
softc->unmap_max_ranges = min(max_unmap_blk_cnt,
UNMAP_MAX_RANGES);
if (unmap_gran > 1) {
softc->unmap_gran = unmap_gran;
if (unmap_gran_align & 0x80000000) {
softc->unmap_gran_align =
unmap_gran_align &
0x7fffffff;
}
}
} else {
/*
* Unexpected UNMAP limits which means the
* device doesn't actually support UNMAP
*/
dadeleteflag(softc, DA_DELETE_UNMAP, 0);
}
if (ws_max_blks != 0x00L)
softc->ws_max_blks = ws_max_blks;
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure here doesn't mean UNMAP is not
* supported as this is an optional page.
*/
softc->unmap_max_lba = 1;
softc->unmap_max_ranges = 1;
}
}
free(block_limits, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BDC;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_BDC:
{
struct scsi_vpd_block_device_characteristics *bdc;
bdc = (struct scsi_vpd_block_device_characteristics *)
csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
/*
* Disable queue sorting for non-rotational media
* by default.
*/
u_int16_t old_rate = softc->disk->d_rotation_rate;
valid_len = csio->dxfer_len - csio->resid;
if (SBDC_IS_PRESENT(bdc, valid_len,
medium_rotation_rate)) {
softc->disk->d_rotation_rate =
scsi_2btoul(bdc->medium_rotation_rate);
if (softc->disk->d_rotation_rate ==
SVPD_BDC_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(
softc->cam_iosched, 0);
softc->rotating = 0;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
}
if ((SBDC_IS_PRESENT(bdc, valid_len, flags))
&& (softc->zone_mode == DA_ZONE_NONE)) {
int ata_proto;
if (scsi_vpd_supported_page(periph,
SVPD_ATA_INFORMATION))
ata_proto = 1;
else
ata_proto = 0;
/*
* The Zoned field will only be set for
* Drive Managed and Host Aware drives. If
* they are Host Managed, the device type
* in the standard INQUIRY data should be
* set to T_ZBC_HM (0x14).
*/
if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_HAW_ZBC) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_DM_ZBC) {
softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) !=
SVPD_ZBC_NR) {
xpt_print(periph->path, "Unknown zoned "
"type %#x",
bdc->flags & SVPD_ZBC_MASK);
}
}
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(bdc, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_ATA;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_ATA:
{
int i;
struct ata_params *ata_params;
int continue_probe;
int error;
uint16_t *ptr;
ata_params = (struct ata_params *)csio->data_ptr;
ptr = (uint16_t *)ata_params;
continue_probe = 0;
error = 0;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint16_t old_rate;
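/*
* The IDENTIFY data is an array of little-endian 16-bit
* words; byte-swap it in place for the host before
* looking at any fields.
*/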
for (i = 0; i < sizeof(*ata_params) / 2; i++)
ptr[i] = le16toh(ptr[i]);
if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM &&
(softc->quirks & DA_Q_NO_UNMAP) == 0) {
dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1);
if (ata_params->max_dsm_blocks != 0)
softc->trim_max_ranges = min(
softc->trim_max_ranges,
ata_params->max_dsm_blocks *
ATA_DSM_BLK_RANGES);
}
/*
* Disable queue sorting for non-rotational media
* by default.
*/
old_rate = softc->disk->d_rotation_rate;
softc->disk->d_rotation_rate =
ata_params->media_rotation_rate;
if (softc->disk->d_rotation_rate ==
ATA_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(softc->cam_iosched, 0);
softc->rotating = 0;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
if (ata_params->capabilities1 & ATA_SUPPORT_DMA)
softc->flags |= DA_FLAG_CAN_ATA_DMA;
if (ata_params->support.extension &
ATA_SUPPORT_GENLOG)
softc->flags |= DA_FLAG_CAN_ATA_LOG;
/*
* At this point, if we have a SATA host aware drive,
* we communicate via ATA passthrough unless the
* SAT layer supports ZBC -> ZAC translation. In
* that case, we prefer to use the ZBC -> ZAC
* translation provided by the SAT layer.
*/
/*
* XXX KDM figure out how to detect a host managed
* SATA drive.
*/
if (softc->zone_mode == DA_ZONE_NONE) {
/*
* Note that we don't override the zone
* mode or interface if it has already been
* set. This is because it has either been
* set as a quirk, or when we probed the
* SCSI Block Device Characteristics page,
* the zoned field was set. The latter
* means that the SAT layer supports ZBC to
* ZAC translation, and we would prefer to
* use that if it is available.
*/
if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface =
DA_ZONE_IF_ATA_PASS;
} else if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED) {
softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
softc->zone_interface =
DA_ZONE_IF_ATA_PASS;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ata_params, M_SCSIDA);
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* If the ATA IDENTIFY failed, we could be talking
* to a SCSI drive, although that seems unlikely,
* since the drive did report that it supported the
* ATA Information VPD page. If the ATA IDENTIFY
* succeeded, and the SAT layer doesn't support
* ZBC -> ZAC translation, continue on to get the
* directory of ATA logs, and complete the rest of
* the ZAC probe. If the SAT layer does support
* ZBC -> ZAC translation, we want to use that,
* and we'll probe the SCSI Zoned Block Device
* Characteristics VPD page next.
*/
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_LOG)
&& (softc->zone_interface == DA_ZONE_IF_ATA_PASS))
softc->state = DA_STATE_PROBE_ATA_LOGDIR;
else
softc->state = DA_STATE_PROBE_ZONE;
continue_probe = 1;
}
if (continue_probe != 0) {
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
} else
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_LOGDIR:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len =
csio->dxfer_len - csio->resid;
if (softc->valid_logdir_len > 0)
bcopy(csio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
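/*
* The directory has one 16-bit page-count entry per log
* address; entry 0 is the directory version word, so the
* count for log N lives at byte offset N * 2 - 2 within
* num_pages[].
*/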
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= DA_FLAG_CAN_ATA_IDLOG;
} else {
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(DA_FLAG_CAN_ATA_LOG |
DA_FLAG_CAN_ATA_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) {
softc->state = DA_STATE_PROBE_ATA_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_IDDIR:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP |
DA_FLAG_CAN_ATA_ZONE);
softc->valid_iddir_len =
csio->dxfer_len - csio->resid;
if (softc->valid_iddir_len > 0)
bcopy(csio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages,entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) ==
ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries &&
i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |=
DA_FLAG_CAN_ATA_SUPCAP;
else if (softc->ata_iddir.entries[i]==
ATA_IDL_ZDI)
softc->flags |=
DA_FLAG_CAN_ATA_ZONE;
if ((softc->flags &
DA_FLAG_CAN_ATA_SUPCAP)
&& (softc->flags &
DA_FLAG_CAN_ATA_ZONE))
break;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory
* reports a non-zero number of pages present
* for this log.
*/
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) {
softc->state = DA_STATE_PROBE_ATA_SUP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_SUP:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)
csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size =
__offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode =
DA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
DA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |=
DA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_ZONE)) {
softc->state = DA_STATE_PROBE_ATA_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_ZONE:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~DA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar =le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~DA_FLAG_CAN_ATA_ZONE;
softc->flags &= ~DA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ZONE:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_len;
struct scsi_vpd_zoned_bdc *zoned_bdc;
error = 0;
zoned_bdc = (struct scsi_vpd_zoned_bdc *)
csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_len = __offsetof(struct scsi_vpd_zoned_bdc,
max_seq_req_zones) + 1 +
sizeof(zoned_bdc->max_seq_req_zones);
if ((valid_len >= needed_len)
&& (scsi_2btoul(zoned_bdc->page_length) >=
SVPD_ZBDC_PL)) {
if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ)
softc->zone_flags |=
DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~DA_ZONE_FLAG_URSWRZ;
softc->optimal_seq_zones =
scsi_4btoul(zoned_bdc->optimal_seq_zones);
softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_nonseq_zones = scsi_4btoul(
zoned_bdc->optimal_nonseq_zones);
softc->zone_flags |=
DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->max_seq_zones =
scsi_4btoul(zoned_bdc->max_seq_req_zones);
softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET;
}
/*
* All of the zone commands are mandatory for SCSI
* devices.
*
* XXX KDM this is valid as of September 2015.
* Re-check this assumption once the SAT spec is
* updated to support SCSI ZBC to ATA ZAC mapping.
* Since ATA allows zone commands to be reported
* as supported or not, this may not necessarily
* be true for an ATA device behind a SAT (SCSI to
* ATA Translation) layer.
*/
softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK;
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_DUMP:
/* No-op. We're polling */
return;
case DA_CCB_TUR:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) ==
ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
xpt_release_ccb(done_ccb);
cam_periph_release_locked(periph);
return;
}
default:
break;
}
xpt_release_ccb(done_ccb);
}
static void
dareprobe(struct cam_periph *periph)
{
struct da_softc *softc;
cam_status status;
softc = (struct da_softc *)periph->softc;
/* Probe in progress; don't interfere. */
if (softc->state != DA_STATE_NORMAL)
return;
status = cam_periph_acquire(periph);
KASSERT(status == CAM_REQ_CMP,
("dareprobe: cam_periph_acquire failed"));
if (softc->flags & DA_FLAG_CAN_RC16)
softc->state = DA_STATE_PROBE_RC16;
else
softc->state = DA_STATE_PROBE_RC;
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
static int
daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct da_softc *softc;
struct cam_periph *periph;
int error, error_code, sense_key, asc, ascq;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->csio.bio != NULL)
biotrack(ccb->csio.bio, __func__);
#endif
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct da_softc *)periph->softc;
/*
* Automatically detect devices that do not support
* READ(6)/WRITE(6) and upgrade to using 10 byte cdbs.
*/
error = 0;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
error = cmd6workaround(ccb);
} else if (scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (sense_key == SSD_KEY_ILLEGAL_REQUEST)
error = cmd6workaround(ccb);
/*
* If the target replied with CAPACITY DATA HAS CHANGED UA,
* query the capacity and notify upper layers.
*/
else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x2A && ascq == 0x09) {
xpt_print(periph->path, "Capacity data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x28 && ascq == 0x00) {
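/*
* ASC/ASCQ 0x28/0x00: "Not ready to ready change,
* medium may have changed".
*/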
softc->flags &= ~DA_FLAG_PROBED;
disk_media_changed(softc->disk, M_NOWAIT);
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x3F && ascq == 0x03) {
xpt_print(periph->path, "INQUIRY data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_NOT_READY &&
asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
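/* ASC 0x3a: medium not present. */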
softc->flags |= DA_FLAG_PACK_INVALID;
disk_media_gone(softc->disk, M_NOWAIT);
}
}
if (error == ERESTART)
return (ERESTART);
#ifdef CAM_IO_STATS
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
softc->errors++;
break;
default:
break;
}
#endif
/*
* XXX
* Until we have a better way of doing pack validation,
* don't treat UAs as errors.
*/
sense_flags |= SF_RETRY_UA;
if (softc->quirks & DA_Q_RETRY_BUSY)
sense_flags |= SF_RETRY_BUSY;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
damediapoll(void *arg)
{
struct cam_periph *periph = arg;
struct da_softc *softc = periph->softc;
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) &&
LIST_EMPTY(&softc->pending_ccbs)) {
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* Queue us up again */
if (da_poll_period != 0)
callout_schedule(&softc->mediapoll_c, da_poll_period * hz);
}
static void
daprevent(struct cam_periph *periph, int action)
{
struct da_softc *softc;
union ccb *ccb;
int error;
softc = (struct da_softc *)periph->softc;
if (((action == PR_ALLOW)
&& (softc->flags & DA_FLAG_PACK_LOCKED) == 0)
|| ((action == PR_PREVENT)
&& (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) {
return;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_prevent(&ccb->csio,
/*retries*/1,
/*cbcfp*/dadone,
MSG_SIMPLE_Q_TAG,
action,
SSD_FULL_SIZE,
5000);
error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat);
if (error == 0) {
if (action == PR_ALLOW)
softc->flags &= ~DA_FLAG_PACK_LOCKED;
else
softc->flags |= DA_FLAG_PACK_LOCKED;
}
xpt_release_ccb(ccb);
}
static void
dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len)
{
struct ccb_calc_geometry ccg;
struct da_softc *softc;
struct disk_params *dp;
u_int lbppbe, lalba;
int error;
softc = (struct da_softc *)periph->softc;
dp = &softc->params;
dp->secsize = block_len;
dp->sectors = maxsector + 1;
if (rcaplong != NULL) {
lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
lalba = scsi_2btoul(rcaplong->lalba_lbp);
lalba &= SRC16_LALBA_A;
} else {
lbppbe = 0;
lalba = 0;
}
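/*
* lbppbe is the logical-blocks-per-physical-block exponent
* from READ CAPACITY(16); e.g. 512-byte logical blocks with
* lbppbe == 3 yield a 4096-byte stripe (physical block),
* and lalba gives the alignment of the lowest aligned LBA
* within it.
*/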
if (lbppbe > 0) {
dp->stripesize = block_len << lbppbe;
dp->stripeoffset = (dp->stripesize - block_len * lalba) %
dp->stripesize;
} else if (softc->quirks & DA_Q_4K) {
dp->stripesize = 4096;
dp->stripeoffset = 0;
} else if (softc->unmap_gran != 0) {
dp->stripesize = block_len * softc->unmap_gran;
dp->stripeoffset = (dp->stripesize - block_len *
softc->unmap_gran_align) % dp->stripesize;
} else {
dp->stripesize = 0;
dp->stripeoffset = 0;
}
/*
* Have the controller provide us with a geometry
* for this disk. The only time the geometry
* matters is when we boot and the controller
* is the only one knowledgeable enough to come
* up with something that will make this a bootable
* device.
*/
xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ccg.ccb_h.func_code = XPT_CALC_GEOMETRY;
ccg.block_size = dp->secsize;
ccg.volume_size = dp->sectors;
ccg.heads = 0;
ccg.secs_per_track = 0;
ccg.cylinders = 0;
xpt_action((union ccb*)&ccg);
if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
/*
* We don't know what went wrong here, but just pick
* a geometry so we don't have nasty things like divide
* by zero.
*/
dp->heads = 255;
dp->secs_per_track = 255;
dp->cylinders = dp->sectors / (255 * 255);
if (dp->cylinders == 0) {
dp->cylinders = 1;
}
} else {
dp->heads = ccg.heads;
dp->secs_per_track = ccg.secs_per_track;
dp->cylinders = ccg.cylinders;
}
/*
* If the user supplied a read capacity buffer, and if it is
* different than the previous buffer, update the data in the EDT.
* If it's the same, we don't bother. This avoids sending an
* update every time someone opens this device.
*/
if ((rcaplong != NULL)
&& (bcmp(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len)) != 0)) {
struct ccb_dev_advinfo cdai;
xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
cdai.buftype = CDAI_TYPE_RCAPLONG;
cdai.flags = CDAI_FLAG_STORE;
cdai.bufsiz = rcap_len;
cdai.buf = (uint8_t *)rcaplong;
xpt_action((union ccb *)&cdai);
if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
if (cdai.ccb_h.status != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: failed to set read "
"capacity advinfo\n", __func__);
/* Use cam_error_print() to decode the status */
cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS,
CAM_EPF_ALL);
} else {
bcopy(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len));
}
}
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors;
softc->disk->d_stripesize = softc->params.stripesize;
softc->disk->d_stripeoffset = softc->params.stripeoffset;
/* XXX: these are not actually "firmware" values, so they may be wrong */
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
softc->disk->d_devstat->block_size = softc->params.secsize;
softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
error = disk_resize(softc->disk, M_NOWAIT);
if (error != 0)
xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error);
}
static void
dasendorderedtag(void *arg)
{
struct da_softc *softc = arg;
if (da_send_ordered) {
if (!LIST_EMPTY(&softc->pending_ccbs)) {
if ((softc->flags & DA_FLAG_WAS_OTAG) == 0)
softc->flags |= DA_FLAG_NEED_OTAG;
softc->flags &= ~DA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, softc);
}
/*
* Step through all DA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
dashutdown(void * arg, int howto)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &dadriver) {
softc = (struct da_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & DA_FLAG_OPEN)) {
dadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & DA_FLAG_OPEN) == 0)
|| (softc->quirks & DA_Q_NO_SYNC_CACHE)) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio,
/*retries*/0,
/*cbfcnp*/dadone,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /* whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
60 * 60 * 1000);
error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
#else /* !_KERNEL */
/*
* XXX These are only left out of the kernel build to silence warnings. If,
* for some reason, these functions are used in the kernel, the ifdefs should
* be moved so they are included both in the kernel and userland.
*/
void
scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_format_unit *scsi_cmd;
scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = FORMAT_UNIT;
scsi_cmd->byte2 = byte2;
scsi_ulto2b(ileave, scsi_cmd->interleave);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t list_format,
uint32_t addr_desc_index, uint8_t *data_ptr,
uint32_t dxfer_len, int minimum_cmd_size,
uint8_t sense_len, uint32_t timeout)
{
uint8_t cdb_len;
/*
* These conditions allow using the 10 byte command. Otherwise we
* need to use the 12 byte command.
*/
if ((minimum_cmd_size <= 10)
&& (addr_desc_index == 0)
&& (dxfer_len <= SRDD10_MAX_LENGTH)) {
struct scsi_read_defect_data_10 *cdb10;
cdb10 = (struct scsi_read_defect_data_10 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb10);
bzero(cdb10, cdb_len);
cdb10->opcode = READ_DEFECT_DATA_10;
cdb10->format = list_format;
scsi_ulto2b(dxfer_len, cdb10->alloc_length);
} else {
struct scsi_read_defect_data_12 *cdb12;
cdb12 = (struct scsi_read_defect_data_12 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb12);
bzero(cdb12, cdb_len);
cdb12->opcode = READ_DEFECT_DATA_12;
cdb12->format = list_format;
scsi_ulto4b(dxfer_len, cdb12->alloc_length);
scsi_ulto4b(addr_desc_index, cdb12->address_descriptor_index);
}
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
data_ptr,
dxfer_len,
sense_len,
cdb_len,
timeout);
}
void
scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t control,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_sanitize *scsi_cmd;
scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = SANITIZE;
scsi_cmd->byte2 = byte2;
scsi_cmd->control = control;
scsi_ulto2b(dxfer_len, scsi_cmd->length);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
#endif /* _KERNEL */
void
scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_id,
uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_out *scsi_cmd;
scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_OUT;
scsi_cmd->service_action = service_action;
scsi_u64to8b(zone_id, scsi_cmd->zone_id);
scsi_cmd->zone_flags = zone_flags;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba,
uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_in *scsi_cmd;
scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_IN;
scsi_cmd->service_action = service_action;
scsi_ulto4b(dxfer_len, scsi_cmd->length);
scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba);
scsi_cmd->zone_options = zone_options;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_IN : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
int
scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol, ata_flags;
uint16_t features_out;
uint32_t sectors_out, auxiliary;
int retval;
retval = 0;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_OUT;
features_out = (zm_action & 0xf) | (zone_flags << 8);
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
protocol = AP_PROTO_NON_DATA;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
sectors_out = 0;
} else {
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT |
AP_FLAG_TDIR_TO_DEV;
sectors_out = ((dxfer_len >> 9) & 0xffff);
}
auxiliary = 0;
} else {
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
command_out = ATA_NCQ_NON_DATA;
features_out = ATA_NCQ_ZAC_MGMT_OUT;
/*
* We're assuming the SCSI to ATA translation layer
* will set the NCQ tag number in the tag field.
* That isn't clear from the SAT-4 spec (as of rev 05).
*/
sectors_out = 0;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
} else {
command_out = ATA_SEND_FPDMA_QUEUED;
/*
* Note that we're defaulting to normal priority,
* and assuming that the SCSI to ATA translation
* layer will insert the NCQ tag number in the tag
* field. That isn't clear in the SAT-4 spec (as
* of rev 05).
*/
sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8;
ata_flags |= AP_FLAG_TLEN_FEAT |
AP_FLAG_TDIR_TO_DEV;
/*
* For SEND FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse
* the SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
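/*
* e.g. a 1 MiB transfer is 2048 512-byte blocks,
* so features_out becomes 0x0800.
*/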
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
int
scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol;
uint16_t features_out, sectors_out;
uint32_t auxiliary;
int ata_flags;
int retval;
retval = 0;
ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_IN;
/* XXX KDM put a macro here */
features_out = (zm_action & 0xf) | (zone_flags << 8);
sectors_out = dxfer_len >> 9; /* XXX KDM macro */
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT;
auxiliary = 0;
} else {
ata_flags |= AP_FLAG_TLEN_FEAT;
command_out = ATA_RECV_FPDMA_QUEUED;
sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8;
/*
* For RECEIVE FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse
* the SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
Index: head/sys/cam/scsi/scsi_pass.c
===================================================================
--- head/sys/cam/scsi/scsi_pass.c (revision 327172)
+++ head/sys/cam/scsi/scsi_pass.c (revision 327173)
@@ -1,2278 +1,2276 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, 1998, 2000 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/devicestat.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/sdt.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <machine/bus.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_xpt.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_debug.h>
#include <cam/cam_compat.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_pass.h>
typedef enum {
PASS_FLAG_OPEN = 0x01,
PASS_FLAG_LOCKED = 0x02,
PASS_FLAG_INVALID = 0x04,
PASS_FLAG_INITIAL_PHYSPATH = 0x08,
PASS_FLAG_ZONE_INPROG = 0x10,
PASS_FLAG_ZONE_VALID = 0x20,
PASS_FLAG_UNMAPPED_CAPABLE = 0x40,
PASS_FLAG_ABANDONED_REF_SET = 0x80
} pass_flags;
typedef enum {
PASS_STATE_NORMAL
} pass_state;
typedef enum {
PASS_CCB_BUFFER_IO,
PASS_CCB_QUEUED_IO
} pass_ccb_types;
#define ccb_type ppriv_field0
#define ccb_ioreq ppriv_ptr1
/*
* The maximum number of memory segments we preallocate.
*/
#define PASS_MAX_SEGS 16
typedef enum {
PASS_IO_NONE = 0x00,
PASS_IO_USER_SEG_MALLOC = 0x01,
PASS_IO_KERN_SEG_MALLOC = 0x02,
PASS_IO_ABANDONED = 0x04
} pass_io_flags;
struct pass_io_req {
union ccb ccb;
union ccb *alloced_ccb;
union ccb *user_ccb_ptr;
camq_entry user_periph_links;
ccb_ppriv_area user_periph_priv;
struct cam_periph_map_info mapinfo;
pass_io_flags flags;
ccb_flags data_flags;
int num_user_segs;
bus_dma_segment_t user_segs[PASS_MAX_SEGS];
int num_kern_segs;
bus_dma_segment_t kern_segs[PASS_MAX_SEGS];
bus_dma_segment_t *user_segptr;
bus_dma_segment_t *kern_segptr;
int num_bufs;
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint8_t *user_bufs[CAM_PERIPH_MAXMAPS];
uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS];
struct bintime start_time;
TAILQ_ENTRY(pass_io_req) links;
};
struct pass_softc {
pass_state state;
pass_flags flags;
u_int8_t pd_type;
union ccb saved_ccb;
int open_count;
u_int maxio;
struct devstat *device_stats;
struct cdev *dev;
struct cdev *alias_dev;
struct task add_physpath_task;
struct task shutdown_kqueue_task;
struct selinfo read_select;
TAILQ_HEAD(, pass_io_req) incoming_queue;
TAILQ_HEAD(, pass_io_req) active_queue;
TAILQ_HEAD(, pass_io_req) abandoned_queue;
TAILQ_HEAD(, pass_io_req) done_queue;
struct cam_periph *periph;
char zone_name[12];
char io_zone_name[12];
uma_zone_t pass_zone;
uma_zone_t pass_io_zone;
size_t io_zone_size;
};
static d_open_t passopen;
static d_close_t passclose;
static d_ioctl_t passioctl;
static d_ioctl_t passdoioctl;
static d_poll_t passpoll;
static d_kqfilter_t passkqfilter;
static void passreadfiltdetach(struct knote *kn);
static int passreadfilt(struct knote *kn, long hint);
static periph_init_t passinit;
static periph_ctor_t passregister;
static periph_oninv_t passoninvalidate;
static periph_dtor_t passcleanup;
static periph_start_t passstart;
static void pass_shutdown_kqueue(void *context, int pending);
static void pass_add_physpath(void *context, int pending);
static void passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void passdone(struct cam_periph *periph,
union ccb *done_ccb);
static int passcreatezone(struct cam_periph *periph);
static void passiocleanup(struct pass_softc *softc,
struct pass_io_req *io_req);
static int passcopysglist(struct cam_periph *periph,
struct pass_io_req *io_req,
ccb_flags direction);
static int passmemsetup(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passmemdone(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int passsendccb(struct cam_periph *periph, union ccb *ccb,
union ccb *inccb);
static struct periph_driver passdriver =
{
passinit, "pass",
TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(pass, passdriver);
static struct cdevsw pass_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE,
.d_open = passopen,
.d_close = passclose,
.d_ioctl = passioctl,
.d_poll = passpoll,
.d_kqfilter = passkqfilter,
.d_name = "pass",
};
static struct filterops passread_filtops = {
.f_isfd = 1,
.f_detach = passreadfiltdetach,
.f_event = passreadfilt
};
static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers");
static void
passinit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("pass: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
static void
passrejectios(struct cam_periph *periph)
{
struct pass_io_req *io_req, *io_req2;
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* The user can no longer get status for I/O on the done queue, so
* clean up all outstanding I/O on the done queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* The underlying device is gone, so we can't issue these I/Os.
* The devfs node has been shut down, so we can't return status to
* the user. Free any I/O left on the incoming queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* Normally we would put I/Os on the abandoned queue and acquire a
* reference when we saw the final close. But, the device went
* away and devfs may have moved everything off to deadfs by the
* time the I/O done callback is called; as a result, we won't see
* any more closes. So, if we have any active I/Os, we need to put
* them on the abandoned queue. When the abandoned queue is empty,
* we'll release the remaining reference (see below) to the peripheral.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links);
}
/*
* If we put any I/O on the abandoned queue, acquire a reference.
*/
if ((!TAILQ_EMPTY(&softc->abandoned_queue))
&& ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
}
static void
passdevgonecb(void *arg)
{
struct cam_periph *periph;
struct mtx *mtx;
struct pass_softc *softc;
int i;
periph = (struct cam_periph *)arg;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = (struct pass_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
* Accordingly, inform all queued I/Os of their fate.
*/
cam_periph_release_locked(periph);
passrejectios(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(mtx);
/*
* We have to remove our kqueue context from a thread because it
* may sleep. It would be nice if we could get a callback from
* kqueue when it is done cleaning up resources.
*/
taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task);
}
static void
passoninvalidate(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, passasync, periph, periph->path);
softc->flags |= PASS_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, passdevgonecb, periph);
}
static void
passcleanup(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(TAILQ_EMPTY(&softc->active_queue),
("%s called when there are commands on the active queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->abandoned_queue),
("%s called when there are commands on the abandoned queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->incoming_queue),
("%s called when there are commands on the incoming queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->done_queue),
("%s called when there are commands on the done queue!\n",
__func__));
devstat_remove_entry(softc->device_stats);
cam_periph_unlock(periph);
/*
* We call taskqueue_drain() for the physpath task to make sure it
* is complete. We drop the lock because this can potentially
* sleep. XXX KDM that is bad. Need a way to get a callback when
* a taskqueue is drained.
*
* Note that we don't drain the kqueue shutdown task queue. This
* is because we hold a reference on the periph for kqueue, and
* release that reference from the kqueue shutdown task queue. So
* we cannot come into this routine unless we've released that
* reference. Also, because that could be the last reference, we
* could be called from the cam_periph_release() call in
* pass_shutdown_kqueue(). In that case, the taskqueue_drain()
* would deadlock. It would be preferable if we had a way to
* get a callback when a taskqueue is done.
*/
taskqueue_drain(taskqueue_thread, &softc->add_physpath_task);
cam_periph_lock(periph);
free(softc, M_DEVBUF);
}
static void
pass_shutdown_kqueue(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = context;
softc = periph->softc;
knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0);
knlist_destroy(&softc->read_select.si_note);
/*
* Release the reference we held for kqueue.
*/
cam_periph_release(periph);
}
static void
pass_add_physpath(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
char *physpath;
/*
* If we have one, create a devfs alias for our
* physical path.
*/
periph = context;
softc = periph->softc;
physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
if (periph->flags & CAM_PERIPH_INVALID)
goto out;
if (xpt_getattr(physpath, MAXPATHLEN,
"GEOM::physpath", periph->path) == 0
&& strlen(physpath) != 0) {
mtx_unlock(mtx);
make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev,
softc->dev, softc->alias_dev, physpath);
mtx_lock(mtx);
}
out:
/*
* Now that we've made our alias, we no longer have to have a
* reference to the device.
*/
if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0)
softc->flags |= PASS_FLAG_INITIAL_PHYSPATH;
/*
* We always acquire a reference to the periph before queueing this
* task queue function, so it won't go away before we run.
*/
while (pending-- > 0)
cam_periph_release_locked(periph);
mtx_unlock(mtx);
free(physpath, M_DEVBUF);
}
static void
passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(passregister, passoninvalidate,
passcleanup, passstart, "pass",
CAM_PERIPH_BIO, path,
passasync, AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG) {
const struct cam_status_entry *entry;
entry = cam_fetch_status_entry(status);
printf("passasync: Unable to attach new device "
"due to status %#x: %s\n", status, entry ?
entry->status_text : "Unknown");
}
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct pass_softc *softc;
cam_status status;
softc = (struct pass_softc *)periph->softc;
/*
* Acquire a reference to the periph before we
* start the taskqueue, so that we don't run into
* a situation where the periph goes away before
* the task queue has a chance to run.
*/
status = cam_periph_acquire(periph);
if (status != CAM_REQ_CMP)
break;
taskqueue_enqueue(taskqueue_thread,
&softc->add_physpath_task);
}
break;
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static cam_status
passregister(struct cam_periph *periph, void *arg)
{
struct pass_softc *softc;
struct ccb_getdev *cgd;
struct ccb_pathinq cpi;
struct make_dev_args args;
int error, no_tags;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("%s: no getdev CCB, can't register device\n", __func__);
return(CAM_REQ_CMP_ERR);
}
softc = (struct pass_softc *)malloc(sizeof(*softc),
M_DEVBUF, M_NOWAIT);
if (softc == NULL) {
printf("%s: Unable to probe new device. "
"Unable to allocate softc\n", __func__);
return(CAM_REQ_CMP_ERR);
}
bzero(softc, sizeof(*softc));
softc->state = PASS_STATE_NORMAL;
if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI)
softc->pd_type = SID_TYPE(&cgd->inq_data);
else if (cgd->protocol == PROTO_SATAPM)
softc->pd_type = T_ENCLOSURE;
else
softc->pd_type = T_DIRECT;
periph->softc = softc;
softc->periph = periph;
TAILQ_INIT(&softc->incoming_queue);
TAILQ_INIT(&softc->active_queue);
TAILQ_INIT(&softc->abandoned_queue);
TAILQ_INIT(&softc->done_queue);
snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d",
periph->periph_name, periph->unit_number);
snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO",
periph->periph_name, periph->unit_number);
softc->io_zone_size = MAXPHYS;
knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph));
xpt_path_inq(&cpi, periph->path);
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
else if (cpi.maxio > MAXPHYS)
softc->maxio = MAXPHYS; /* for safety */
else
softc->maxio = cpi.maxio; /* real value */
if (cpi.hba_misc & PIM_UNMAPPED)
softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE;
/*
* We pass in 0 for a blocksize, since we don't
* know what the blocksize of this device is, if
* it even has a blocksize.
*/
cam_periph_unlock(periph);
no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0;
softc->device_stats = devstat_new_entry("pass",
periph->unit_number, 0,
DEVSTAT_NO_BLOCKSIZE
| (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0),
softc->pd_type |
XPORT_DEVSTAT_TYPE(cpi.transport) |
DEVSTAT_TYPE_PASS,
DEVSTAT_PRIORITY_PASS);
/*
* Initialize the taskqueue handler for shutting down kqueue.
*/
TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0,
pass_shutdown_kqueue, periph);
/*
* Acquire a reference to the periph that we can release once we've
* cleaned up the kqueue.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */
make_dev_args_init(&args);
args.mda_devsw = &pass_cdevsw;
args.mda_unit = periph->unit_number;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0600;
args.mda_si_drv1 = periph;
error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name,
periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
cam_periph_release_locked(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Hold a reference to the periph before we create the physical
* path alias so it can't go away.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
cam_periph_lock(periph);
TASK_INIT(&softc->add_physpath_task, /*priority*/0,
pass_add_physpath, periph);
/*
* See if physical path information is already available.
*/
taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task);
/*
* Add an async callback so that we get notified if
* this device goes away or its physical path
* (stored in the advanced info data of the EDT) has
* changed.
*/
xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
passasync, periph, periph->path);
if (bootverbose)
xpt_announce_periph(periph, NULL);
return(CAM_REQ_CMP);
}
static int
passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
periph = (struct cam_periph *)dev->si_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP)
return (ENXIO);
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
if (softc->flags & PASS_FLAG_INVALID) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(ENXIO);
}
/*
* Don't allow access when we're running at a high securelevel.
*/
error = securelevel_gt(td->td_ucred, 1);
if (error) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(error);
}
/*
* Only allow read-write access.
*/
if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EPERM);
}
/*
* We don't allow nonblocking access.
*/
if ((flags & O_NONBLOCK) != 0) {
xpt_print(periph->path, "can't do nonblocking access\n");
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EINVAL);
}
softc->open_count++;
cam_periph_unlock(periph);
return (error);
}
static int
passclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
periph = (struct cam_periph *)dev->si_drv1;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = periph->softc;
softc->open_count--;
if (softc->open_count == 0) {
struct pass_io_req *io_req, *io_req2;
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* If there are any active I/Os, we need to forcibly acquire a
* reference to the peripheral so that we don't go away
* before they complete. We'll release the reference when
* the abandoned queue is empty.
*/
io_req = TAILQ_FIRST(&softc->active_queue);
if ((io_req != NULL)
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
/*
* Since the I/O in the active queue is not under our
* control, just set a flag so that we can clean it up when
* it completes and put it on the abandoned queue. This
* will prevent our sending spurious completions in the
* event that the device is opened again before these I/Os
* complete.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req,
links);
}
}
cam_periph_release_locked(periph);
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(mtx);
return (0);
}
static void
passstart(struct cam_periph *periph, union ccb *start_ccb)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
switch (softc->state) {
case PASS_STATE_NORMAL: {
struct pass_io_req *io_req;
/*
* Check for any queued I/O requests that require an
* allocated slot.
*/
io_req = TAILQ_FIRST(&softc->incoming_queue);
if (io_req == NULL) {
xpt_release_ccb(start_ccb);
break;
}
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
/*
* Merge the user's CCB into the allocated CCB.
*/
xpt_merge_ccb(start_ccb, &io_req->ccb);
start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO;
start_ccb->ccb_h.ccb_ioreq = io_req;
start_ccb->ccb_h.cbfcnp = passdone;
io_req->alloced_ccb = start_ccb;
binuptime(&io_req->start_time);
devstat_start_transaction(softc->device_stats,
&io_req->start_time);
xpt_action(start_ccb);
/*
* If we have any more I/O waiting, schedule ourselves again.
*/
if (!TAILQ_EMPTY(&softc->incoming_queue))
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
break;
}
default:
break;
}
}
static void
passdone(struct cam_periph *periph, union ccb *done_ccb)
{
struct pass_softc *softc;
struct ccb_scsiio *csio;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
csio = &done_ccb->csio;
switch (csio->ccb_h.ccb_type) {
case PASS_CCB_QUEUED_IO: {
struct pass_io_req *io_req;
io_req = done_ccb->ccb_h.ccb_ioreq;
#if 0
xpt_print(periph->path, "%s: called for user CCB %p\n",
__func__, io_req->user_ccb_ptr);
#endif
if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
&& (done_ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER)
&& ((io_req->flags & PASS_IO_ABANDONED) == 0)) {
int error;
error = passerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
}
}
/*
* Copy the allocated CCB contents back to the malloced CCB
* so we can give status back to the user when he requests it.
*/
bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb));
/*
* Log data/transaction completion with devstat(9).
*/
switch (done_ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->csio.dxfer_len - done_ccb->csio.resid,
done_ccb->csio.tag_action & 0x3,
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_ATA_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->ataio.dxfer_len - done_ccb->ataio.resid,
0, /* Not used in ATA */
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_SMP_IO:
/*
* XXX KDM this isn't quite right, but there isn't
* currently an easy way to represent a bidirectional
* transfer in devstat. The only way to do it
* and have the byte counts come out right would
* mean that we would have to record two
* transactions, one for the request and one for the
* response. For now, so that we report something,
* just treat the entire thing as a read.
*/
devstat_end_transaction(softc->device_stats,
done_ccb->smpio.smp_request_len +
done_ccb->smpio.smp_response_len,
DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL,
&io_req->start_time);
break;
default:
devstat_end_transaction(softc->device_stats, 0,
DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL,
&io_req->start_time);
break;
}
/*
* In the normal case, take the completed I/O off of the
* active queue and put it on the done queue. Notify the
* user that we have a completed I/O.
*/
if ((io_req->flags & PASS_IO_ABANDONED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
selwakeuppri(&softc->read_select, PRIBIO);
KNOTE_LOCKED(&softc->read_select.si_note, 0);
} else {
/*
* In the case of an abandoned I/O (final close
* without fetching the I/O), take it off of the
* abandoned queue and free it.
*/
TAILQ_REMOVE(&softc->abandoned_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
/*
* Release the done_ccb here, since we may wind up
* freeing the peripheral when we decrement the
* reference count below.
*/
xpt_release_ccb(done_ccb);
/*
* If the abandoned queue is empty, we can release
* our reference to the periph since we won't have
* any more completions coming.
*/
if ((TAILQ_EMPTY(&softc->abandoned_queue))
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) {
softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET;
cam_periph_release_locked(periph);
}
/*
* We have already released the CCB, so we can
* return.
*/
return;
}
break;
}
}
xpt_release_ccb(done_ccb);
}
static int
passcreatezone(struct cam_periph *periph)
{
struct pass_softc *softc;
int error;
error = 0;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0),
("%s called when the pass(4) zone is valid!\n", __func__));
KASSERT((softc->pass_zone == NULL),
("%s called when the pass(4) zone is allocated!\n", __func__));
if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) {
/*
* We're the first context through, so we need to create
* the pass(4) UMA zone for I/O requests.
*/
softc->flags |= PASS_FLAG_ZONE_INPROG;
/*
* uma_zcreate() does a blocking (M_WAITOK) allocation,
* so we cannot hold a mutex while we call it.
*/
cam_periph_unlock(periph);
softc->pass_zone = uma_zcreate(softc->zone_name,
sizeof(struct pass_io_req), NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
softc->pass_io_zone = uma_zcreate(softc->io_zone_name,
softc->io_zone_size, NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
cam_periph_lock(periph);
if ((softc->pass_zone == NULL)
|| (softc->pass_io_zone == NULL)) {
if (softc->pass_zone == NULL)
xpt_print(periph->path, "unable to allocate "
"IO Req UMA zone\n");
else
xpt_print(periph->path, "unable to allocate "
"IO UMA zone\n");
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
goto bailout;
}
/*
* Set the flags appropriately and notify any other waiters.
*/
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
softc->flags |= PASS_FLAG_ZONE_VALID;
wakeup(&softc->pass_zone);
} else {
/*
* In this case, the UMA zone has not yet been created, but
* another context is in the process of creating it. We
* need to sleep until the creation is either done or has
* failed.
*/
while ((softc->flags & PASS_FLAG_ZONE_INPROG)
&& ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) {
error = msleep(&softc->pass_zone,
cam_periph_mtx(periph), PRIBIO,
"paszon", 0);
if (error != 0)
goto bailout;
}
/*
* If the zone creation failed, no luck for the user.
*/
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = ENOMEM;
goto bailout;
}
}
bailout:
return (error);
}
static void
passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req)
{
union ccb *ccb;
u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
int i, numbufs;
ccb = &io_req->ccb;
switch (ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
numbufs = min(io_req->num_bufs, 2);
if (numbufs == 1) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
}
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
data_ptrs[0] = &ccb->csio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_ATA_IO:
data_ptrs[0] = &ccb->ataio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_SMP_IO:
numbufs = min(io_req->num_bufs, 2);
data_ptrs[0] = &ccb->smpio.smp_request;
data_ptrs[1] = &ccb->smpio.smp_response;
break;
case XPT_DEV_ADVINFO:
numbufs = min(io_req->num_bufs, 1);
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
break;
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
data_ptrs[0] = &ccb->nvmeio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
default:
/* allow ourselves to be swapped once again */
return;
break; /* NOTREACHED */
}
if (io_req->flags & PASS_IO_USER_SEG_MALLOC) {
free(io_req->user_segptr, M_SCSIPASS);
io_req->user_segptr = NULL;
}
/*
* We only want to free memory we malloced.
*/
if (io_req->data_flags == CAM_DATA_VADDR) {
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->kern_bufs[i] == NULL)
continue;
free(io_req->kern_bufs[i], M_SCSIPASS);
io_req->kern_bufs[i] = NULL;
}
} else if (io_req->data_flags == CAM_DATA_SG) {
for (i = 0; i < io_req->num_kern_segs; i++) {
if ((uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr == NULL)
continue;
uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr);
io_req->kern_segptr[i].ds_addr = 0;
}
}
if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) {
free(io_req->kern_segptr, M_SCSIPASS);
io_req->kern_segptr = NULL;
}
if (io_req->data_flags != CAM_DATA_PADDR) {
for (i = 0; i < numbufs; i++) {
/*
* Restore the user's buffer pointers to their
* previous values.
*/
if (io_req->user_bufs[i] != NULL)
*data_ptrs[i] = io_req->user_bufs[i];
}
}
}
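/*
 * Copy data between the user's S/G list and the kernel's S/G list
 * for a CAM_DATA_SG request.  The two lists are walked in parallel:
 * each iteration copies min(bytes remaining in the current user
 * segment, bytes remaining in the current kernel segment), with the
 * watermarks tracking the offset into each current segment, and
 * whichever segment is exhausted is advanced.  The direction
 * argument selects copyin() versus copyout().
 */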
static int
passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req,
ccb_flags direction)
{
bus_size_t kern_watermark, user_watermark, len_copied, len_to_copy;
bus_dma_segment_t *user_sglist, *kern_sglist;
int i, j, error;
error = 0;
kern_watermark = 0;
user_watermark = 0;
len_to_copy = 0;
len_copied = 0;
user_sglist = io_req->user_segptr;
kern_sglist = io_req->kern_segptr;
for (i = 0, j = 0; i < io_req->num_user_segs &&
j < io_req->num_kern_segs;) {
uint8_t *user_ptr, *kern_ptr;
len_to_copy = min(user_sglist[i].ds_len - user_watermark,
kern_sglist[j].ds_len - kern_watermark);
user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr;
user_ptr = user_ptr + user_watermark;
kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr;
kern_ptr = kern_ptr + kern_watermark;
user_watermark += len_to_copy;
kern_watermark += len_to_copy;
if (!useracc(user_ptr, len_to_copy,
(direction == CAM_DIR_IN) ? VM_PROT_WRITE : VM_PROT_READ)) {
xpt_print(periph->path, "%s: unable to access user "
"S/G list element %p len %zu\n", __func__,
user_ptr, len_to_copy);
error = EFAULT;
goto bailout;
}
if (direction == CAM_DIR_IN) {
error = copyout(kern_ptr, user_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyout of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
kern_ptr, user_ptr, error);
goto bailout;
}
} else {
error = copyin(user_ptr, kern_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyin of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
user_ptr, kern_ptr, error);
goto bailout;
}
}
len_copied += len_to_copy;
if (user_sglist[i].ds_len == user_watermark) {
i++;
user_watermark = 0;
}
if (kern_sglist[j].ds_len == kern_watermark) {
j++;
kern_watermark = 0;
}
}
bailout:
return (error);
}
static int
passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req)
{
union ccb *ccb;
struct pass_softc *softc;
int numbufs, i;
uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t num_segs;
uint16_t *seg_cnt_ptr;
size_t maxmap;
int error;
cam_periph_assert(periph, MA_NOTOWNED);
softc = periph->softc;
error = 0;
ccb = &io_req->ccb;
maxmap = 0;
num_segs = 0;
seg_cnt_ptr = NULL;
switch(ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
if (ccb->cdm.match_buf_len == 0) {
printf("%s: invalid match buffer length 0\n", __func__);
return(EINVAL);
}
if (ccb->cdm.pattern_buf_len > 0) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
lengths[0] = ccb->cdm.pattern_buf_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
lengths[1] = ccb->cdm.match_buf_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
lengths[0] = ccb->cdm.match_buf_len;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
}
io_req->data_flags = CAM_DATA_VADDR;
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* The user shouldn't be able to supply a bio.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
return (EINVAL);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->csio.data_ptr;
lengths[0] = ccb->csio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->csio.sglist_cnt;
seg_cnt_ptr = &ccb->csio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_ATA_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* We only support a single virtual address for ATA I/O.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->ataio.data_ptr;
lengths[0] = ccb->ataio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_SMP_IO:
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->smpio.smp_request;
lengths[0] = ccb->smpio.smp_request_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = &ccb->smpio.smp_response;
lengths[1] = ccb->smpio.smp_response_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
maxmap = softc->maxio;
break;
case XPT_DEV_ADVINFO:
if (ccb->cdai.bufsiz == 0)
return (0);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
lengths[0] = ccb->cdai.bufsiz;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
break;
case XPT_NVME_ADMIN:
case XPT_NVME_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return (0);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->nvmeio.data_ptr;
lengths[0] = ccb->nvmeio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->nvmeio.sglist_cnt;
seg_cnt_ptr = &ccb->nvmeio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
default:
return(EINVAL);
break; /* NOTREACHED */
}
io_req->num_bufs = numbufs;
/*
* If there is a maximum, check to make sure that the user's
* request fits within the limit. In general, we should only have
* a maximum length for requests that go to hardware. Otherwise it
* is whatever we're able to malloc.
*/
for (i = 0; i < numbufs; i++) {
io_req->user_bufs[i] = *data_ptrs[i];
io_req->dirs[i] = dirs[i];
io_req->lengths[i] = lengths[i];
if (maxmap == 0)
continue;
if (lengths[i] <= maxmap)
continue;
xpt_print(periph->path, "%s: data length %u > max allowed %u "
"bytes\n", __func__, lengths[i], maxmap);
error = EINVAL;
goto bailout;
}
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/* Map or copy the buffer into kernel address space */
for (i = 0; i < numbufs; i++) {
uint8_t *tmp_buf;
/*
* If for some reason no length is specified, we
* don't need to allocate anything.
*/
if (io_req->lengths[i] == 0)
continue;
/*
* Make sure that the user's buffer is accessible
* to that process.
*/
if (!useracc(io_req->user_bufs[i], io_req->lengths[i],
(io_req->dirs[i] == CAM_DIR_IN) ? VM_PROT_WRITE :
VM_PROT_READ)) {
xpt_print(periph->path, "%s: user address %p "
"length %u is not accessible\n", __func__,
io_req->user_bufs[i], io_req->lengths[i]);
error = EFAULT;
goto bailout;
}
tmp_buf = malloc(lengths[i], M_SCSIPASS,
M_WAITOK | M_ZERO);
io_req->kern_bufs[i] = tmp_buf;
*data_ptrs[i] = tmp_buf;
#if 0
xpt_print(periph->path, "%s: malloced %p len %u, user "
"buffer %p, operation: %s\n", __func__,
tmp_buf, lengths[i], io_req->user_bufs[i],
(dirs[i] == CAM_DIR_IN) ? "read" : "write");
#endif
/*
* We only need to copy in if the user is writing.
*/
if (dirs[i] != CAM_DIR_OUT)
continue;
error = copyin(io_req->user_bufs[i],
io_req->kern_bufs[i], lengths[i]);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user "
"buffer from %p to %p failed with "
"error %d\n", __func__,
io_req->user_bufs[i],
io_req->kern_bufs[i], error);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Pass down the pointer as-is */
break;
case CAM_DATA_SG: {
size_t sg_length, size_to_go, alloc_size;
uint32_t num_segs_needed;
/*
* Copy the user S/G list in, and then copy in the
* individual segments.
*/
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
xpt_print(periph->path, "%s: cannot currently handle "
"more than one S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG flag set, "
"but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We allocate buffers in io_zone_size increments for an
* S/G list. This will generally be MAXPHYS.
*/
if (lengths[0] <= softc->io_zone_size)
num_segs_needed = 1;
else {
num_segs_needed = lengths[0] / softc->io_zone_size;
if ((lengths[0] % softc->io_zone_size) != 0)
num_segs_needed++;
}
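/*
 * For example, assuming the default MAXPHYS of 128 KiB for
 * io_zone_size, a 300 KiB transfer would yield 300 / 128 = 2 full
 * buffers plus a remainder, so num_segs_needed would be 3.
 */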
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = num_segs_needed;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
/*
 * If the user's S/G list is too large for the preallocated
 * array of segments, allocate a larger one; otherwise use the
 * preallocated user_segs array.
 */
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
if (!useracc(*data_ptrs[0], sg_length, VM_PROT_READ)) {
xpt_print(periph->path, "%s: unable to access user "
"S/G list at %p\n", __func__, *data_ptrs[0]);
error = EFAULT;
goto bailout;
}
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
if (num_segs_needed > PASS_MAX_SEGS) {
io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_KERN_SEG_MALLOC;
} else {
io_req->kern_segptr = io_req->kern_segs;
}
/*
* Allocate the kernel S/G list.
*/
for (size_to_go = lengths[0], i = 0;
size_to_go > 0 && i < num_segs_needed;
i++, size_to_go -= alloc_size) {
uint8_t *kern_ptr;
alloc_size = min(size_to_go, softc->io_zone_size);
kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK);
io_req->kern_segptr[i].ds_addr =
(bus_addr_t)(uintptr_t)kern_ptr;
io_req->kern_segptr[i].ds_len = alloc_size;
}
if (size_to_go > 0) {
printf("%s: size_to_go = %zu, software error!\n",
__func__, size_to_go);
error = EINVAL;
goto bailout;
}
*data_ptrs[0] = (uint8_t *)io_req->kern_segptr;
*seg_cnt_ptr = io_req->num_kern_segs;
/*
* We only need to copy data here if the user is writing.
*/
if (dirs[0] == CAM_DIR_OUT)
error = passcopysglist(periph, io_req, dirs[0]);
break;
}
case CAM_DATA_SG_PADDR: {
size_t sg_length;
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
printf("%s: cannot currently handle more than one "
"S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag "
"set, but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = io_req->num_user_segs;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
io_req->kern_segptr = io_req->user_segptr;
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
break;
}
default:
case CAM_DATA_BIO:
/*
* A user shouldn't be attaching a bio to the CCB. It
* isn't a user-accessible structure.
*/
error = EINVAL;
break;
}
bailout:
if (error != 0)
passiocleanup(softc, io_req);
return (error);
}
static int
passmemdone(struct cam_periph *periph, struct pass_io_req *io_req)
{
struct pass_softc *softc;
- union ccb *ccb;
int error;
int i;
error = 0;
softc = (struct pass_softc *)periph->softc;
- ccb = &io_req->ccb;
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/*
* Copy back to the user buffer if this was a read.
*/
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->dirs[i] != CAM_DIR_IN)
continue;
error = copyout(io_req->kern_bufs[i],
io_req->user_bufs[i], io_req->lengths[i]);
if (error != 0) {
xpt_print(periph->path, "Unable to copy %u "
"bytes from %p to user address %p\n",
io_req->lengths[i],
io_req->kern_bufs[i],
io_req->user_bufs[i]);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Do nothing. The pointer is a physical address already */
break;
case CAM_DATA_SG:
/*
* Copy back to the user buffer if this was a read.
* Restore the user's S/G list buffer pointer.
*/
if (io_req->dirs[0] == CAM_DIR_IN)
error = passcopysglist(periph, io_req, io_req->dirs[0]);
break;
case CAM_DATA_SG_PADDR:
/*
* Restore the user's S/G list buffer pointer. No need to
* copy.
*/
break;
default:
case CAM_DATA_BIO:
error = EINVAL;
break;
}
bailout:
/*
* Reset the user's pointers to their original values and free
* allocated memory.
*/
passiocleanup(softc, io_req);
return (error);
}
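/*
 * Illustrative userland sketch (not part of this file) of how the
 * asynchronous interface handled in passdoioctl() below might be
 * driven; the device path and error handling are placeholders:
 *
 *	union ccb *ccb;
 *	int fd = open("/dev/pass0", O_RDWR);
 *
 *	// allocate and fill in the CCB, then submit it:
 *	ioctl(fd, CAMIOQUEUE, &ccb);	// returns once it is queued
 *	// wait for completion (poll(2)/kevent(2)), then fetch it:
 *	ioctl(fd, CAMIOGET, &ccb);
 *
 * The synchronous CAMIOCOMMAND ioctl instead takes the CCB structure
 * itself and does not return until the CCB has been executed.
 */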
static int
passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
int error;
if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) {
error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl);
}
return (error);
}
static int
passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
uint32_t priority;
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
error = 0;
switch (cmd) {
case CAMIOCOMMAND:
{
union ccb *inccb;
union ccb *ccb;
int ccb_malloced;
inccb = (union ccb *)addr;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (inccb->ccb_h.func_code == XPT_SCSI_IO)
inccb->csio.bio = NULL;
#endif
if (inccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
break;
}
/*
* Some CCB types, like scan bus and scan lun, can only go
* through the transport layer device.
*/
if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
inccb->ccb_h.func_code);
error = ENODEV;
break;
}
/* Compatibility for RL/priority-unaware code. */
priority = inccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Non-immediate CCBs need a CCB from the per-device pool
* of CCBs, which is scheduled by the transport layer.
* Immediate CCBs and user-supplied CCBs should just be
* malloced.
*/
if ((inccb->ccb_h.func_code & XPT_FC_QUEUED)
&& ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) {
ccb = cam_periph_getccb(periph, priority);
ccb_malloced = 0;
} else {
ccb = xpt_alloc_ccb_nowait();
if (ccb != NULL)
xpt_setup_ccb(&ccb->ccb_h, periph->path,
priority);
ccb_malloced = 1;
}
if (ccb == NULL) {
xpt_print(periph->path, "unable to allocate CCB\n");
error = ENOMEM;
break;
}
error = passsendccb(periph, ccb, inccb);
if (ccb_malloced)
xpt_free_ccb(ccb);
else
xpt_release_ccb(ccb);
break;
}
case CAMIOQUEUE:
{
struct pass_io_req *io_req;
union ccb **user_ccb, *ccb;
xpt_opcode fc;
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = passcreatezone(periph);
if (error != 0)
goto bailout;
}
/*
* We're going to do a blocking allocation for this I/O
* request, so we have to drop the lock.
*/
cam_periph_unlock(periph);
io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO);
ccb = &io_req->ccb;
user_ccb = (union ccb **)addr;
/*
* Unlike the CAMIOCOMMAND ioctl above, we only have a
* pointer to the user's CCB, so we have to copy the whole
* thing in to a buffer we have allocated (above) instead
* of allowing the ioctl code to malloc a buffer and copy
* it in.
*
* This is an advantage for this asynchronous interface,
* since we don't want the memory to get freed while the
* CCB is outstanding.
*/
#if 0
xpt_print(periph->path, "Copying user CCB %p to "
"kernel address %p\n", *user_ccb, ccb);
#endif
error = copyin(*user_ccb, ccb, sizeof(*ccb));
if (error != 0) {
xpt_print(periph->path, "Copy of user CCB %p to "
"kernel address %p failed with error %d\n",
*user_ccb, ccb, error);
goto camioqueue_error;
}
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.bio = NULL;
#endif
if (ccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
goto camioqueue_error;
}
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
if (ccb->csio.cdb_len > IOCDBLEN) {
error = EINVAL;
goto camioqueue_error;
}
error = copyin(ccb->csio.cdb_io.cdb_ptr,
ccb->csio.cdb_io.cdb_bytes, ccb->csio.cdb_len);
if (error != 0)
goto camioqueue_error;
ccb->ccb_h.flags &= ~CAM_CDB_POINTER;
}
/*
* Some CCB types, like scan bus and scan lun, can only go
* through the transport layer device.
*/
if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
ccb->ccb_h.func_code);
error = ENODEV;
goto camioqueue_error;
}
/*
* Save the user's CCB pointer as well as his linked list
* pointers and peripheral private area so that we can
* restore these later.
*/
io_req->user_ccb_ptr = *user_ccb;
io_req->user_periph_links = ccb->ccb_h.periph_links;
io_req->user_periph_priv = ccb->ccb_h.periph_priv;
/*
* Now that we've saved the user's values, we can set our
* own peripheral private entry.
*/
ccb->ccb_h.ccb_ioreq = io_req;
/* Compatibility for RL/priority-unaware code. */
priority = ccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Set up fields in the CCB like the path and the priority.
* The path in particular cannot be set in userland, since
* it is a pointer to a kernel data structure.
*/
xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority,
ccb->ccb_h.flags);
/*
* Set up our done routine. There is no way for the user to
* have a valid pointer here.
*/
ccb->ccb_h.cbfcnp = passdone;
fc = ccb->ccb_h.func_code;
/*
* If this function code has memory that can be mapped in
* or out, we need to call passmemsetup().
*/
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO)
|| (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH)
|| (fc == XPT_DEV_ADVINFO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
error = passmemsetup(periph, io_req);
if (error != 0)
goto camioqueue_error;
} else
io_req->mapinfo.num_bufs_used = 0;
cam_periph_lock(periph);
/*
* Everything goes on the incoming queue initially.
*/
TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links);
/*
* If the CCB is queued, and is not a user CCB, then
* we need to allocate a slot for it. Call xpt_schedule()
* so that our start routine will get called when a CCB is
* available.
*/
if ((fc & XPT_FC_QUEUED)
&& ((fc & XPT_FC_USER_CCB) == 0)) {
xpt_schedule(periph, priority);
break;
}
/*
* At this point, the CCB in question is either an
* immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB
* and therefore should be malloced, not allocated via a slot.
* Remove the CCB from the incoming queue and add it to the
* active queue.
*/
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
xpt_action(ccb);
/*
* If this is not a queued CCB (i.e. it is an immediate CCB),
* then it is already done. We need to put it on the done
* queue for the user to fetch.
*/
if ((fc & XPT_FC_QUEUED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
}
break;
camioqueue_error:
uma_zfree(softc->pass_zone, io_req);
cam_periph_lock(periph);
break;
}
case CAMIOGET:
{
union ccb **user_ccb;
struct pass_io_req *io_req;
int old_error;
user_ccb = (union ccb **)addr;
old_error = 0;
io_req = TAILQ_FIRST(&softc->done_queue);
if (io_req == NULL) {
error = ENOENT;
break;
}
/*
* Remove the I/O from the done queue.
*/
TAILQ_REMOVE(&softc->done_queue, io_req, links);
/*
* We have to drop the lock during the copyout because the
* copyout can result in VM faults that require sleeping.
*/
cam_periph_unlock(periph);
/*
* Do any needed copies (e.g. for reads) and revert the
* pointers in the CCB back to the user's pointers.
*/
error = passmemdone(periph, io_req);
old_error = error;
io_req->ccb.ccb_h.periph_links = io_req->user_periph_links;
io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv;
#if 0
xpt_print(periph->path, "Copying to user CCB %p from "
"kernel address %p\n", *user_ccb, &io_req->ccb);
#endif
error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb));
if (error != 0) {
xpt_print(periph->path, "Copy to user CCB %p from "
"kernel address %p failed with error %d\n",
*user_ccb, &io_req->ccb, error);
}
/*
* Prefer the first error we got back, and make sure we
* don't overwrite bad status with good.
*/
if (old_error != 0)
error = old_error;
cam_periph_lock(periph);
/*
* At this point, if there was an error, we could potentially
* re-queue the I/O and try again. But why? The error
* would almost certainly happen again. We might as well
* not leak memory.
*/
uma_zfree(softc->pass_zone, io_req);
break;
}
default:
error = cam_periph_ioctl(periph, cmd, addr, passerror);
break;
}
bailout:
cam_periph_unlock(periph);
return(error);
}
static int
passpoll(struct cdev *dev, int poll_events, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int revents;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
revents = poll_events & (POLLOUT | POLLWRNORM);
if ((poll_events & (POLLIN | POLLRDNORM)) != 0) {
cam_periph_lock(periph);
if (!TAILQ_EMPTY(&softc->done_queue)) {
revents |= poll_events & (POLLIN | POLLRDNORM);
}
cam_periph_unlock(periph);
if (revents == 0)
selrecord(td, &softc->read_select);
}
return (revents);
}
static int
passkqfilter(struct cdev *dev, struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
kn->kn_hook = (caddr_t)periph;
kn->kn_fop = &passread_filtops;
knlist_add(&softc->read_select.si_note, kn, 0);
return (0);
}
static void
passreadfiltdetach(struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
knlist_remove(&softc->read_select.si_note, kn, 0);
}
static int
passreadfilt(struct knote *kn, long hint)
{
struct cam_periph *periph;
struct pass_softc *softc;
int retval;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
if (TAILQ_EMPTY(&softc->done_queue))
retval = 0;
else
retval = 1;
return (retval);
}
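/*
 * Hypothetical userland example (not from this file): the poll and
 * kqueue handlers above report the descriptor readable whenever the
 * done queue is non-empty, so a consumer can block until a queued
 * CCB completes with something like
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, INFTIM);
 *	ioctl(fd, CAMIOGET, &ccb);
 */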
/*
* Generally, "ccb" should be the CCB supplied by the kernel. "inccb"
* should be the CCB that is copied in from the user.
*/
static int
passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb)
{
struct pass_softc *softc;
struct cam_periph_map_info mapinfo;
uint8_t *cmd;
xpt_opcode fc;
int error;
softc = (struct pass_softc *)periph->softc;
/*
* There are some fields in the CCB header that need to be
* preserved, the rest we get from the user.
*/
xpt_merge_ccb(ccb, inccb);
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
cmd = __builtin_alloca(ccb->csio.cdb_len);
error = copyin(ccb->csio.cdb_io.cdb_ptr, cmd, ccb->csio.cdb_len);
if (error)
return (error);
ccb->csio.cdb_io.cdb_ptr = cmd;
}
/*
 * Set our own completion routine; the user cannot supply a
 * valid kernel function pointer here.
 */
ccb->ccb_h.cbfcnp = passdone;
/*
* Let cam_periph_mapmem do a sanity check on the data pointer format.
* Even if no data transfer is needed, it's a cheap check and it
* simplifies the code.
*/
fc = ccb->ccb_h.func_code;
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO)
|| (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_MMC_IO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
bzero(&mapinfo, sizeof(mapinfo));
/*
* cam_periph_mapmem calls into proc and vm functions that can
* sleep as well as trigger I/O, so we can't hold the lock.
* Dropping it here is reasonably safe.
*/
cam_periph_unlock(periph);
error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio);
cam_periph_lock(periph);
/*
* cam_periph_mapmem returned an error, we can't continue.
* Return the error to the user.
*/
if (error)
return(error);
} else
/* Ensure that the unmap call later on is a no-op. */
mapinfo.num_bufs_used = 0;
/*
* If the user wants us to perform any error recovery, then honor
* that request. Otherwise, it's up to the user to perform any
* error recovery.
*/
cam_periph_runccb(ccb, (ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) ?
passerror : NULL, /* cam_flags */ CAM_RETRY_SELTO,
/* sense_flags */ SF_RETRY_UA | SF_NO_PRINT,
softc->device_stats);
cam_periph_unmapmem(ccb, &mapinfo);
ccb->ccb_h.cbfcnp = NULL;
ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv;
bcopy(ccb, inccb, sizeof(union ccb));
return(0);
}
static int
passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct pass_softc *)periph->softc;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
Index: head/sys/dev/al_eth/al_eth.c
===================================================================
--- head/sys/dev/al_eth/al_eth.c (revision 327172)
+++ head/sys/dev/al_eth/al_eth.c (revision 327173)
@@ -1,3584 +1,3581 @@
/*-
* Copyright (c) 2015,2016 Annapurna Labs Ltd. and affiliates
* All rights reserved.
*
* Developed by Semihalf.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <machine/atomic.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <net/if_vlan_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#endif
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <sys/sockio.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <al_hal_common.h>
#include <al_hal_plat_services.h>
#include <al_hal_udma_config.h>
#include <al_hal_udma_iofic.h>
#include <al_hal_udma_debug.h>
#include <al_hal_eth.h>
#include "al_eth.h"
#include "al_init_eth_lm.h"
#include "arm/annapurna/alpine/alpine_serdes.h"
#include "miibus_if.h"
#define device_printf_dbg(fmt, ...) do { \
if (AL_DBG_LEVEL >= AL_DBG_LEVEL_DBG) { AL_DBG_LOCK(); \
device_printf(fmt, __VA_ARGS__); AL_DBG_UNLOCK();} \
} while (0)
MALLOC_DEFINE(M_IFAL, "if_al_malloc", "All allocated data for AL ETH driver");
/* move out to some pci header file */
#define PCI_VENDOR_ID_ANNAPURNA_LABS 0x1c36
#define PCI_DEVICE_ID_AL_ETH 0x0001
#define PCI_DEVICE_ID_AL_ETH_ADVANCED 0x0002
#define PCI_DEVICE_ID_AL_ETH_NIC 0x0003
#define PCI_DEVICE_ID_AL_ETH_FPGA_NIC 0x0030
#define PCI_DEVICE_ID_AL_CRYPTO 0x0011
#define PCI_DEVICE_ID_AL_CRYPTO_VF 0x8011
#define PCI_DEVICE_ID_AL_RAID_DMA 0x0021
#define PCI_DEVICE_ID_AL_RAID_DMA_VF 0x8021
#define PCI_DEVICE_ID_AL_USB 0x0041
#define MAC_ADDR_STR "%02x:%02x:%02x:%02x:%02x:%02x"
#define MAC_ADDR(addr) addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]
#define AL_ETH_MAC_TABLE_UNICAST_IDX_BASE 0
#define AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT 4
#define AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX (AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + \
AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT)
#define AL_ETH_MAC_TABLE_DROP_IDX (AL_ETH_FWD_MAC_NUM - 1)
#define AL_ETH_MAC_TABLE_BROADCAST_IDX (AL_ETH_MAC_TABLE_DROP_IDX - 1)
#define AL_ETH_THASH_UDMA_SHIFT 0
#define AL_ETH_THASH_UDMA_MASK (0xF << AL_ETH_THASH_UDMA_SHIFT)
#define AL_ETH_THASH_Q_SHIFT 4
#define AL_ETH_THASH_Q_MASK (0x3 << AL_ETH_THASH_Q_SHIFT)
/* the following defines should be moved to hal */
#define AL_ETH_FSM_ENTRY_IPV4_TCP 0
#define AL_ETH_FSM_ENTRY_IPV4_UDP 1
#define AL_ETH_FSM_ENTRY_IPV6_TCP 2
#define AL_ETH_FSM_ENTRY_IPV6_UDP 3
#define AL_ETH_FSM_ENTRY_IPV6_NO_UDP_TCP 4
#define AL_ETH_FSM_ENTRY_IPV4_NO_UDP_TCP 5
/* FSM DATA format */
#define AL_ETH_FSM_DATA_OUTER_2_TUPLE 0
#define AL_ETH_FSM_DATA_OUTER_4_TUPLE 1
#define AL_ETH_FSM_DATA_INNER_2_TUPLE 2
#define AL_ETH_FSM_DATA_INNER_4_TUPLE 3
#define AL_ETH_FSM_DATA_HASH_SEL (1 << 2)
#define AL_ETH_FSM_DATA_DEFAULT_Q 0
#define AL_ETH_FSM_DATA_DEFAULT_UDMA 0
#define AL_BR_SIZE 512
#define AL_TSO_SIZE 65500
#define AL_DEFAULT_MTU 1500
#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP|CSUM_SCTP)
#define AL_IP_ALIGNMENT_OFFSET 2
#define SFP_I2C_ADDR 0x50
#define AL_MASK_GROUP_A_INT 0x7
#define AL_MASK_GROUP_B_INT 0xF
#define AL_MASK_GROUP_C_INT 0xF
#define AL_MASK_GROUP_D_INT 0xFFFFFFFF
#define AL_REG_OFFSET_FORWARD_INTR (0x1800000 + 0x1210)
#define AL_EN_FORWARD_INTR 0x1FFFF
#define AL_DIS_FORWARD_INTR 0
#define AL_M2S_MASK_INIT 0x480
#define AL_S2M_MASK_INIT 0x1E0
#define AL_M2S_S2M_MASK_NOT_INT (0x3f << 25)
#define AL_10BASE_T_SPEED 10
#define AL_100BASE_TX_SPEED 100
#define AL_1000BASE_T_SPEED 1000
static devclass_t al_devclass;
#define AL_RX_LOCK_INIT(_sc) mtx_init(&((_sc)->if_rx_lock), "ALRXL", "ALRXL", MTX_DEF)
#define AL_RX_LOCK(_sc) mtx_lock(&((_sc)->if_rx_lock))
#define AL_RX_UNLOCK(_sc) mtx_unlock(&((_sc)->if_rx_lock))
/* helper functions */
static int al_is_device_supported(device_t);
static void al_eth_init_rings(struct al_eth_adapter *);
static void al_eth_flow_ctrl_disable(struct al_eth_adapter *);
int al_eth_fpga_read_pci_config(void *, int, uint32_t *);
int al_eth_fpga_write_pci_config(void *, int, uint32_t);
int al_eth_read_pci_config(void *, int, uint32_t *);
int al_eth_write_pci_config(void *, int, uint32_t);
void al_eth_irq_config(uint32_t *, uint32_t);
void al_eth_forward_int_config(uint32_t *, uint32_t);
static void al_eth_start_xmit(void *, int);
static void al_eth_rx_recv_work(void *, int);
static int al_eth_up(struct al_eth_adapter *);
static void al_eth_down(struct al_eth_adapter *);
static void al_eth_interrupts_unmask(struct al_eth_adapter *);
static void al_eth_interrupts_mask(struct al_eth_adapter *);
static int al_eth_check_mtu(struct al_eth_adapter *, int);
static uint64_t al_get_counter(struct ifnet *, ift_counter);
static void al_eth_req_rx_buff_size(struct al_eth_adapter *, int);
static int al_eth_board_params_init(struct al_eth_adapter *);
static int al_media_update(struct ifnet *);
static void al_media_status(struct ifnet *, struct ifmediareq *);
static int al_eth_function_reset(struct al_eth_adapter *);
static int al_eth_hw_init_adapter(struct al_eth_adapter *);
static void al_eth_serdes_init(struct al_eth_adapter *);
static void al_eth_lm_config(struct al_eth_adapter *);
static int al_eth_hw_init(struct al_eth_adapter *);
static void al_tick_stats(void *);
/* ifnet entry points */
static void al_init(void *);
static int al_mq_start(struct ifnet *, struct mbuf *);
static void al_qflush(struct ifnet *);
static int al_ioctl(struct ifnet * ifp, u_long, caddr_t);
/* bus entry points */
static int al_probe(device_t);
static int al_attach(device_t);
static int al_detach(device_t);
static int al_shutdown(device_t);
/* mii bus support routines */
static int al_miibus_readreg(device_t, int, int);
static int al_miibus_writereg(device_t, int, int, int);
static void al_miibus_statchg(device_t);
static void al_miibus_linkchg(device_t);
struct al_eth_adapter* g_adapters[16];
uint32_t g_adapters_count;
/* flag for napi-like mbuf processing, controlled from sysctl */
static int napi = 0;
static device_method_t al_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, al_probe),
DEVMETHOD(device_attach, al_attach),
DEVMETHOD(device_detach, al_detach),
DEVMETHOD(device_shutdown, al_shutdown),
DEVMETHOD(miibus_readreg, al_miibus_readreg),
DEVMETHOD(miibus_writereg, al_miibus_writereg),
DEVMETHOD(miibus_statchg, al_miibus_statchg),
DEVMETHOD(miibus_linkchg, al_miibus_linkchg),
{ 0, 0 }
};
static driver_t al_driver = {
"al",
al_methods,
sizeof(struct al_eth_adapter),
};
DRIVER_MODULE(al, pci, al_driver, al_devclass, 0, 0);
DRIVER_MODULE(miibus, al, miibus_driver, miibus_devclass, 0, 0);
static int
al_probe(device_t dev)
{
if ((al_is_device_supported(dev)) != 0) {
device_set_desc(dev, "al");
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
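/*
 * al_attach - map the UDMA, MAC and EC register BARs, create and configure
 * the ifnet, read the board parameters, and bring up the HAL adapter, the
 * rings and (for RGMII) the PHY via miibus.
 */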
static int
al_attach(device_t dev)
{
- struct al_eth_lm_context *lm_context;
struct al_eth_adapter *adapter;
struct sysctl_oid_list *child;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
struct ifnet *ifp;
uint32_t dev_id;
uint32_t rev_id;
int bar_udma;
int bar_mac;
int bar_ec;
int err;
err = 0;
ifp = NULL;
dev_id = rev_id = 0;
ctx = device_get_sysctl_ctx(dev);
tree = SYSCTL_PARENT(device_get_sysctl_tree(dev));
child = SYSCTL_CHILDREN(tree);
if (g_adapters_count == 0) {
SYSCTL_ADD_INT(ctx, child, OID_AUTO, "napi",
CTLFLAG_RW, &napi, 0, "Use pseudo-napi mechanism");
}
adapter = device_get_softc(dev);
adapter->dev = dev;
adapter->board_type = ALPINE_INTEGRATED;
snprintf(adapter->name, AL_ETH_NAME_MAX_LEN, "%s",
device_get_nameunit(dev));
AL_RX_LOCK_INIT(adapter);
g_adapters[g_adapters_count] = adapter;
-
- lm_context = &adapter->lm_context;
bar_udma = PCIR_BAR(AL_ETH_UDMA_BAR);
adapter->udma_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&bar_udma, RF_ACTIVE);
if (adapter->udma_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for DMA.\n");
err = ENOMEM;
goto err_res_dma;
}
adapter->udma_base = al_bus_dma_to_va(rman_get_bustag(adapter->udma_res),
rman_get_bushandle(adapter->udma_res));
bar_mac = PCIR_BAR(AL_ETH_MAC_BAR);
adapter->mac_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&bar_mac, RF_ACTIVE);
if (adapter->mac_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for MAC.\n");
err = ENOMEM;
goto err_res_mac;
}
adapter->mac_base = al_bus_dma_to_va(rman_get_bustag(adapter->mac_res),
rman_get_bushandle(adapter->mac_res));
bar_ec = PCIR_BAR(AL_ETH_EC_BAR);
adapter->ec_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar_ec,
RF_ACTIVE);
if (adapter->ec_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for EC.\n");
err = ENOMEM;
goto err_res_ec;
}
adapter->ec_base = al_bus_dma_to_va(rman_get_bustag(adapter->ec_res),
rman_get_bushandle(adapter->ec_res));
adapter->netdev = ifp = if_alloc(IFT_ETHER);
adapter->netdev->if_link_state = LINK_STATE_DOWN;
ifp->if_softc = adapter;
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
ifp->if_flags = ifp->if_drv_flags;
ifp->if_flags |= IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_ALLMULTI;
ifp->if_transmit = al_mq_start;
ifp->if_qflush = al_qflush;
ifp->if_ioctl = al_ioctl;
ifp->if_init = al_init;
ifp->if_get_counter = al_get_counter;
ifp->if_mtu = AL_DEFAULT_MTU;
adapter->if_flags = ifp->if_flags;
ifp->if_capabilities = ifp->if_capenable = 0;
ifp->if_capabilities |= IFCAP_HWCSUM |
IFCAP_HWCSUM_IPV6 | IFCAP_TSO |
IFCAP_LRO | IFCAP_JUMBO_MTU;
ifp->if_capenable = ifp->if_capabilities;
adapter->id_number = g_adapters_count;
if (adapter->board_type == ALPINE_INTEGRATED) {
dev_id = pci_get_device(adapter->dev);
rev_id = pci_get_revid(adapter->dev);
} else {
al_eth_fpga_read_pci_config(adapter->internal_pcie_base,
PCIR_DEVICE, &dev_id);
al_eth_fpga_read_pci_config(adapter->internal_pcie_base,
PCIR_REVID, &rev_id);
}
adapter->dev_id = dev_id;
adapter->rev_id = rev_id;
/* set default ring sizes */
adapter->tx_ring_count = AL_ETH_DEFAULT_TX_SW_DESCS;
adapter->tx_descs_count = AL_ETH_DEFAULT_TX_HW_DESCS;
adapter->rx_ring_count = AL_ETH_DEFAULT_RX_DESCS;
adapter->rx_descs_count = AL_ETH_DEFAULT_RX_DESCS;
adapter->num_tx_queues = AL_ETH_NUM_QUEUES;
adapter->num_rx_queues = AL_ETH_NUM_QUEUES;
adapter->small_copy_len = AL_ETH_DEFAULT_SMALL_PACKET_LEN;
adapter->link_poll_interval = AL_ETH_DEFAULT_LINK_POLL_INTERVAL;
adapter->max_rx_buff_alloc_size = AL_ETH_DEFAULT_MAX_RX_BUFF_ALLOC_SIZE;
al_eth_req_rx_buff_size(adapter, adapter->netdev->if_mtu);
adapter->link_config.force_1000_base_x = AL_ETH_DEFAULT_FORCE_1000_BASEX;
err = al_eth_board_params_init(adapter);
if (err != 0)
goto err;
if (adapter->mac_mode == AL_ETH_MAC_MODE_10GbE_Serial) {
ifmedia_init(&adapter->media, IFM_IMASK,
al_media_update, al_media_status);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_LX, 0, NULL);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_LR, 0, NULL);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL);
ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO);
}
al_eth_function_reset(adapter);
err = al_eth_hw_init_adapter(adapter);
if (err != 0)
goto err;
al_eth_init_rings(adapter);
g_adapters_count++;
al_eth_lm_config(adapter);
mtx_init(&adapter->stats_mtx, "AlStatsMtx", NULL, MTX_DEF);
mtx_init(&adapter->wd_mtx, "AlWdMtx", NULL, MTX_DEF);
callout_init_mtx(&adapter->stats_callout, &adapter->stats_mtx, 0);
callout_init_mtx(&adapter->wd_callout, &adapter->wd_mtx, 0);
ether_ifattach(ifp, adapter->mac_addr);
ifp->if_mtu = AL_DEFAULT_MTU;
if (adapter->mac_mode == AL_ETH_MAC_MODE_RGMII) {
al_eth_hw_init(adapter);
/* Attach PHY(s) */
err = mii_attach(adapter->dev, &adapter->miibus, adapter->netdev,
al_media_update, al_media_status, BMSR_DEFCAPMASK, 0,
MII_OFFSET_ANY, 0);
if (err != 0) {
device_printf(adapter->dev, "attaching PHYs failed\n");
return (err);
}
adapter->mii = device_get_softc(adapter->miibus);
}
return (err);
err:
bus_release_resource(dev, SYS_RES_MEMORY, bar_ec, adapter->ec_res);
err_res_ec:
bus_release_resource(dev, SYS_RES_MEMORY, bar_mac, adapter->mac_res);
err_res_mac:
bus_release_resource(dev, SYS_RES_MEMORY, bar_udma, adapter->udma_res);
err_res_dma:
return (err);
}
static int
al_detach(device_t dev)
{
struct al_eth_adapter *adapter;
adapter = device_get_softc(dev);
ether_ifdetach(adapter->netdev);
mtx_destroy(&adapter->stats_mtx);
mtx_destroy(&adapter->wd_mtx);
al_eth_down(adapter);
bus_release_resource(dev, SYS_RES_IRQ, 0, adapter->irq_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->ec_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->mac_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->udma_res);
return (0);
}
int
al_eth_fpga_read_pci_config(void *handle, int where, uint32_t *val)
{
/* handle is the base address of the adapter */
*val = al_reg_read32((void*)((u_long)handle + where));
return (0);
}
int
al_eth_fpga_write_pci_config(void *handle, int where, uint32_t val)
{
/* handle is the base address of the adapter */
al_reg_write32((void*)((u_long)handle + where), val);
return (0);
}
int
al_eth_read_pci_config(void *handle, int where, uint32_t *val)
{
/* handle is a device_t */
*val = pci_read_config((device_t)handle, where, sizeof(*val));
return (0);
}
int
al_eth_write_pci_config(void *handle, int where, uint32_t val)
{
/* handle is a device_t */
pci_write_config((device_t)handle, where, val, sizeof(val));
return (0);
}
void
al_eth_irq_config(uint32_t *offset, uint32_t value)
{
al_reg_write32_relaxed(offset, value);
}
void
al_eth_forward_int_config(uint32_t *offset, uint32_t value)
{
al_reg_write32(offset, value);
}
static void
al_eth_serdes_init(struct al_eth_adapter *adapter)
{
void __iomem *serdes_base;
adapter->serdes_init = false;
serdes_base = alpine_serdes_resource_get(adapter->serdes_grp);
if (serdes_base == NULL) {
device_printf(adapter->dev, "serdes_base get failed!\n");
return;
}
serdes_base = al_bus_dma_to_va(serdes_tag, serdes_base);
al_serdes_handle_grp_init(serdes_base, adapter->serdes_grp,
&adapter->serdes_obj);
adapter->serdes_init = true;
}
static void
al_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
bus_addr_t *paddr;
paddr = arg;
*paddr = segs->ds_addr;
}
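/*
 * al_dma_alloc_coherent - allocate a coherent DMA buffer of at least `size'
 * bytes (the tag is sized up to a multiple of PAGE_SIZE) and return the bus
 * tag, map, bus address and kernel virtual address to the caller.
 */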
static int
al_dma_alloc_coherent(struct device *dev, bus_dma_tag_t *tag, bus_dmamap_t *map,
bus_addr_t *baddr, void **vaddr, uint32_t size)
{
int ret;
uint32_t maxsize = ((size - 1)/PAGE_SIZE + 1) * PAGE_SIZE;
ret = bus_dma_tag_create(bus_get_dma_tag(dev), 8, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
maxsize, 1, maxsize, BUS_DMA_COHERENT, NULL, NULL, tag);
if (ret != 0) {
device_printf(dev,
"failed to create bus tag, ret = %d\n", ret);
return (ret);
}
ret = bus_dmamem_alloc(*tag, vaddr,
BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
if (ret != 0) {
device_printf(dev,
"failed to allocate dmamem, ret = %d\n", ret);
return (ret);
}
ret = bus_dmamap_load(*tag, *map, *vaddr,
size, al_dma_map_addr, baddr, 0);
if (ret != 0) {
device_printf(dev,
"failed to allocate bus_dmamap_load, ret = %d\n", ret);
return (ret);
}
return (0);
}
static void
al_dma_free_coherent(bus_dma_tag_t tag, bus_dmamap_t map, void *vaddr)
{
bus_dmamap_unload(tag, map);
bus_dmamem_free(tag, vaddr, map);
bus_dma_tag_destroy(tag);
}
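/*
 * al_eth_mac_table_unicast_add - program the adapter's own MAC address,
 * fully masked, into the RX forwarding MAC table at index idx.
 */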
static void
al_eth_mac_table_unicast_add(struct al_eth_adapter *adapter,
uint8_t idx, uint8_t *addr, uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memcpy(entry.addr, adapter->mac_addr, sizeof(adapter->mac_addr));
memset(entry.mask, 0xff, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_all_multicast_add(struct al_eth_adapter *adapter, uint8_t idx,
uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0x00, sizeof(entry.addr));
memset(entry.mask, 0x00, sizeof(entry.mask));
entry.mask[0] |= 1;
entry.addr[0] |= 1;
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_broadcast_add(struct al_eth_adapter *adapter,
uint8_t idx, uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0xff, sizeof(entry.addr));
memset(entry.mask, 0xff, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_promiscuous_set(struct al_eth_adapter *adapter,
boolean_t promiscuous)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0x00, sizeof(entry.addr));
memset(entry.mask, 0x00, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = (promiscuous) ? 1 : 0;
entry.filter = (promiscuous) ? false : true;
device_printf_dbg(adapter->dev, "%s: %s promiscuous mode\n",
__func__, (promiscuous) ? "enter" : "exit");
al_eth_fwd_mac_table_set(&adapter->hal_adapter,
AL_ETH_MAC_TABLE_DROP_IDX, &entry);
}
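/*
 * al_eth_set_thash_table_entry - set a single RSS indirection (thash) table
 * entry; only UDMA 0 and queue numbers below AL_ETH_NUM_QUEUES are accepted.
 */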
static void
al_eth_set_thash_table_entry(struct al_eth_adapter *adapter, uint8_t idx,
uint8_t udma, uint32_t queue)
{
if (udma != 0)
panic("only UDMA0 is supporter");
if (queue >= AL_ETH_NUM_QUEUES)
panic("invalid queue number");
al_eth_thash_table_set(&adapter->hal_adapter, idx, udma, queue);
}
/* init FSM, no tunneling supported yet, if packet is tcp/udp over ipv4/ipv6, use 4 tuple hash */
static void
al_eth_fsm_table_init(struct al_eth_adapter *adapter)
{
uint32_t val;
int i;
for (i = 0; i < AL_ETH_RX_FSM_TABLE_SIZE; i++) {
uint8_t outer_type = AL_ETH_FSM_ENTRY_OUTER(i);
switch (outer_type) {
case AL_ETH_FSM_ENTRY_IPV4_TCP:
case AL_ETH_FSM_ENTRY_IPV4_UDP:
case AL_ETH_FSM_ENTRY_IPV6_TCP:
case AL_ETH_FSM_ENTRY_IPV6_UDP:
val = AL_ETH_FSM_DATA_OUTER_4_TUPLE |
AL_ETH_FSM_DATA_HASH_SEL;
break;
case AL_ETH_FSM_ENTRY_IPV6_NO_UDP_TCP:
case AL_ETH_FSM_ENTRY_IPV4_NO_UDP_TCP:
val = AL_ETH_FSM_DATA_OUTER_2_TUPLE |
AL_ETH_FSM_DATA_HASH_SEL;
break;
default:
val = AL_ETH_FSM_DATA_DEFAULT_Q |
AL_ETH_FSM_DATA_DEFAULT_UDMA;
}
al_eth_fsm_table_set(&adapter->hal_adapter, i, val);
}
}
static void
al_eth_mac_table_entry_clear(struct al_eth_adapter *adapter,
uint8_t idx)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
device_printf_dbg(adapter->dev, "%s: clear entry %d\n", __func__, idx);
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
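/*
 * al_eth_hw_init_adapter - fill the HAL adapter parameters from the softc
 * and initialize the HAL adapter; in the PCIe NIC modes all UDMA queues are
 * also forced to use the PCIE0 target-id.
 */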
static int
al_eth_hw_init_adapter(struct al_eth_adapter *adapter)
{
struct al_eth_adapter_params *params = &adapter->eth_hal_params;
int rc;
/* params->dev_id = adapter->dev_id; */
params->rev_id = adapter->rev_id;
params->udma_id = 0;
params->enable_rx_parser = 1; /* enable rx epe parser*/
params->udma_regs_base = adapter->udma_base; /* UDMA register base address */
params->ec_regs_base = adapter->ec_base; /* Ethernet controller registers base address */
params->mac_regs_base = adapter->mac_base; /* Ethernet MAC registers base address */
params->name = adapter->name;
params->serdes_lane = adapter->serdes_lane;
rc = al_eth_adapter_init(&adapter->hal_adapter, params);
if (rc != 0)
device_printf(adapter->dev, "%s failed at hal init!\n",
__func__);
if ((adapter->board_type == ALPINE_NIC) ||
(adapter->board_type == ALPINE_FPGA_NIC)) {
/* in pcie NIC mode, force eth UDMA to access PCIE0 using the vmid */
struct al_udma_gen_tgtid_conf conf;
int i;
for (i = 0; i < DMA_MAX_Q; i++) {
conf.tx_q_conf[i].queue_en = AL_TRUE;
conf.tx_q_conf[i].desc_en = AL_FALSE;
conf.tx_q_conf[i].tgtid = 0x100; /* for access from PCIE0 */
conf.rx_q_conf[i].queue_en = AL_TRUE;
conf.rx_q_conf[i].desc_en = AL_FALSE;
conf.rx_q_conf[i].tgtid = 0x100; /* for access from PCIE0 */
}
al_udma_gen_tgtid_conf_set(adapter->udma_base, &conf);
}
return (rc);
}
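/*
 * al_eth_lm_config - build the link management parameters (SFP detection,
 * default link mode, retimer settings) and initialize the LM context.
 */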
static void
al_eth_lm_config(struct al_eth_adapter *adapter)
{
struct al_eth_lm_init_params params = {0};
params.adapter = &adapter->hal_adapter;
params.serdes_obj = &adapter->serdes_obj;
params.lane = adapter->serdes_lane;
params.sfp_detection = adapter->sfp_detection_needed;
if (adapter->sfp_detection_needed == true) {
params.sfp_bus_id = adapter->i2c_adapter_id;
params.sfp_i2c_addr = SFP_I2C_ADDR;
}
if (adapter->sfp_detection_needed == false) {
switch (adapter->mac_mode) {
case AL_ETH_MAC_MODE_10GbE_Serial:
if ((adapter->lt_en != 0) && (adapter->an_en != 0))
params.default_mode = AL_ETH_LM_MODE_10G_DA;
else
params.default_mode = AL_ETH_LM_MODE_10G_OPTIC;
break;
case AL_ETH_MAC_MODE_SGMII:
params.default_mode = AL_ETH_LM_MODE_1G;
break;
default:
params.default_mode = AL_ETH_LM_MODE_10G_DA;
}
} else
params.default_mode = AL_ETH_LM_MODE_10G_DA;
params.link_training = adapter->lt_en;
params.rx_equal = true;
params.static_values = !adapter->dont_override_serdes;
params.i2c_context = adapter;
params.kr_fec_enable = false;
params.retimer_exist = adapter->retimer.exist;
params.retimer_bus_id = adapter->retimer.bus_id;
params.retimer_i2c_addr = adapter->retimer.i2c_addr;
params.retimer_channel = adapter->retimer.channel;
al_eth_lm_init(&adapter->lm_context, &params);
}
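/*
 * al_eth_board_params_init - derive the MAC mode, PHY/SFP presence, link
 * speed and MDIO frequency either from the fixed NIC/FPGA defaults or from
 * the board parameters kept in the controller, then read the MAC address.
 */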
static int
al_eth_board_params_init(struct al_eth_adapter *adapter)
{
if (adapter->board_type == ALPINE_NIC) {
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
adapter->sfp_detection_needed = false;
adapter->phy_exist = false;
adapter->an_en = false;
adapter->lt_en = false;
adapter->ref_clk_freq = AL_ETH_REF_FREQ_375_MHZ;
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
} else if (adapter->board_type == ALPINE_FPGA_NIC) {
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
adapter->sfp_detection_needed = false;
adapter->phy_exist = false;
adapter->an_en = false;
adapter->lt_en = false;
adapter->ref_clk_freq = AL_ETH_REF_FREQ_375_MHZ;
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
} else {
struct al_eth_board_params params;
int rc;
adapter->auto_speed = false;
rc = al_eth_board_params_get(adapter->mac_base, &params);
if (rc != 0) {
device_printf(adapter->dev,
"board info not available\n");
return (-1);
}
adapter->phy_exist = params.phy_exist == TRUE;
adapter->phy_addr = params.phy_mdio_addr;
adapter->an_en = params.autoneg_enable;
adapter->lt_en = params.kr_lt_enable;
adapter->serdes_grp = params.serdes_grp;
adapter->serdes_lane = params.serdes_lane;
adapter->sfp_detection_needed = params.sfp_plus_module_exist;
adapter->i2c_adapter_id = params.i2c_adapter_id;
adapter->ref_clk_freq = params.ref_clk_freq;
adapter->dont_override_serdes = params.dont_override_serdes;
adapter->link_config.active_duplex = !params.half_duplex;
adapter->link_config.autoneg = !params.an_disable;
adapter->link_config.force_1000_base_x = params.force_1000_base_x;
adapter->retimer.exist = params.retimer_exist;
adapter->retimer.bus_id = params.retimer_bus_id;
adapter->retimer.i2c_addr = params.retimer_i2c_addr;
adapter->retimer.channel = params.retimer_channel;
switch (params.speed) {
default:
device_printf(adapter->dev,
"%s: invalid speed (%d)\n", __func__, params.speed);
case AL_ETH_BOARD_1G_SPEED_1000M:
adapter->link_config.active_speed = 1000;
break;
case AL_ETH_BOARD_1G_SPEED_100M:
adapter->link_config.active_speed = 100;
break;
case AL_ETH_BOARD_1G_SPEED_10M:
adapter->link_config.active_speed = 10;
break;
}
switch (params.mdio_freq) {
default:
device_printf(adapter->dev,
"%s: invalid mdio freq (%d)\n", __func__,
params.mdio_freq);
case AL_ETH_BOARD_MDIO_FREQ_2_5_MHZ:
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
break;
case AL_ETH_BOARD_MDIO_FREQ_1_MHZ:
adapter->mdio_freq = AL_ETH_MDIO_FREQ_1000_KHZ;
break;
}
switch (params.media_type) {
case AL_ETH_BOARD_MEDIA_TYPE_RGMII:
if (params.sfp_plus_module_exist == TRUE)
/* Backward compatibility */
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
else
adapter->mac_mode = AL_ETH_MAC_MODE_RGMII;
adapter->use_lm = false;
break;
case AL_ETH_BOARD_MEDIA_TYPE_SGMII:
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_10GBASE_SR:
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_AUTO_DETECT:
adapter->sfp_detection_needed = TRUE;
adapter->auto_speed = false;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_AUTO_DETECT_AUTO_SPEED:
adapter->sfp_detection_needed = TRUE;
adapter->auto_speed = true;
adapter->mac_mode_set = false;
adapter->use_lm = true;
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
break;
default:
device_printf(adapter->dev,
"%s: unsupported media type %d\n",
__func__, params.media_type);
return (-1);
}
device_printf(adapter->dev,
"Board info: phy exist %s. phy addr %d. mdio freq %u Khz. "
"SFP connected %s. media %d\n",
params.phy_exist == TRUE ? "Yes" : "No",
params.phy_mdio_addr, adapter->mdio_freq,
params.sfp_plus_module_exist == TRUE ? "Yes" : "No",
params.media_type);
}
al_eth_mac_addr_read(adapter->ec_base, 0, adapter->mac_addr);
return (0);
}
static int
al_eth_function_reset(struct al_eth_adapter *adapter)
{
struct al_eth_board_params params;
int rc;
/* save the board params so we can restore them after reset */
al_eth_board_params_get(adapter->mac_base, &params);
al_eth_mac_addr_read(adapter->ec_base, 0, adapter->mac_addr);
if (adapter->board_type == ALPINE_INTEGRATED)
rc = al_eth_flr_rmn(&al_eth_read_pci_config,
&al_eth_write_pci_config,
adapter->dev, adapter->mac_base);
else
rc = al_eth_flr_rmn(&al_eth_fpga_read_pci_config,
&al_eth_fpga_write_pci_config,
adapter->internal_pcie_base, adapter->mac_base);
/* restore params */
al_eth_board_params_set(adapter->mac_base, &params);
al_eth_mac_addr_store(adapter->ec_base, 0, adapter->mac_addr);
return (rc);
}
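/*
 * al_eth_init_rings - initialize the per-queue TX and RX ring bookkeeping:
 * UDMA queue handles, descriptor counts and interrupt unmask offsets/values.
 */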
static void
al_eth_init_rings(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++) {
struct al_eth_ring *ring = &adapter->tx_ring[i];
ring->ring_id = i;
ring->dev = adapter->dev;
ring->adapter = adapter;
ring->netdev = adapter->netdev;
al_udma_q_handle_get(&adapter->hal_adapter.tx_udma, i,
&ring->dma_q);
ring->sw_count = adapter->tx_ring_count;
ring->hw_count = adapter->tx_descs_count;
ring->unmask_reg_offset = al_udma_iofic_unmask_offset_get((struct unit_regs *)adapter->udma_base, AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_C);
ring->unmask_val = ~(1 << i);
}
for (i = 0; i < adapter->num_rx_queues; i++) {
struct al_eth_ring *ring = &adapter->rx_ring[i];
ring->ring_id = i;
ring->dev = adapter->dev;
ring->adapter = adapter;
ring->netdev = adapter->netdev;
al_udma_q_handle_get(&adapter->hal_adapter.rx_udma, i, &ring->dma_q);
ring->sw_count = adapter->rx_ring_count;
ring->hw_count = adapter->rx_descs_count;
ring->unmask_reg_offset = al_udma_iofic_unmask_offset_get(
(struct unit_regs *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_B);
ring->unmask_val = ~(1 << i);
}
}
static void
al_init_locked(void *arg)
{
struct al_eth_adapter *adapter = arg;
if_t ifp = adapter->netdev;
int rc = 0;
al_eth_down(adapter);
rc = al_eth_up(adapter);
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
if (rc == 0)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
static void
al_init(void *arg)
{
struct al_eth_adapter *adapter = arg;
al_init_locked(adapter);
}
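/*
 * al_eth_alloc_rx_buf - allocate an mbuf cluster for one RX slot, load it
 * for DMA and record the bus address, offset by AL_IP_ALIGNMENT_OFFSET so
 * that the IP header ends up aligned.
 */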
static inline int
al_eth_alloc_rx_buf(struct al_eth_adapter *adapter,
struct al_eth_ring *rx_ring,
struct al_eth_rx_buffer *rx_info)
{
struct al_buf *al_buf;
bus_dma_segment_t segs[2];
int error;
int nsegs;
if (rx_info->m != NULL)
return (0);
rx_info->data_size = adapter->rx_mbuf_sz;
AL_RX_LOCK(adapter);
/* Get mbuf using UMA allocator */
rx_info->m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
rx_info->data_size);
AL_RX_UNLOCK(adapter);
if (rx_info->m == NULL)
return (ENOMEM);
rx_info->m->m_pkthdr.len = rx_info->m->m_len = adapter->rx_mbuf_sz;
/* Map packets for DMA */
error = bus_dmamap_load_mbuf_sg(rx_ring->dma_buf_tag, rx_info->dma_map,
rx_info->m, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(error)) {
device_printf(rx_ring->dev, "failed to map mbuf, error = %d\n",
error);
m_freem(rx_info->m);
rx_info->m = NULL;
return (EFAULT);
}
al_buf = &rx_info->al_buf;
al_buf->addr = segs[0].ds_addr + AL_IP_ALIGNMENT_OFFSET;
al_buf->len = rx_info->data_size - AL_IP_ALIGNMENT_OFFSET;
return (0);
}
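/*
 * al_eth_refill_rx_bufs - post up to `num' fresh RX buffers to the given
 * queue and return the number of buffers actually added.
 */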
static int
al_eth_refill_rx_bufs(struct al_eth_adapter *adapter, unsigned int qid,
unsigned int num)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
uint16_t next_to_use;
unsigned int i;
next_to_use = rx_ring->next_to_use;
for (i = 0; i < num; i++) {
int rc;
struct al_eth_rx_buffer *rx_info =
&rx_ring->rx_buffer_info[next_to_use];
if (__predict_false(al_eth_alloc_rx_buf(adapter,
rx_ring, rx_info) < 0)) {
device_printf(adapter->dev,
"failed to alloc buffer for rx queue %d\n", qid);
break;
}
rc = al_eth_rx_buffer_add(rx_ring->dma_q,
&rx_info->al_buf, AL_ETH_RX_FLAGS_INT, NULL);
if (__predict_false(rc)) {
device_printf(adapter->dev,
"failed to add buffer for rx queue %d\n", qid);
break;
}
next_to_use = AL_ETH_RX_RING_IDX_NEXT(rx_ring, next_to_use);
}
if (__predict_false(i < num))
device_printf(adapter->dev,
"refilled rx queue %d with %d pages only - available %d\n",
qid, i, al_udma_available_get(rx_ring->dma_q));
if (__predict_true(i))
al_eth_rx_buffer_action(rx_ring->dma_q, i);
rx_ring->next_to_use = next_to_use;
return (i);
}
/*
* al_eth_refill_all_rx_bufs - allocate Rx buffers for all queues
* @adapter: board private structure
*/
static void
al_eth_refill_all_rx_bufs(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_rx_queues; i++)
al_eth_refill_rx_bufs(adapter, i, AL_ETH_DEFAULT_RX_DESCS - 1);
}
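/*
 * al_eth_tx_do_cleanup - reclaim completed TX descriptors: unload the DMA
 * maps, free the transmitted mbufs and advance next_to_clean.
 */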
static void
al_eth_tx_do_cleanup(struct al_eth_ring *tx_ring)
{
unsigned int total_done;
uint16_t next_to_clean;
int qid = tx_ring->ring_id;
total_done = al_eth_comp_tx_get(tx_ring->dma_q);
device_printf_dbg(tx_ring->dev,
"tx_poll: q %d total completed descs %x\n", qid, total_done);
next_to_clean = tx_ring->next_to_clean;
while (total_done != 0) {
struct al_eth_tx_buffer *tx_info;
struct mbuf *mbuf;
tx_info = &tx_ring->tx_buffer_info[next_to_clean];
/* stop if not all descriptors of the packet are completed */
if (tx_info->tx_descs > total_done)
break;
mbuf = tx_info->m;
tx_info->m = NULL;
device_printf_dbg(tx_ring->dev,
"tx_poll: q %d mbuf %p completed\n", qid, mbuf);
/* map is no longer required */
bus_dmamap_unload(tx_ring->dma_buf_tag, tx_info->dma_map);
m_freem(mbuf);
total_done -= tx_info->tx_descs;
next_to_clean = AL_ETH_TX_RING_IDX_NEXT(tx_ring, next_to_clean);
}
tx_ring->next_to_clean = next_to_clean;
device_printf_dbg(tx_ring->dev, "tx_poll: q %d done next to clean %x\n",
qid, next_to_clean);
/*
* Make the ring's circular update visible to al_eth_start_xmit()
* before it checks the queue stall flag.
*/
al_smp_data_memory_barrier();
}
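/*
 * al_eth_tx_csum - translate the mbuf checksum/TSO offload requests into HAL
 * packet flags and meta data (L3/L4 protocol indices, header lengths, MSS).
 */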
static void
al_eth_tx_csum(struct al_eth_ring *tx_ring, struct al_eth_tx_buffer *tx_info,
struct al_eth_pkt *hal_pkt, struct mbuf *m)
{
uint32_t mss = m->m_pkthdr.tso_segsz;
struct ether_vlan_header *eh;
uint16_t etype;
struct ip *ip;
struct ip6_hdr *ip6;
struct tcphdr *th = NULL;
int ehdrlen, ip_hlen = 0;
uint8_t ipproto = 0;
uint32_t offload = 0;
if (mss != 0)
offload = 1;
if ((m->m_pkthdr.csum_flags & CSUM_TSO) != 0)
offload = 1;
if ((m->m_pkthdr.csum_flags & CSUM_OFFLOAD) != 0)
offload = 1;
if (offload != 0) {
struct al_eth_meta_data *meta = &tx_ring->hal_meta;
if (mss != 0)
hal_pkt->flags |= (AL_ETH_TX_FLAGS_TSO |
AL_ETH_TX_FLAGS_L4_CSUM);
else
hal_pkt->flags |= (AL_ETH_TX_FLAGS_L4_CSUM |
AL_ETH_TX_FLAGS_L4_PARTIAL_CSUM);
/*
* Determine where frame payload starts.
* Jump over vlan headers if already present,
* helpful for QinQ too.
*/
eh = mtod(m, struct ether_vlan_header *);
if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
etype = ntohs(eh->evl_proto);
ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
} else {
etype = ntohs(eh->evl_encap_proto);
ehdrlen = ETHER_HDR_LEN;
}
switch (etype) {
case ETHERTYPE_IP:
ip = (struct ip *)(m->m_data + ehdrlen);
ip_hlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
hal_pkt->l3_proto_idx = AL_ETH_PROTO_ID_IPv4;
th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
if (mss != 0)
hal_pkt->flags |= AL_ETH_TX_FLAGS_IPV4_L3_CSUM;
if (ipproto == IPPROTO_TCP)
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_TCP;
else
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_UDP;
break;
case ETHERTYPE_IPV6:
ip6 = (struct ip6_hdr *)(m->m_data + ehdrlen);
hal_pkt->l3_proto_idx = AL_ETH_PROTO_ID_IPv6;
ip_hlen = sizeof(struct ip6_hdr);
th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
ipproto = ip6->ip6_nxt;
if (ipproto == IPPROTO_TCP)
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_TCP;
else
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_UDP;
break;
default:
break;
}
meta->words_valid = 4;
meta->l3_header_len = ip_hlen;
meta->l3_header_offset = ehdrlen;
if (th != NULL)
meta->l4_header_len = th->th_off; /* this param needed only for TSO */
meta->mss_idx_sel = 0; /* check how to select MSS */
meta->mss_val = mss;
hal_pkt->meta = meta;
} else
hal_pkt->meta = NULL;
}
#define XMIT_QUEUE_TIMEOUT 100
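/*
 * al_eth_xmit_mbuf - map an mbuf chain for DMA (defragmenting once on
 * EFBIG), fill the HAL packet and hand it to the TX UDMA queue; the ring is
 * stalled when descriptor space runs low.
 */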
static void
al_eth_xmit_mbuf(struct al_eth_ring *tx_ring, struct mbuf *m)
{
struct al_eth_tx_buffer *tx_info;
int error;
int nsegs, a;
uint16_t next_to_use;
bus_dma_segment_t segs[AL_ETH_PKT_MAX_BUFS + 1];
struct al_eth_pkt *hal_pkt;
struct al_buf *al_buf;
boolean_t remap;
/* Check if queue is ready */
if (unlikely(tx_ring->stall) != 0) {
for (a = 0; a < XMIT_QUEUE_TIMEOUT; a++) {
if (al_udma_available_get(tx_ring->dma_q) >=
(AL_ETH_DEFAULT_TX_HW_DESCS -
AL_ETH_TX_WAKEUP_THRESH)) {
tx_ring->stall = 0;
break;
}
pause("stall", 1);
}
if (a == XMIT_QUEUE_TIMEOUT) {
device_printf(tx_ring->dev,
"timeout waiting for queue %d ready!\n",
tx_ring->ring_id);
return;
} else {
device_printf_dbg(tx_ring->dev,
"queue %d is ready!\n", tx_ring->ring_id);
}
}
next_to_use = tx_ring->next_to_use;
tx_info = &tx_ring->tx_buffer_info[next_to_use];
tx_info->m = m;
hal_pkt = &tx_info->hal_pkt;
if (m == NULL) {
device_printf(tx_ring->dev, "mbuf is NULL\n");
return;
}
remap = TRUE;
/* Map packets for DMA */
retry:
error = bus_dmamap_load_mbuf_sg(tx_ring->dma_buf_tag, tx_info->dma_map,
m, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(error)) {
struct mbuf *m_new;
if (error == EFBIG) {
/* Defragment the mbuf and retry once */
if (remap == TRUE) {
remap = FALSE;
m_new = m_defrag(m, M_NOWAIT);
if (m_new == NULL) {
device_printf(tx_ring->dev,
"failed to defrag mbuf\n");
goto exit;
}
m = m_new;
goto retry;
} else {
device_printf(tx_ring->dev,
"failed to map mbuf, error %d\n", error);
goto exit;
}
} else {
device_printf(tx_ring->dev,
"failed to map mbuf, error %d\n", error);
goto exit;
}
}
/* set flags and meta data */
hal_pkt->flags = AL_ETH_TX_FLAGS_INT;
al_eth_tx_csum(tx_ring, tx_info, hal_pkt, m);
al_buf = hal_pkt->bufs;
for (a = 0; a < nsegs; a++) {
al_buf->addr = segs[a].ds_addr;
al_buf->len = segs[a].ds_len;
al_buf++;
}
hal_pkt->num_of_bufs = nsegs;
/* prepare the packet's descriptors for the dma engine */
tx_info->tx_descs = al_eth_tx_pkt_prepare(tx_ring->dma_q, hal_pkt);
if (tx_info->tx_descs == 0)
goto exit;
/*
* stop the queue when no more space available, the packet can have up
* to AL_ETH_PKT_MAX_BUFS + 1 buffers and a meta descriptor
*/
if (unlikely(al_udma_available_get(tx_ring->dma_q) <
(AL_ETH_PKT_MAX_BUFS + 2))) {
tx_ring->stall = 1;
device_printf_dbg(tx_ring->dev, "stall, stopping queue %d...\n",
tx_ring->ring_id);
al_data_memory_barrier();
}
tx_ring->next_to_use = AL_ETH_TX_RING_IDX_NEXT(tx_ring, next_to_use);
/* trigger the dma engine */
al_eth_tx_dma_action(tx_ring->dma_q, tx_info->tx_descs);
return;
exit:
m_freem(m);
}
static void
al_eth_tx_cmpl_work(void *arg, int pending)
{
struct al_eth_ring *tx_ring = arg;
if (napi != 0) {
tx_ring->cmpl_is_running = 1;
al_data_memory_barrier();
}
al_eth_tx_do_cleanup(tx_ring);
if (napi != 0) {
tx_ring->cmpl_is_running = 0;
al_data_memory_barrier();
}
/* all work done, enable IRQs */
al_eth_irq_config(tx_ring->unmask_reg_offset, tx_ring->unmask_val);
}
static int
al_eth_tx_cmlp_irq_filter(void *arg)
{
struct al_eth_ring *tx_ring = arg;
/* Interrupt should be auto-masked upon arrival */
device_printf_dbg(tx_ring->dev, "%s for ring ID = %d\n", __func__,
tx_ring->ring_id);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || (napi && tx_ring->cmpl_is_running == 0))
taskqueue_enqueue(tx_ring->cmpl_tq, &tx_ring->cmpl_task);
/* Do not run bottom half */
return (FILTER_HANDLED);
}
static int
al_eth_rx_recv_irq_filter(void *arg)
{
struct al_eth_ring *rx_ring = arg;
/* Interrupt should be auto-masked upon arrival */
device_printf_dbg(rx_ring->dev, "%s for ring ID = %d\n", __func__,
rx_ring->ring_id);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || (napi && rx_ring->enqueue_is_running == 0))
taskqueue_enqueue(rx_ring->enqueue_tq, &rx_ring->enqueue_task);
/* Do not run bottom half */
return (FILTER_HANDLED);
}
/*
* al_eth_rx_checksum - indicate in mbuf if hw indicated a good cksum
* @adapter: structure containing adapter specific data
* @hal_pkt: HAL structure for the packet
* @mbuf: mbuf currently being received and modified
*/
static inline void
al_eth_rx_checksum(struct al_eth_adapter *adapter,
struct al_eth_pkt *hal_pkt, struct mbuf *mbuf)
{
/* if IPv4 and error */
if (unlikely((adapter->netdev->if_capenable & IFCAP_RXCSUM) &&
(hal_pkt->l3_proto_idx == AL_ETH_PROTO_ID_IPv4) &&
(hal_pkt->flags & AL_ETH_RX_FLAGS_L3_CSUM_ERR))) {
device_printf(adapter->dev,"rx ipv4 header checksum error\n");
return;
}
/* if IPv6 and error */
if (unlikely((adapter->netdev->if_capenable & IFCAP_RXCSUM_IPV6) &&
(hal_pkt->l3_proto_idx == AL_ETH_PROTO_ID_IPv6) &&
(hal_pkt->flags & AL_ETH_RX_FLAGS_L3_CSUM_ERR))) {
device_printf(adapter->dev,"rx ipv6 header checksum error\n");
return;
}
/* if TCP/UDP */
if (likely((hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_TCP) ||
(hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_UDP))) {
if (unlikely(hal_pkt->flags & AL_ETH_RX_FLAGS_L4_CSUM_ERR)) {
device_printf_dbg(adapter->dev, "rx L4 checksum error\n");
/* TCP/UDP checksum error */
mbuf->m_pkthdr.csum_flags = 0;
} else {
device_printf_dbg(adapter->dev, "rx checksum correct\n");
/* IP Checksum Good */
mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID;
}
}
}
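/*
 * al_eth_rx_mbuf - turn a received HAL packet into an mbuf; packets up to
 * small_copy_len are copied into a fresh header mbuf so that the original
 * cluster stays in the ring and can be reused.
 */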
static struct mbuf*
al_eth_rx_mbuf(struct al_eth_adapter *adapter,
struct al_eth_ring *rx_ring, struct al_eth_pkt *hal_pkt,
unsigned int descs, uint16_t *next_to_clean)
{
struct mbuf *mbuf;
struct al_eth_rx_buffer *rx_info =
&rx_ring->rx_buffer_info[*next_to_clean];
unsigned int len;
len = hal_pkt->bufs[0].len;
device_printf_dbg(adapter->dev, "rx_info %p data %p\n", rx_info,
rx_info->m);
if (rx_info->m == NULL) {
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring,
*next_to_clean);
return (NULL);
}
mbuf = rx_info->m;
mbuf->m_pkthdr.len = len;
mbuf->m_len = len;
mbuf->m_pkthdr.rcvif = rx_ring->netdev;
mbuf->m_flags |= M_PKTHDR;
if (len <= adapter->small_copy_len) {
struct mbuf *smbuf;
device_printf_dbg(adapter->dev, "rx small packet. len %d\n", len);
AL_RX_LOCK(adapter);
smbuf = m_gethdr(M_NOWAIT, MT_DATA);
AL_RX_UNLOCK(adapter);
if (__predict_false(smbuf == NULL)) {
device_printf(adapter->dev, "smbuf is NULL\n");
return (NULL);
}
smbuf->m_data = smbuf->m_data + AL_IP_ALIGNMENT_OFFSET;
memcpy(smbuf->m_data, mbuf->m_data + AL_IP_ALIGNMENT_OFFSET, len);
smbuf->m_len = len;
smbuf->m_pkthdr.rcvif = rx_ring->netdev;
/* first desc of a non-ps chain */
smbuf->m_flags |= M_PKTHDR;
smbuf->m_pkthdr.len = smbuf->m_len;
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring,
*next_to_clean);
return (smbuf);
}
mbuf->m_data = mbuf->m_data + AL_IP_ALIGNMENT_OFFSET;
/* Unmap the buffer */
bus_dmamap_unload(rx_ring->dma_buf_tag, rx_info->dma_map);
rx_info->m = NULL;
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring, *next_to_clean);
return (mbuf);
}
static void
al_eth_rx_recv_work(void *arg, int pending)
{
struct al_eth_ring *rx_ring = arg;
struct mbuf *mbuf;
struct lro_entry *queued;
unsigned int qid = rx_ring->ring_id;
struct al_eth_pkt *hal_pkt = &rx_ring->hal_pkt;
uint16_t next_to_clean = rx_ring->next_to_clean;
uint32_t refill_required;
uint32_t refill_actual;
uint32_t do_if_input;
if (napi != 0) {
rx_ring->enqueue_is_running = 1;
al_data_memory_barrier();
}
do {
unsigned int descs;
descs = al_eth_pkt_rx(rx_ring->dma_q, hal_pkt);
if (unlikely(descs == 0))
break;
device_printf_dbg(rx_ring->dev, "rx_poll: q %d got packet "
"from hal. descs %d\n", qid, descs);
device_printf_dbg(rx_ring->dev, "rx_poll: q %d flags %x. "
"l3 proto %d l4 proto %d\n", qid, hal_pkt->flags,
hal_pkt->l3_proto_idx, hal_pkt->l4_proto_idx);
/* skip the packet if dma or eth controller errors were detected */
if ((hal_pkt->flags & (AL_ETH_RX_ERROR |
AL_UDMA_CDESC_ERROR)) != 0) {
device_printf(rx_ring->dev, "receive packet with error. "
"flags = 0x%x\n", hal_pkt->flags);
next_to_clean = AL_ETH_RX_RING_IDX_ADD(rx_ring,
next_to_clean, descs);
continue;
}
/* allocate mbuf and fill it */
mbuf = al_eth_rx_mbuf(rx_ring->adapter, rx_ring, hal_pkt, descs,
&next_to_clean);
/* exit if we failed to retrieve a buffer */
if (unlikely(mbuf == NULL)) {
next_to_clean = AL_ETH_RX_RING_IDX_ADD(rx_ring,
next_to_clean, descs);
break;
}
if (__predict_true(rx_ring->netdev->if_capenable & IFCAP_RXCSUM ||
rx_ring->netdev->if_capenable & IFCAP_RXCSUM_IPV6)) {
al_eth_rx_checksum(rx_ring->adapter, hal_pkt, mbuf);
}
#if __FreeBSD_version >= 800000
mbuf->m_pkthdr.flowid = qid;
M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE);
#endif
/*
* LRO is done only for IP/TCP packets whose checksum has been
* verified by the hardware.
*/
do_if_input = 1;
if ((rx_ring->lro_enabled != 0) &&
((mbuf->m_pkthdr.csum_flags & CSUM_IP_VALID) != 0) &&
hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_TCP) {
/*
* Send to the stack if:
* - LRO not enabled, or
* - no LRO resources, or
* - lro enqueue fails
*/
if (rx_ring->lro.lro_cnt != 0) {
if (tcp_lro_rx(&rx_ring->lro, mbuf, 0) == 0)
do_if_input = 0;
}
}
if (do_if_input)
(*rx_ring->netdev->if_input)(rx_ring->netdev, mbuf);
} while (1);
rx_ring->next_to_clean = next_to_clean;
refill_required = al_udma_available_get(rx_ring->dma_q);
refill_actual = al_eth_refill_rx_bufs(rx_ring->adapter, qid,
refill_required);
if (unlikely(refill_actual < refill_required)) {
device_printf_dbg(rx_ring->dev,
"%s: not filling rx queue %d\n", __func__, qid);
}
while (((queued = LIST_FIRST(&rx_ring->lro.lro_active)) != NULL)) {
LIST_REMOVE(queued, next);
tcp_lro_flush(&rx_ring->lro, queued);
}
if (napi != 0) {
rx_ring->enqueue_is_running = 0;
al_data_memory_barrier();
}
/* unmask irq */
al_eth_irq_config(rx_ring->unmask_reg_offset, rx_ring->unmask_val);
}
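/*
 * al_eth_start_xmit - enqueue taskqueue handler: drain the queue's buf_ring
 * and transmit every dequeued mbuf; with napi enabled the ring is drained
 * once more after clearing enqueue_is_running.
 */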
static void
al_eth_start_xmit(void *arg, int pending)
{
struct al_eth_ring *tx_ring = arg;
struct mbuf *mbuf;
if (napi != 0) {
tx_ring->enqueue_is_running = 1;
al_data_memory_barrier();
}
while (1) {
mtx_lock(&tx_ring->br_mtx);
mbuf = drbr_dequeue(NULL, tx_ring->br);
mtx_unlock(&tx_ring->br_mtx);
if (mbuf == NULL)
break;
al_eth_xmit_mbuf(tx_ring, mbuf);
}
if (napi != 0) {
tx_ring->enqueue_is_running = 0;
al_data_memory_barrier();
while (1) {
mtx_lock(&tx_ring->br_mtx);
mbuf = drbr_dequeue(NULL, tx_ring->br);
mtx_unlock(&tx_ring->br_mtx);
if (mbuf == NULL)
break;
al_eth_xmit_mbuf(tx_ring, mbuf);
}
}
}
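/*
 * al_mq_start - if_transmit method: pick a TX queue from the mbuf flow id
 * (or the current CPU), enqueue the mbuf on that queue's buf_ring and kick
 * the enqueue task.
 */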
static int
al_mq_start(struct ifnet *ifp, struct mbuf *m)
{
struct al_eth_adapter *adapter = ifp->if_softc;
struct al_eth_ring *tx_ring;
int i;
int ret;
/* Which queue to use */
if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
i = m->m_pkthdr.flowid % adapter->num_tx_queues;
else
i = curcpu % adapter->num_tx_queues;
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) {
return (EFAULT);
}
tx_ring = &adapter->tx_ring[i];
device_printf_dbg(adapter->dev, "dgb start() - assuming link is active, "
"sending packet to queue %d\n", i);
ret = drbr_enqueue(ifp, tx_ring->br, m);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || ((napi != 0) && (tx_ring->enqueue_is_running == 0)))
taskqueue_enqueue(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
return (ret);
}
static void
al_qflush(struct ifnet * ifp)
{
/* unused */
}
static inline void
al_eth_flow_ctrl_init(struct al_eth_adapter *adapter)
{
uint8_t default_flow_ctrl;
default_flow_ctrl = AL_ETH_FLOW_CTRL_TX_PAUSE;
default_flow_ctrl |= AL_ETH_FLOW_CTRL_RX_PAUSE;
adapter->link_config.flow_ctrl_supported = default_flow_ctrl;
}
static int
al_eth_flow_ctrl_config(struct al_eth_adapter *adapter)
{
struct al_eth_flow_control_params *flow_ctrl_params;
uint8_t active = adapter->link_config.flow_ctrl_active;
int i;
flow_ctrl_params = &adapter->flow_ctrl_params;
flow_ctrl_params->type = AL_ETH_FLOW_CONTROL_TYPE_LINK_PAUSE;
flow_ctrl_params->obay_enable =
((active & AL_ETH_FLOW_CTRL_RX_PAUSE) != 0);
flow_ctrl_params->gen_enable =
((active & AL_ETH_FLOW_CTRL_TX_PAUSE) != 0);
flow_ctrl_params->rx_fifo_th_high = AL_ETH_FLOW_CTRL_RX_FIFO_TH_HIGH;
flow_ctrl_params->rx_fifo_th_low = AL_ETH_FLOW_CTRL_RX_FIFO_TH_LOW;
flow_ctrl_params->quanta = AL_ETH_FLOW_CTRL_QUANTA;
flow_ctrl_params->quanta_th = AL_ETH_FLOW_CTRL_QUANTA_TH;
/* map priority to queue index, queue id = priority/2 */
for (i = 0; i < AL_ETH_FWD_PRIO_TABLE_NUM; i++)
flow_ctrl_params->prio_q_map[0][i] = 1 << (i >> 1);
al_eth_flow_control_config(&adapter->hal_adapter, flow_ctrl_params);
return (0);
}
static void
al_eth_flow_ctrl_enable(struct al_eth_adapter *adapter)
{
/*
* Set the active flow control configuration to the default (or the
* user-forced value) and apply it.
*/
adapter->link_config.flow_ctrl_active =
adapter->link_config.flow_ctrl_supported;
al_eth_flow_ctrl_config(adapter);
}
static void
al_eth_flow_ctrl_disable(struct al_eth_adapter *adapter)
{
adapter->link_config.flow_ctrl_active = 0;
al_eth_flow_ctrl_config(adapter);
}
static int
al_eth_hw_init(struct al_eth_adapter *adapter)
{
int rc;
rc = al_eth_hw_init_adapter(adapter);
if (rc != 0)
return (rc);
rc = al_eth_mac_config(&adapter->hal_adapter, adapter->mac_mode);
if (rc < 0) {
device_printf(adapter->dev, "%s failed to configure mac!\n",
__func__);
return (rc);
}
if ((adapter->mac_mode == AL_ETH_MAC_MODE_SGMII) ||
(adapter->mac_mode == AL_ETH_MAC_MODE_RGMII &&
adapter->phy_exist == FALSE)) {
rc = al_eth_mac_link_config(&adapter->hal_adapter,
adapter->link_config.force_1000_base_x,
adapter->link_config.autoneg,
adapter->link_config.active_speed,
adapter->link_config.active_duplex);
if (rc != 0) {
device_printf(adapter->dev,
"%s failed to configure link parameters!\n",
__func__);
return (rc);
}
}
rc = al_eth_mdio_config(&adapter->hal_adapter,
AL_ETH_MDIO_TYPE_CLAUSE_22, TRUE /* shared_mdio_if */,
adapter->ref_clk_freq, adapter->mdio_freq);
if (rc != 0) {
device_printf(adapter->dev, "%s failed at mdio config!\n",
__func__);
return (rc);
}
al_eth_flow_ctrl_init(adapter);
return (rc);
}
static int
al_eth_hw_stop(struct al_eth_adapter *adapter)
{
al_eth_mac_stop(&adapter->hal_adapter);
/*
* Wait until pending rx packets are written and the UDMA becomes
* idle; the MAC has a ~10KB fifo, so 10us should be enough time for
* the UDMA to write them to memory.
*/
DELAY(10);
al_eth_adapter_stop(&adapter->hal_adapter);
adapter->flags |= AL_ETH_FLAG_RESET_REQUESTED;
/* disable flow ctrl to avoid pause packets */
al_eth_flow_ctrl_disable(adapter);
return (0);
}
/*
* al_eth_intr_intx_all - Legacy Interrupt Handler for all interrupts
* @irq: interrupt number
* @data: pointer to a network interface device structure
*/
static int
al_eth_intr_intx_all(void *data)
{
struct al_eth_adapter *adapter = data;
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
uint32_t reg;
reg = al_udma_iofic_read_cause(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A);
if (likely(reg))
device_printf_dbg(adapter->dev, "%s group A cause %x\n",
__func__, reg);
if (unlikely(reg & AL_INT_GROUP_A_GROUP_D_SUM)) {
struct al_iofic_grp_ctrl __iomem *sec_ints_base;
uint32_t cause_d = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_D);
sec_ints_base =
&regs_base->gen.interrupt_regs.secondary_iofic_ctrl[0];
if (cause_d != 0) {
device_printf_dbg(adapter->dev,
"got interrupt from group D. cause %x\n", cause_d);
cause_d = al_iofic_read_cause(sec_ints_base,
AL_INT_GROUP_A);
device_printf(adapter->dev,
"secondary A cause %x\n", cause_d);
cause_d = al_iofic_read_cause(sec_ints_base,
AL_INT_GROUP_B);
device_printf_dbg(adapter->dev,
"secondary B cause %x\n", cause_d);
}
}
if ((reg & AL_INT_GROUP_A_GROUP_B_SUM) != 0 ) {
uint32_t cause_b = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_B);
int qid;
device_printf_dbg(adapter->dev, "secondary B cause %x\n",
cause_b);
for (qid = 0; qid < adapter->num_rx_queues; qid++) {
if (cause_b & (1 << qid)) {
/* mask */
al_udma_iofic_mask(
(struct unit_regs __iomem *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, 1 << qid);
}
}
}
if ((reg & AL_INT_GROUP_A_GROUP_C_SUM) != 0) {
uint32_t cause_c = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_C);
int qid;
device_printf_dbg(adapter->dev, "secondary C cause %x\n", cause_c);
for (qid = 0; qid < adapter->num_tx_queues; qid++) {
if ((cause_c & (1 << qid)) != 0) {
al_udma_iofic_mask(
(struct unit_regs __iomem *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, 1 << qid);
}
}
}
al_eth_tx_cmlp_irq_filter(adapter->tx_ring);
return (0);
}
static int
al_eth_intr_msix_all(void *data)
{
struct al_eth_adapter *adapter = data;
device_printf_dbg(adapter->dev, "%s\n", __func__);
return (0);
}
static int
al_eth_intr_msix_mgmt(void *data)
{
struct al_eth_adapter *adapter = data;
device_printf_dbg(adapter->dev, "%s\n", __func__);
return (0);
}
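/*
 * al_eth_enable_msix - allocate MSI-X vectors: one management vector plus
 * one per RX queue and one per TX queue, using MSI-X table entries starting
 * at 2.
 */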
static int
al_eth_enable_msix(struct al_eth_adapter *adapter)
{
int i, msix_vecs, rc, count;
device_printf_dbg(adapter->dev, "%s\n", __func__);
msix_vecs = 1 + adapter->num_rx_queues + adapter->num_tx_queues;
device_printf_dbg(adapter->dev,
"Try to enable MSIX, vector numbers = %d\n", msix_vecs);
adapter->msix_entries = malloc(msix_vecs*sizeof(*adapter->msix_entries),
M_IFAL, M_ZERO | M_WAITOK);
if (adapter->msix_entries == NULL) {
device_printf_dbg(adapter->dev, "failed to allocate"
" msix_entries %d\n", msix_vecs);
rc = ENOMEM;
goto exit;
}
/* management vector (GROUP_A) @2 */
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].entry = 2;
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector = 0;
/* rx queues start @3 */
for (i = 0; i < adapter->num_rx_queues; i++) {
int irq_idx = AL_ETH_RXQ_IRQ_IDX(adapter, i);
adapter->msix_entries[irq_idx].entry = 3 + i;
adapter->msix_entries[irq_idx].vector = 0;
}
/* tx queues start @7 */
for (i = 0; i < adapter->num_tx_queues; i++) {
int irq_idx = AL_ETH_TXQ_IRQ_IDX(adapter, i);
adapter->msix_entries[irq_idx].entry = 3 +
AL_ETH_MAX_HW_QUEUES + i;
adapter->msix_entries[irq_idx].vector = 0;
}
count = msix_vecs + 2; /* entries start from 2 */
rc = pci_alloc_msix(adapter->dev, &count);
if (rc != 0) {
device_printf_dbg(adapter->dev, "failed to allocate MSIX "
"vectors %d\n", msix_vecs+2);
device_printf_dbg(adapter->dev, "ret = %d\n", rc);
goto msix_entries_exit;
}
if (count != msix_vecs + 2) {
device_printf_dbg(adapter->dev, "failed to allocate all MSIX "
"vectors %d, allocated %d\n", msix_vecs+2, count);
rc = ENOSPC;
goto msix_entries_exit;
}
for (i = 0; i < msix_vecs; i++)
adapter->msix_entries[i].vector = 2 + 1 + i;
device_printf_dbg(adapter->dev, "successfully enabled MSIX,"
" vectors %d\n", msix_vecs);
adapter->msix_vecs = msix_vecs;
adapter->flags |= AL_ETH_FLAG_MSIX_ENABLED;
goto exit;
msix_entries_exit:
adapter->msix_vecs = 0;
free(adapter->msix_entries, M_IFAL);
adapter->msix_entries = NULL;
exit:
return (rc);
}
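/*
 * al_eth_setup_int_mode - fill the irq table according to how many MSI-X
 * vectors were obtained: per-queue MSI-X, a single shared MSI-X vector, or
 * the legacy INTx fallback.
 */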
static int
al_eth_setup_int_mode(struct al_eth_adapter *adapter)
{
int i, rc;
rc = al_eth_enable_msix(adapter);
if (rc != 0) {
device_printf(adapter->dev, "Failed to enable MSIX mode.\n");
return (rc);
}
adapter->irq_vecs = max(1, adapter->msix_vecs);
/* single INTX mode */
if (adapter->msix_vecs == 0) {
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name,
AL_ETH_IRQNAME_SIZE, "al-eth-intx-all@pci:%s",
device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler =
al_eth_intr_intx_all;
/* IRQ vector will be resolved from device resources */
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector = 0;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
device_printf(adapter->dev, "%s and vector %d \n", __func__,
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector);
return (0);
}
/* single MSI-X mode */
if (adapter->msix_vecs == 1) {
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name,
AL_ETH_IRQNAME_SIZE, "al-eth-msix-all@pci:%s",
device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler =
al_eth_intr_msix_all;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector =
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
return (0);
}
/* MSI-X per queue */
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name, AL_ETH_IRQNAME_SIZE,
"al-eth-msix-mgmt@pci:%s", device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler = al_eth_intr_msix_mgmt;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector =
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector;
for (i = 0; i < adapter->num_rx_queues; i++) {
int irq_idx = AL_ETH_RXQ_IRQ_IDX(adapter, i);
snprintf(adapter->irq_tbl[irq_idx].name, AL_ETH_IRQNAME_SIZE,
"al-eth-rx-comp-%d@pci:%s", i,
device_get_name(adapter->dev));
adapter->irq_tbl[irq_idx].handler = al_eth_rx_recv_irq_filter;
adapter->irq_tbl[irq_idx].data = &adapter->rx_ring[i];
adapter->irq_tbl[irq_idx].vector =
adapter->msix_entries[irq_idx].vector;
}
for (i = 0; i < adapter->num_tx_queues; i++) {
int irq_idx = AL_ETH_TXQ_IRQ_IDX(adapter, i);
snprintf(adapter->irq_tbl[irq_idx].name,
AL_ETH_IRQNAME_SIZE, "al-eth-tx-comp-%d@pci:%s", i,
device_get_name(adapter->dev));
adapter->irq_tbl[irq_idx].handler = al_eth_tx_cmlp_irq_filter;
adapter->irq_tbl[irq_idx].data = &adapter->tx_ring[i];
adapter->irq_tbl[irq_idx].vector =
adapter->msix_entries[irq_idx].vector;
}
return (0);
}
static void
__al_eth_free_irq(struct al_eth_adapter *adapter)
{
struct al_eth_irq *irq;
int i, rc;
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->requested != 0) {
device_printf_dbg(adapter->dev, "tear down irq: %d\n",
irq->vector);
rc = bus_teardown_intr(adapter->dev, irq->res,
irq->cookie);
if (rc != 0)
device_printf(adapter->dev, "failed to tear "
"down irq: %d\n", irq->vector);
}
irq->requested = 0;
}
}
static void
al_eth_free_irq(struct al_eth_adapter *adapter)
{
struct al_eth_irq *irq;
int i, rc;
#ifdef CONFIG_RFS_ACCEL
if (adapter->msix_vecs >= 1) {
free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
adapter->netdev->rx_cpu_rmap = NULL;
}
#endif
__al_eth_free_irq(adapter);
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->res == NULL)
continue;
device_printf_dbg(adapter->dev, "release resource irq: %d\n",
irq->vector);
rc = bus_release_resource(adapter->dev, SYS_RES_IRQ, irq->vector,
irq->res);
irq->res = NULL;
if (rc != 0)
device_printf(adapter->dev, "dev has no parent while "
"releasing res for irq: %d\n", irq->vector);
}
pci_release_msi(adapter->dev);
adapter->flags &= ~AL_ETH_FLAG_MSIX_ENABLED;
adapter->msix_vecs = 0;
free(adapter->msix_entries, M_IFAL);
adapter->msix_entries = NULL;
}
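/*
 * al_eth_request_irq - allocate and set up every interrupt described in the
 * irq table, rolling back already-registered handlers and resources on
 * failure.
 */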
static int
al_eth_request_irq(struct al_eth_adapter *adapter)
{
unsigned long flags;
struct al_eth_irq *irq;
int rc = 0, i, v;
if ((adapter->flags & AL_ETH_FLAG_MSIX_ENABLED) != 0)
flags = RF_ACTIVE;
else
flags = RF_ACTIVE | RF_SHAREABLE;
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->requested != 0)
continue;
irq->res = bus_alloc_resource_any(adapter->dev, SYS_RES_IRQ,
&irq->vector, flags);
if (irq->res == NULL) {
device_printf(adapter->dev, "could not allocate "
"irq vector=%d\n", irq->vector);
rc = ENXIO;
goto exit_res;
}
if ((rc = bus_setup_intr(adapter->dev, irq->res,
INTR_TYPE_NET | INTR_MPSAFE, irq->handler,
NULL, irq->data, &irq->cookie)) != 0) {
device_printf(adapter->dev, "failed to register "
"interrupt handler for irq %ju: %d\n",
(uintmax_t)rman_get_start(irq->res), rc);
goto exit_intr;
}
irq->requested = 1;
}
goto exit;
exit_intr:
/* start from i - 1 because we omit the operation that failed */
for (v = i - 1; v >= 0; v--) {
int bti;
irq = &adapter->irq_tbl[v];
bti = bus_teardown_intr(adapter->dev, irq->res, irq->cookie);
if (bti != 0) {
device_printf(adapter->dev, "failed to tear "
"down irq: %d\n", irq->vector);
}
irq->requested = 0;
device_printf_dbg(adapter->dev, "exit_intr: releasing irq %d\n",
irq->vector);
}
exit_res:
/* start from i - 1 because we omit the operation that failed */
for (v = i - 1; v >= 0; v--) {
int brr;
irq = &adapter->irq_tbl[v];
device_printf_dbg(adapter->dev, "exit_res: releasing resource"
" for irq %d\n", irq->vector);
brr = bus_release_resource(adapter->dev, SYS_RES_IRQ,
irq->vector, irq->res);
if (brr != 0)
device_printf(adapter->dev, "dev has no parent while "
"releasing res for irq: %d\n", irq->vector);
irq->res = NULL;
}
exit:
return (rc);
}
/**
* al_eth_setup_tx_resources - allocate Tx resources (Descriptors)
* @adapter: network interface device structure
* @qid: queue index
*
* Return 0 on success, or an errno on failure
**/
static int
al_eth_setup_tx_resources(struct al_eth_adapter *adapter, int qid)
{
struct al_eth_ring *tx_ring = &adapter->tx_ring[qid];
struct device *dev = tx_ring->dev;
struct al_udma_q_params *q_params = &tx_ring->q_params;
int size;
int ret;
if (adapter->up)
return (0);
size = sizeof(struct al_eth_tx_buffer) * tx_ring->sw_count;
tx_ring->tx_buffer_info = malloc(size, M_IFAL, M_ZERO | M_WAITOK);
if (tx_ring->tx_buffer_info == NULL)
return (ENOMEM);
tx_ring->descs_size = tx_ring->hw_count * sizeof(union al_udma_desc);
q_params->size = tx_ring->hw_count;
ret = al_dma_alloc_coherent(dev, &q_params->desc_phy_base_tag,
(bus_dmamap_t *)&q_params->desc_phy_base_map,
(bus_addr_t *)&q_params->desc_phy_base,
(void**)&q_params->desc_base, tx_ring->descs_size);
if (ret != 0) {
device_printf(dev, "failed to al_dma_alloc_coherent,"
" ret = %d\n", ret);
return (ENOMEM);
}
if (q_params->desc_base == NULL)
return (ENOMEM);
device_printf_dbg(dev, "Initializing ring queues %d\n", qid);
/* Allocate Ring Queue */
mtx_init(&tx_ring->br_mtx, "AlRingMtx", NULL, MTX_DEF);
tx_ring->br = buf_ring_alloc(AL_BR_SIZE, M_DEVBUF, M_WAITOK,
&tx_ring->br_mtx);
if (tx_ring->br == NULL) {
device_printf(dev, "Critical Failure setting up buf ring\n");
return (ENOMEM);
}
/* Allocate taskqueues */
TASK_INIT(&tx_ring->enqueue_task, 0, al_eth_start_xmit, tx_ring);
tx_ring->enqueue_tq = taskqueue_create_fast("al_tx_enque", M_NOWAIT,
taskqueue_thread_enqueue, &tx_ring->enqueue_tq);
taskqueue_start_threads(&tx_ring->enqueue_tq, 1, PI_NET, "%s txeq",
device_get_nameunit(adapter->dev));
TASK_INIT(&tx_ring->cmpl_task, 0, al_eth_tx_cmpl_work, tx_ring);
tx_ring->cmpl_tq = taskqueue_create_fast("al_tx_cmpl", M_NOWAIT,
taskqueue_thread_enqueue, &tx_ring->cmpl_tq);
taskqueue_start_threads(&tx_ring->cmpl_tq, 1, PI_REALTIME, "%s txcq",
device_get_nameunit(adapter->dev));
/* Setup DMA descriptor areas. */
ret = bus_dma_tag_create(bus_get_dma_tag(dev),
1, 0, /* alignment, bounds */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
AL_TSO_SIZE, /* maxsize */
AL_ETH_PKT_MAX_BUFS, /* nsegments */
PAGE_SIZE, /* maxsegsize */
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
&tx_ring->dma_buf_tag);
if (ret != 0) {
device_printf(dev,"Unable to allocate dma_buf_tag, ret = %d\n",
ret);
return (ret);
}
for (size = 0; size < tx_ring->sw_count; size++) {
ret = bus_dmamap_create(tx_ring->dma_buf_tag, 0,
&tx_ring->tx_buffer_info[size].dma_map);
if (ret != 0) {
device_printf(dev, "Unable to map DMA TX "
"buffer memory [iter=%d]\n", size);
return (ret);
}
}
/* completion queue not used for tx */
q_params->cdesc_base = NULL;
/* size in bytes of the udma completion ring descriptor */
q_params->cdesc_size = 8;
tx_ring->next_to_use = 0;
tx_ring->next_to_clean = 0;
return (0);
}
/*
* al_eth_free_tx_resources - Free Tx Resources per Queue
* @adapter: network interface device structure
* @qid: queue index
*
* Free all transmit software resources
*/
static void
al_eth_free_tx_resources(struct al_eth_adapter *adapter, int qid)
{
struct al_eth_ring *tx_ring = &adapter->tx_ring[qid];
struct al_udma_q_params *q_params = &tx_ring->q_params;
int size;
/* At this point the interrupt handlers must already be deactivated */
while (taskqueue_cancel(tx_ring->cmpl_tq, &tx_ring->cmpl_task, NULL))
taskqueue_drain(tx_ring->cmpl_tq, &tx_ring->cmpl_task);
taskqueue_free(tx_ring->cmpl_tq);
while (taskqueue_cancel(tx_ring->enqueue_tq,
&tx_ring->enqueue_task, NULL)) {
taskqueue_drain(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
}
taskqueue_free(tx_ring->enqueue_tq);
if (tx_ring->br != NULL) {
drbr_flush(adapter->netdev, tx_ring->br);
buf_ring_free(tx_ring->br, M_DEVBUF);
}
for (size = 0; size < tx_ring->sw_count; size++) {
m_freem(tx_ring->tx_buffer_info[size].m);
tx_ring->tx_buffer_info[size].m = NULL;
bus_dmamap_unload(tx_ring->dma_buf_tag,
tx_ring->tx_buffer_info[size].dma_map);
bus_dmamap_destroy(tx_ring->dma_buf_tag,
tx_ring->tx_buffer_info[size].dma_map);
}
bus_dma_tag_destroy(tx_ring->dma_buf_tag);
free(tx_ring->tx_buffer_info, M_IFAL);
tx_ring->tx_buffer_info = NULL;
mtx_destroy(&tx_ring->br_mtx);
/* if not set, then don't free */
if (q_params->desc_base == NULL)
return;
al_dma_free_coherent(q_params->desc_phy_base_tag,
q_params->desc_phy_base_map, q_params->desc_base);
q_params->desc_base = NULL;
}
/*
* al_eth_free_all_tx_resources - Free Tx Resources for All Queues
* @adapter: board private structure
*
* Free all transmit software resources
*/
static void
al_eth_free_all_tx_resources(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++)
if (adapter->tx_ring[i].q_params.desc_base)
al_eth_free_tx_resources(adapter, i);
}
/*
* al_eth_setup_rx_resources - allocate Rx resources (Descriptors)
* @adapter: network interface device structure
* @qid: queue index
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_rx_resources(struct al_eth_adapter *adapter, unsigned int qid)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
struct device *dev = rx_ring->dev;
struct al_udma_q_params *q_params = &rx_ring->q_params;
int size;
int ret;
size = sizeof(struct al_eth_rx_buffer) * rx_ring->sw_count;
/* alloc extra element so in rx path we can always prefetch rx_info + 1 */
size += 1;
rx_ring->rx_buffer_info = malloc(size, M_IFAL, M_ZERO | M_WAITOK);
if (rx_ring->rx_buffer_info == NULL)
return (ENOMEM);
rx_ring->descs_size = rx_ring->hw_count * sizeof(union al_udma_desc);
q_params->size = rx_ring->hw_count;
ret = al_dma_alloc_coherent(dev, &q_params->desc_phy_base_tag,
&q_params->desc_phy_base_map,
(bus_addr_t *)&q_params->desc_phy_base,
(void**)&q_params->desc_base, rx_ring->descs_size);
if ((q_params->desc_base == NULL) || (ret != 0))
return (ENOMEM);
/* size in bytes of the udma completion ring descriptor */
q_params->cdesc_size = 16;
rx_ring->cdescs_size = rx_ring->hw_count * q_params->cdesc_size;
ret = al_dma_alloc_coherent(dev, &q_params->cdesc_phy_base_tag,
&q_params->cdesc_phy_base_map,
(bus_addr_t *)&q_params->cdesc_phy_base,
(void**)&q_params->cdesc_base, rx_ring->cdescs_size);
if ((q_params->cdesc_base == NULL) || (ret != 0))
return (ENOMEM);
/* Allocate taskqueues */
TASK_INIT(&rx_ring->enqueue_task, 0, al_eth_rx_recv_work, rx_ring);
rx_ring->enqueue_tq = taskqueue_create_fast("al_rx_enque", M_NOWAIT,
taskqueue_thread_enqueue, &rx_ring->enqueue_tq);
taskqueue_start_threads(&rx_ring->enqueue_tq, 1, PI_NET, "%s rxeq",
device_get_nameunit(adapter->dev));
/* Setup DMA descriptor areas. */
ret = bus_dma_tag_create(bus_get_dma_tag(dev),
1, 0, /* alignment, bounds */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
AL_TSO_SIZE, /* maxsize */
1, /* nsegments */
AL_TSO_SIZE, /* maxsegsize */
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
&rx_ring->dma_buf_tag);
if (ret != 0) {
device_printf(dev,"Unable to allocate RX dma_buf_tag\n");
return (ret);
}
for (size = 0; size < rx_ring->sw_count; size++) {
ret = bus_dmamap_create(rx_ring->dma_buf_tag, 0,
&rx_ring->rx_buffer_info[size].dma_map);
if (ret != 0) {
device_printf(dev,"Unable to map DMA RX buffer memory\n");
return (ret);
}
}
/* Zero out the descriptor ring */
memset(q_params->cdesc_base, 0, rx_ring->cdescs_size);
/* Create LRO for the ring */
if ((adapter->netdev->if_capenable & IFCAP_LRO) != 0) {
int err = tcp_lro_init(&rx_ring->lro);
if (err != 0) {
device_printf(adapter->dev,
"LRO[%d] Initialization failed!\n", qid);
} else {
device_printf_dbg(adapter->dev,
"RX Soft LRO[%d] Initialized\n", qid);
rx_ring->lro_enabled = TRUE;
rx_ring->lro.ifp = adapter->netdev;
}
}
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
return (0);
}
/*
* al_eth_free_rx_resources - Free Rx Resources
* @adapter: network interface device structure
* @qid: queue index
*
* Free all receive software resources
*/
static void
al_eth_free_rx_resources(struct al_eth_adapter *adapter, unsigned int qid)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
struct al_udma_q_params *q_params = &rx_ring->q_params;
int size;
/* At this point interrupt handlers must be deactivated */
while (taskqueue_cancel(rx_ring->enqueue_tq,
&rx_ring->enqueue_task, NULL)) {
taskqueue_drain(rx_ring->enqueue_tq, &rx_ring->enqueue_task);
}
taskqueue_free(rx_ring->enqueue_tq);
for (size = 0; size < rx_ring->sw_count; size++) {
m_freem(rx_ring->rx_buffer_info[size].m);
rx_ring->rx_buffer_info[size].m = NULL;
bus_dmamap_unload(rx_ring->dma_buf_tag,
rx_ring->rx_buffer_info[size].dma_map);
bus_dmamap_destroy(rx_ring->dma_buf_tag,
rx_ring->rx_buffer_info[size].dma_map);
}
bus_dma_tag_destroy(rx_ring->dma_buf_tag);
free(rx_ring->rx_buffer_info, M_IFAL);
rx_ring->rx_buffer_info = NULL;
/* if not set, then don't free */
if (q_params->desc_base == NULL)
return;
al_dma_free_coherent(q_params->desc_phy_base_tag,
q_params->desc_phy_base_map, q_params->desc_base);
q_params->desc_base = NULL;
/* if not set, then don't free */
if (q_params->cdesc_base == NULL)
return;
al_dma_free_coherent(q_params->cdesc_phy_base_tag,
q_params->cdesc_phy_base_map, q_params->cdesc_base);
q_params->cdesc_phy_base = 0;
/* Free LRO resources */
tcp_lro_free(&rx_ring->lro);
}
/*
* al_eth_free_all_rx_resources - Free Rx Resources for All Queues
* @adapter: board private structure
*
* Free all receive software resources
*/
static void
al_eth_free_all_rx_resources(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_rx_queues; i++)
if (adapter->rx_ring[i].q_params.desc_base != 0)
al_eth_free_rx_resources(adapter, i);
}
/*
* al_eth_setup_all_rx_resources - allocate all queues Rx resources
* @adapter: board private structure
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_all_rx_resources(struct al_eth_adapter *adapter)
{
int i, rc = 0;
for (i = 0; i < adapter->num_rx_queues; i++) {
rc = al_eth_setup_rx_resources(adapter, i);
if (rc == 0)
continue;
device_printf(adapter->dev, "Allocation for Rx Queue %u failed\n", i);
goto err_setup_rx;
}
return (0);
err_setup_rx:
/* rewind the index freeing the rings as we go */
while (i--)
al_eth_free_rx_resources(adapter, i);
return (rc);
}
/*
* al_eth_setup_all_tx_resources - allocate all queues Tx resources
* @adapter: private structure
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_all_tx_resources(struct al_eth_adapter *adapter)
{
int i, rc = 0;
for (i = 0; i < adapter->num_tx_queues; i++) {
rc = al_eth_setup_tx_resources(adapter, i);
if (rc == 0)
continue;
device_printf(adapter->dev,
"Allocation for Tx Queue %u failed\n", i);
goto err_setup_tx;
}
return (0);
err_setup_tx:
/* rewind the index freeing the rings as we go */
while (i--)
al_eth_free_tx_resources(adapter, i);
return (rc);
}
static void
al_eth_disable_int_sync(struct al_eth_adapter *adapter)
{
/* disable forwarding interrupts from eth through pci end point */
if ((adapter->board_type == ALPINE_FPGA_NIC) ||
(adapter->board_type == ALPINE_NIC)) {
al_eth_forward_int_config((uint32_t*)adapter->internal_pcie_base +
AL_REG_OFFSET_FORWARD_INTR, AL_DIS_FORWARD_INTR);
}
/* mask hw interrupts */
al_eth_interrupts_mask(adapter);
}
static void
al_eth_interrupts_unmask(struct al_eth_adapter *adapter)
{
uint32_t group_a_mask = AL_INT_GROUP_A_GROUP_D_SUM; /* enable group D summary */
uint32_t group_b_mask = (1 << adapter->num_rx_queues) - 1; /* bit per Rx q */
uint32_t group_c_mask = (1 << adapter->num_tx_queues) - 1; /* bit per Tx q */
uint32_t group_d_mask = 3 << 8;
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
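/* In legacy (INTx) mode the group B/C/D interrupts are summarized into group A. */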
if (adapter->int_mode == AL_IOFIC_MODE_LEGACY)
group_a_mask |= AL_INT_GROUP_A_GROUP_B_SUM |
AL_INT_GROUP_A_GROUP_C_SUM |
AL_INT_GROUP_A_GROUP_D_SUM;
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A, group_a_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, group_b_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, group_c_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_D, group_d_mask);
}
static void
al_eth_interrupts_mask(struct al_eth_adapter *adapter)
{
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
/* mask all interrupts */
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A, AL_MASK_GROUP_A_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, AL_MASK_GROUP_B_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, AL_MASK_GROUP_C_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_D, AL_MASK_GROUP_D_INT);
}
static int
al_eth_configure_int_mode(struct al_eth_adapter *adapter)
{
enum al_iofic_mode int_mode;
uint32_t m2s_errors_disable = AL_M2S_MASK_INIT;
uint32_t m2s_aborts_disable = AL_M2S_MASK_INIT;
uint32_t s2m_errors_disable = AL_S2M_MASK_INIT;
uint32_t s2m_aborts_disable = AL_S2M_MASK_INIT;
/* single INTX mode */
if (adapter->msix_vecs == 0)
int_mode = AL_IOFIC_MODE_LEGACY;
else if (adapter->msix_vecs > 1)
int_mode = AL_IOFIC_MODE_MSIX_PER_Q;
else {
device_printf(adapter->dev,
"udma doesn't support single MSI-X mode yet.\n");
return (EIO);
}
if (adapter->board_type != ALPINE_INTEGRATED) {
m2s_errors_disable |= AL_M2S_S2M_MASK_NOT_INT;
m2s_aborts_disable |= AL_M2S_S2M_MASK_NOT_INT;
s2m_errors_disable |= AL_M2S_S2M_MASK_NOT_INT;
s2m_aborts_disable |= AL_M2S_S2M_MASK_NOT_INT;
}
if (al_udma_iofic_config((struct unit_regs __iomem *)adapter->udma_base,
int_mode, m2s_errors_disable, m2s_aborts_disable,
s2m_errors_disable, s2m_aborts_disable)) {
device_printf(adapter->dev,
"al_udma_unit_int_config failed!.\n");
return (EIO);
}
adapter->int_mode = int_mode;
device_printf_dbg(adapter->dev, "using %s interrupt mode\n",
int_mode == AL_IOFIC_MODE_LEGACY ? "INTx" :
int_mode == AL_IOFIC_MODE_MSIX_PER_Q ? "MSI-X per Queue" : "Unknown");
/* set interrupt moderation resolution to 15us */
al_iofic_moder_res_config(&((struct unit_regs *)(adapter->udma_base))->gen.interrupt_regs.main_iofic, AL_INT_GROUP_B, 15);
al_iofic_moder_res_config(&((struct unit_regs *)(adapter->udma_base))->gen.interrupt_regs.main_iofic, AL_INT_GROUP_C, 15);
/* by default interrupt coalescing is disabled */
adapter->tx_usecs = 0;
adapter->rx_usecs = 0;
return (0);
}
/*
* ethtool_rxfh_indir_default - get default value for RX flow hash indirection
* @index: Index in RX flow hash indirection table
* @n_rx_rings: Number of RX rings to use
*
* This function provides the default policy for RX flow hash indirection.
*/
static inline uint32_t
ethtool_rxfh_indir_default(uint32_t index, uint32_t n_rx_rings)
{
return (index % n_rx_rings);
}
static void*
al_eth_update_stats(struct al_eth_adapter *adapter)
{
struct al_eth_mac_stats *mac_stats = &adapter->mac_stats;
if (adapter->up == 0)
return (NULL);
al_eth_mac_stats_get(&adapter->hal_adapter, mac_stats);
return (NULL);
}
static uint64_t
al_get_counter(struct ifnet *ifp, ift_counter cnt)
{
struct al_eth_adapter *adapter;
struct al_eth_mac_stats *mac_stats;
uint64_t rv;
adapter = if_getsoftc(ifp);
mac_stats = &adapter->mac_stats;
switch (cnt) {
case IFCOUNTER_IPACKETS:
return (mac_stats->aFramesReceivedOK); /* including pause frames */
case IFCOUNTER_OPACKETS:
return (mac_stats->aFramesTransmittedOK);
case IFCOUNTER_IBYTES:
return (mac_stats->aOctetsReceivedOK);
case IFCOUNTER_OBYTES:
return (mac_stats->aOctetsTransmittedOK);
case IFCOUNTER_IMCASTS:
return (mac_stats->ifInMulticastPkts);
case IFCOUNTER_OMCASTS:
return (mac_stats->ifOutMulticastPkts);
case IFCOUNTER_COLLISIONS:
return (0);
case IFCOUNTER_IQDROPS:
return (mac_stats->etherStatsDropEvents);
case IFCOUNTER_IERRORS:
rv = mac_stats->ifInErrors +
mac_stats->etherStatsUndersizePkts + /* good but short */
mac_stats->etherStatsFragments + /* short and bad*/
mac_stats->etherStatsJabbers + /* with crc errors */
mac_stats->etherStatsOversizePkts +
mac_stats->aFrameCheckSequenceErrors +
mac_stats->aAlignmentErrors;
return (rv);
case IFCOUNTER_OERRORS:
return (mac_stats->ifOutErrors);
default:
return (if_get_counter_default(ifp, cnt));
}
}
/*
* Unicast, Multicast and Promiscuous mode set
*
* The set_rx_mode entry point is called whenever the unicast or multicast
* address lists or the network interface flags are updated. This routine is
* responsible for configuring the hardware for proper unicast, multicast,
* promiscuous mode, and all-multi behavior.
*/
#define MAX_NUM_MULTICAST_ADDRESSES 32
#define MAX_NUM_ADDRESSES 32
static void
al_eth_set_rx_mode(struct al_eth_adapter *adapter)
{
struct ifnet *ifp = adapter->netdev;
struct ifmultiaddr *ifma; /* multicast addresses configured */
struct ifaddr *ifua; /* unicast address */
int mc = 0;
int uc = 0;
uint8_t i;
unsigned char *mac;
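/* Count the configured link-level multicast and unicast addresses. */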
if_maddr_rlock(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_LINK)
continue;
if (mc == MAX_NUM_MULTICAST_ADDRESSES)
break;
mac = LLADDR((struct sockaddr_dl *) ifma->ifma_addr);
/* default mc address inside mac address */
if (mac[3] != 0 && mac[4] != 0 && mac[5] != 1)
mc++;
}
if_maddr_runlock(ifp);
if_addr_rlock(ifp);
TAILQ_FOREACH(ifua, &ifp->if_addrhead, ifa_link) {
if (ifua->ifa_addr->sa_family != AF_LINK)
continue;
if (uc == MAX_NUM_ADDRESSES)
break;
uc++;
}
if_addr_runlock(ifp);
if ((ifp->if_flags & IFF_PROMISC) != 0) {
al_eth_mac_table_promiscuous_set(adapter, true);
} else {
if ((ifp->if_flags & IFF_ALLMULTI) != 0) {
/* This interface is in all-multicasts mode (used by multicast routers). */
al_eth_mac_table_all_multicast_add(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX, 1);
} else {
if (mc == 0) {
al_eth_mac_table_entry_clear(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX);
} else {
al_eth_mac_table_all_multicast_add(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX, 1);
}
}
if (uc != 0) {
i = AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + 1;
if (uc > AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT) {
/*
* There are more addresses than entries in the
* MAC table - fall back to promiscuous mode.
*/
al_eth_mac_table_promiscuous_set(adapter, true);
return;
}
/* clear the last configuration */
while (i < (AL_ETH_MAC_TABLE_UNICAST_IDX_BASE +
AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT)) {
al_eth_mac_table_entry_clear(adapter, i);
i++;
}
/* set new addresses */
i = AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + 1;
if_addr_rlock(ifp);
TAILQ_FOREACH(ifua, &ifp->if_addrhead, ifa_link) {
if (ifua->ifa_addr->sa_family != AF_LINK) {
continue;
}
al_eth_mac_table_unicast_add(adapter, i,
(unsigned char *)ifua->ifa_addr, 1);
i++;
}
if_addr_runlock(ifp);
}
al_eth_mac_table_promiscuous_set(adapter, false);
}
}
static void
al_eth_config_rx_fwd(struct al_eth_adapter *adapter)
{
struct al_eth_fwd_ctrl_table_entry entry;
int i;
/* let priority be equal to pbits */
for (i = 0; i < AL_ETH_FWD_PBITS_TABLE_NUM; i++)
al_eth_fwd_pbits_table_set(&adapter->hal_adapter, i, i);
/* map priority to queue index, queue id = priority/2 */
for (i = 0; i < AL_ETH_FWD_PRIO_TABLE_NUM; i++)
al_eth_fwd_priority_table_set(&adapter->hal_adapter, i, i >> 1);
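/* Default control table entry: queue selected by the Toeplitz hash table, UDMA selected by the MAC table. */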
entry.prio_sel = AL_ETH_CTRL_TABLE_PRIO_SEL_VAL_0;
entry.queue_sel_1 = AL_ETH_CTRL_TABLE_QUEUE_SEL_1_THASH_TABLE;
entry.queue_sel_2 = AL_ETH_CTRL_TABLE_QUEUE_SEL_2_NO_PRIO;
entry.udma_sel = AL_ETH_CTRL_TABLE_UDMA_SEL_MAC_TABLE;
entry.filter = FALSE;
al_eth_ctrl_table_def_set(&adapter->hal_adapter, FALSE, &entry);
/*
* By default, set the MAC table to forward all unicast packets destined
* to our MAC address and all broadcast packets. Everything else is
* dropped.
*/
al_eth_mac_table_unicast_add(adapter, AL_ETH_MAC_TABLE_UNICAST_IDX_BASE,
adapter->mac_addr, 1);
al_eth_mac_table_broadcast_add(adapter, AL_ETH_MAC_TABLE_BROADCAST_IDX, 1);
al_eth_mac_table_promiscuous_set(adapter, false);
/* set toeplitz hash keys */
for (i = 0; i < sizeof(adapter->toeplitz_hash_key); i++)
*((uint8_t*)adapter->toeplitz_hash_key + i) = (uint8_t)random();
for (i = 0; i < AL_ETH_RX_HASH_KEY_NUM; i++)
al_eth_hash_key_set(&adapter->hal_adapter, i,
htonl(adapter->toeplitz_hash_key[i]));
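/* Fill the RSS indirection table with the default round-robin queue mapping. */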
for (i = 0; i < AL_ETH_RX_RSS_TABLE_SIZE; i++) {
adapter->rss_ind_tbl[i] = ethtool_rxfh_indir_default(i,
AL_ETH_NUM_QUEUES);
al_eth_set_thash_table_entry(adapter, i, 0,
adapter->rss_ind_tbl[i]);
}
al_eth_fsm_table_init(adapter);
}
static void
al_eth_req_rx_buff_size(struct al_eth_adapter *adapter, int size)
{
/*
* Determine the correct mbuf pool for the requested frame size
* (including jumbo frames), trying from the smallest size up to
* the maximum supported.
*/
adapter->rx_mbuf_sz = MCLBYTES;
if (size > 2048) {
if (adapter->max_rx_buff_alloc_size > 2048)
adapter->rx_mbuf_sz = MJUMPAGESIZE;
else
return;
}
if (size > 4096) {
if (adapter->max_rx_buff_alloc_size > 4096)
adapter->rx_mbuf_sz = MJUM9BYTES;
else
return;
}
if (size > 9216) {
if (adapter->max_rx_buff_alloc_size > 9216)
adapter->rx_mbuf_sz = MJUM16BYTES;
else
return;
}
}
static int
al_eth_change_mtu(struct al_eth_adapter *adapter, int new_mtu)
{
int max_frame = new_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN +
ETHER_VLAN_ENCAP_LEN;
al_eth_req_rx_buff_size(adapter, new_mtu);
device_printf_dbg(adapter->dev, "set MTU to %d\n", new_mtu);
al_eth_rx_pkt_limit_config(&adapter->hal_adapter,
AL_ETH_MIN_FRAME_LEN, max_frame);
al_eth_tso_mss_config(&adapter->hal_adapter, 0, new_mtu - 100);
return (0);
}
static int
al_eth_check_mtu(struct al_eth_adapter *adapter, int new_mtu)
{
int max_frame = new_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN;
if ((new_mtu < AL_ETH_MIN_FRAME_LEN) ||
(max_frame > AL_ETH_MAX_FRAME_LEN)) {
return (EINVAL);
}
return (0);
}
static int
al_eth_udma_queue_enable(struct al_eth_adapter *adapter, enum al_udma_type type,
int qid)
{
int rc = 0;
char *name = (type == UDMA_TX) ? "Tx" : "Rx";
struct al_udma_q_params *q_params;
if (type == UDMA_TX)
q_params = &adapter->tx_ring[qid].q_params;
else
q_params = &adapter->rx_ring[qid].q_params;
rc = al_eth_queue_config(&adapter->hal_adapter, type, qid, q_params);
if (rc < 0) {
device_printf(adapter->dev, "config %s queue %u failed\n", name,
qid);
return (rc);
}
return (rc);
}
static int
al_eth_udma_queues_enable_all(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++)
al_eth_udma_queue_enable(adapter, UDMA_TX, i);
for (i = 0; i < adapter->num_rx_queues; i++)
al_eth_udma_queue_enable(adapter, UDMA_RX, i);
return (0);
}
static void
al_eth_up_complete(struct al_eth_adapter *adapter)
{
al_eth_configure_int_mode(adapter);
al_eth_config_rx_fwd(adapter);
al_eth_change_mtu(adapter, adapter->netdev->if_mtu);
al_eth_udma_queues_enable_all(adapter);
al_eth_refill_all_rx_bufs(adapter);
al_eth_interrupts_unmask(adapter);
/* enable forwarding interrupts from eth through pci end point */
if ((adapter->board_type == ALPINE_FPGA_NIC) ||
(adapter->board_type == ALPINE_NIC)) {
al_eth_forward_int_config((uint32_t*)adapter->internal_pcie_base +
AL_REG_OFFSET_FORWARD_INTR, AL_EN_FORWARD_INTR);
}
al_eth_flow_ctrl_enable(adapter);
mtx_lock(&adapter->stats_mtx);
callout_reset(&adapter->stats_callout, hz, al_tick_stats, (void*)adapter);
mtx_unlock(&adapter->stats_mtx);
al_eth_mac_start(&adapter->hal_adapter);
}
static int
al_media_update(struct ifnet *ifp)
{
struct al_eth_adapter *adapter = ifp->if_softc;
if ((ifp->if_flags & IFF_UP) != 0)
mii_mediachg(adapter->mii);
return (0);
}
static void
al_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
struct al_eth_adapter *sc = ifp->if_softc;
struct mii_data *mii;
if (sc->mii == NULL) {
ifmr->ifm_active = IFM_ETHER | IFM_NONE;
ifmr->ifm_status = 0;
return;
}
mii = sc->mii;
mii_pollstat(mii);
ifmr->ifm_active = mii->mii_media_active;
ifmr->ifm_status = mii->mii_media_status;
}
static void
al_tick(void *arg)
{
struct al_eth_adapter *adapter = arg;
mii_tick(adapter->mii);
/* Schedule another timeout one second from now */
callout_schedule(&adapter->wd_callout, hz);
}
static void
al_tick_stats(void *arg)
{
struct al_eth_adapter *adapter = arg;
al_eth_update_stats(adapter);
callout_schedule(&adapter->stats_callout, hz);
}
static int
al_eth_up(struct al_eth_adapter *adapter)
{
struct ifnet *ifp = adapter->netdev;
int rc;
if (adapter->up)
return (0);
if ((adapter->flags & AL_ETH_FLAG_RESET_REQUESTED) != 0) {
al_eth_function_reset(adapter);
adapter->flags &= ~AL_ETH_FLAG_RESET_REQUESTED;
}
ifp->if_hwassist = 0;
if ((ifp->if_capenable & IFCAP_TSO) != 0)
ifp->if_hwassist |= CSUM_TSO;
if ((ifp->if_capenable & IFCAP_TXCSUM) != 0)
ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
if ((ifp->if_capenable & IFCAP_TXCSUM_IPV6) != 0)
ifp->if_hwassist |= (CSUM_TCP_IPV6 | CSUM_UDP_IPV6);
al_eth_serdes_init(adapter);
rc = al_eth_hw_init(adapter);
if (rc != 0)
goto err_hw_init_open;
rc = al_eth_setup_int_mode(adapter);
if (rc != 0) {
device_printf(adapter->dev,
"%s failed at setup interrupt mode!\n", __func__);
goto err_setup_int;
}
/* allocate transmit descriptors */
rc = al_eth_setup_all_tx_resources(adapter);
if (rc != 0)
goto err_setup_tx;
/* allocate receive descriptors */
rc = al_eth_setup_all_rx_resources(adapter);
if (rc != 0)
goto err_setup_rx;
rc = al_eth_request_irq(adapter);
if (rc != 0)
goto err_req_irq;
al_eth_up_complete(adapter);
adapter->up = true;
if (adapter->mac_mode == AL_ETH_MAC_MODE_10GbE_Serial)
adapter->netdev->if_link_state = LINK_STATE_UP;
if (adapter->mac_mode == AL_ETH_MAC_MODE_RGMII) {
mii_mediachg(adapter->mii);
/* Schedule watchdog timeout */
mtx_lock(&adapter->wd_mtx);
callout_reset(&adapter->wd_callout, hz, al_tick, adapter);
mtx_unlock(&adapter->wd_mtx);
mii_pollstat(adapter->mii);
}
return (rc);
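/* Error unwind: release resources in the reverse order of allocation. */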
err_req_irq:
al_eth_free_all_rx_resources(adapter);
err_setup_rx:
al_eth_free_all_tx_resources(adapter);
err_setup_tx:
al_eth_free_irq(adapter);
err_setup_int:
al_eth_hw_stop(adapter);
err_hw_init_open:
al_eth_function_reset(adapter);
return (rc);
}
static int
al_shutdown(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
al_eth_down(adapter);
return (0);
}
static void
al_eth_down(struct al_eth_adapter *adapter)
{
device_printf_dbg(adapter->dev, "al_eth_down: begin\n");
adapter->up = false;
mtx_lock(&adapter->wd_mtx);
callout_stop(&adapter->wd_callout);
mtx_unlock(&adapter->wd_mtx);
al_eth_disable_int_sync(adapter);
mtx_lock(&adapter->stats_mtx);
callout_stop(&adapter->stats_callout);
mtx_unlock(&adapter->stats_mtx);
al_eth_free_irq(adapter);
al_eth_hw_stop(adapter);
al_eth_free_all_tx_resources(adapter);
al_eth_free_all_rx_resources(adapter);
}
static int
al_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct al_eth_adapter *adapter = ifp->if_softc;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (command) {
case SIOCSIFMTU:
{
error = al_eth_check_mtu(adapter, ifr->ifr_mtu);
if (error != 0) {
device_printf(adapter->dev, "ioctl wrong mtu %u\n",
adapter->netdev->if_mtu);
break;
}
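/* Mark the interface as not running, apply the new MTU and reinitialize. */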
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
adapter->netdev->if_mtu = ifr->ifr_mtu;
al_init(adapter);
break;
}
case SIOCSIFFLAGS:
if ((ifp->if_flags & IFF_UP) != 0) {
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
if (((ifp->if_flags ^ adapter->if_flags) &
(IFF_PROMISC | IFF_ALLMULTI)) != 0) {
device_printf_dbg(adapter->dev,
"ioctl promisc/allmulti\n");
al_eth_set_rx_mode(adapter);
}
} else {
error = al_eth_up(adapter);
if (error == 0)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
} else {
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
al_eth_down(adapter);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
}
adapter->if_flags = ifp->if_flags;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
device_printf_dbg(adapter->dev,
"ioctl add/del multi before\n");
al_eth_set_rx_mode(adapter);
#ifdef DEVICE_POLLING
if ((ifp->if_capenable & IFCAP_POLLING) == 0)
#endif
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
if (adapter->mii != NULL)
error = ifmedia_ioctl(ifp, ifr,
&adapter->mii->mii_media, command);
else
error = ifmedia_ioctl(ifp, ifr,
&adapter->media, command);
break;
case SIOCSIFCAP:
{
int mask, reinit;
reinit = 0;
mask = ifr->ifr_reqcap ^ ifp->if_capenable;
#ifdef DEVICE_POLLING
if ((mask & IFCAP_POLLING) != 0) {
if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0) {
if (error != 0)
return (error);
ifp->if_capenable |= IFCAP_POLLING;
} else {
error = ether_poll_deregister(ifp);
/* Enable interrupt even in error case */
ifp->if_capenable &= ~IFCAP_POLLING;
}
}
#endif
if ((mask & IFCAP_HWCSUM) != 0) {
/* apply to both rx and tx */
ifp->if_capenable ^= IFCAP_HWCSUM;
reinit = 1;
}
if ((mask & IFCAP_HWCSUM_IPV6) != 0) {
ifp->if_capenable ^= IFCAP_HWCSUM_IPV6;
reinit = 1;
}
if ((mask & IFCAP_TSO) != 0) {
ifp->if_capenable ^= IFCAP_TSO;
reinit = 1;
}
if ((mask & IFCAP_LRO) != 0) {
ifp->if_capenable ^= IFCAP_LRO;
}
if ((mask & IFCAP_VLAN_HWTAGGING) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
reinit = 1;
}
if ((mask & IFCAP_VLAN_HWFILTER) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
reinit = 1;
}
if ((mask & IFCAP_VLAN_HWTSO) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
reinit = 1;
}
if ((reinit != 0) &&
((ifp->if_drv_flags & IFF_DRV_RUNNING)) != 0)
{
al_init(adapter);
}
break;
}
default:
error = ether_ioctl(ifp, command, data);
break;
}
return (error);
}
static int
al_is_device_supported(device_t dev)
{
uint16_t pci_vendor_id = pci_get_vendor(dev);
uint16_t pci_device_id = pci_get_device(dev);
return (pci_vendor_id == PCI_VENDOR_ID_ANNAPURNA_LABS &&
(pci_device_id == PCI_DEVICE_ID_AL_ETH ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_ADVANCED ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_NIC ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_FPGA_NIC));
}
/* Time in msec to keep retrying MDIO reads/writes in case of error */
#define MDIO_TIMEOUT_MSEC 100
#define MDIO_PAUSE_MSEC 10
static int
al_miibus_readreg(device_t dev, int phy, int reg)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
uint16_t value = 0;
int rc;
int timeout = MDIO_TIMEOUT_MSEC;
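/* Retry the MDIO read for up to MDIO_TIMEOUT_MSEC, pausing MDIO_PAUSE_MSEC between attempts. */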
while (timeout > 0) {
rc = al_eth_mdio_read(&adapter->hal_adapter, adapter->phy_addr,
-1, reg, &value);
if (rc == 0)
return (value);
device_printf_dbg(adapter->dev,
"mdio read failed. try again in 10 msec\n");
timeout -= MDIO_PAUSE_MSEC;
pause("readred pause", MDIO_PAUSE_MSEC);
}
if (rc != 0)
device_printf(adapter->dev, "MDIO read failed on timeout\n");
return (value);
}
static int
al_miibus_writereg(device_t dev, int phy, int reg, int value)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
int rc;
int timeout = MDIO_TIMEOUT_MSEC;
while (timeout > 0) {
rc = al_eth_mdio_write(&adapter->hal_adapter, adapter->phy_addr,
-1, reg, value);
if (rc == 0)
return (0);
device_printf(adapter->dev,
"mdio write failed. try again in 10 msec\n");
timeout -= MDIO_PAUSE_MSEC;
pause("miibus writereg", MDIO_PAUSE_MSEC);
}
if (rc != 0)
device_printf(adapter->dev, "MDIO write failed on timeout\n");
return (rc);
}
static void
al_miibus_statchg(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
device_printf_dbg(adapter->dev,
"al_miibus_statchg: state has changed!\n");
device_printf_dbg(adapter->dev,
"al_miibus_statchg: active = 0x%x status = 0x%x\n",
adapter->mii->mii_media_active, adapter->mii->mii_media_status);
if (adapter->up == 0)
return;
if ((adapter->mii->mii_media_status & IFM_AVALID) != 0) {
if (adapter->mii->mii_media_status & IFM_ACTIVE) {
device_printf(adapter->dev, "link is UP\n");
adapter->netdev->if_link_state = LINK_STATE_UP;
} else {
device_printf(adapter->dev, "link is DOWN\n");
adapter->netdev->if_link_state = LINK_STATE_DOWN;
}
}
}
static void
al_miibus_linkchg(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
uint8_t duplex = 0;
uint8_t speed = 0;
if (adapter->mii == NULL)
return;
if ((adapter->netdev->if_flags & IFF_UP) == 0)
return;
/* Ignore link changes when link is not ready */
if ((adapter->mii->mii_media_status & (IFM_AVALID | IFM_ACTIVE)) !=
(IFM_AVALID | IFM_ACTIVE)) {
return;
}
if ((adapter->mii->mii_media_active & IFM_FDX) != 0)
duplex = 1;
speed = IFM_SUBTYPE(adapter->mii->mii_media_active);
if (speed == IFM_10_T) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_10BASE_T_SPEED, duplex);
return;
}
if (speed == IFM_100_TX) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_100BASE_TX_SPEED, duplex);
return;
}
if (speed == IFM_1000_T) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_1000BASE_T_SPEED, duplex);
return;
}
device_printf(adapter->dev, "ERROR: unknown MII media active 0x%08x\n",
adapter->mii->mii_media_active);
}
Index: head/sys/dev/axgbe/xgbe-drv.c
===================================================================
--- head/sys/dev/axgbe/xgbe-drv.c (revision 327172)
+++ head/sys/dev/axgbe/xgbe-drv.c (revision 327173)
@@ -1,1079 +1,1076 @@
/*
* AMD 10Gb Ethernet driver
*
* This file is available to you under your choice of the following two
* licenses:
*
* License 1: GPLv2
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
*
* This file is free software; you may copy, redistribute and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or (at
* your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*
*
* License 2: Modified BSD
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Advanced Micro Devices, Inc. nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include "xgbe.h"
#include "xgbe-common.h"
static int xgbe_one_poll(struct xgbe_channel *channel, int budget);
static int xgbe_all_poll(struct xgbe_prv_data *pdata, int budget);
static int xgbe_alloc_channels(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel_mem, *channel;
struct xgbe_ring *tx_ring, *rx_ring;
unsigned int count, i;
int ret = -ENOMEM;
count = max_t(unsigned int, pdata->tx_ring_count, pdata->rx_ring_count);
channel_mem = malloc(count * sizeof(struct xgbe_channel), M_AXGBE,
M_WAITOK | M_ZERO);
tx_ring = malloc(pdata->tx_ring_count * sizeof(struct xgbe_ring),
M_AXGBE, M_WAITOK | M_ZERO);
rx_ring = malloc(pdata->rx_ring_count * sizeof(struct xgbe_ring),
M_AXGBE, M_WAITOK | M_ZERO);
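/*
* Initialize each channel and hand the Tx/Rx rings out to the first
* tx_ring_count/rx_ring_count channels respectively.
*/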
for (i = 0, channel = channel_mem; i < count; i++, channel++) {
snprintf(channel->name, sizeof(channel->name), "channel-%d", i);
channel->pdata = pdata;
channel->queue_index = i;
channel->dma_tag = rman_get_bustag(pdata->xgmac_res);
bus_space_subregion(channel->dma_tag,
rman_get_bushandle(pdata->xgmac_res),
DMA_CH_BASE + (DMA_CH_INC * i), DMA_CH_INC,
&channel->dma_handle);
if (pdata->per_channel_irq) {
if (pdata->chan_irq_res[i] == NULL)
goto err_irq;
channel->dma_irq_res = pdata->chan_irq_res[i];
}
if (i < pdata->tx_ring_count) {
spin_lock_init(&tx_ring->lock);
channel->tx_ring = tx_ring++;
}
if (i < pdata->rx_ring_count) {
spin_lock_init(&rx_ring->lock);
channel->rx_ring = rx_ring++;
}
}
pdata->channel = channel_mem;
pdata->channel_count = count;
return 0;
err_irq:
free(rx_ring, M_AXGBE);
free(tx_ring, M_AXGBE);
free(channel_mem, M_AXGBE);
return ret;
}
static void xgbe_free_channels(struct xgbe_prv_data *pdata)
{
if (!pdata->channel)
return;
free(pdata->channel->rx_ring, M_AXGBE);
free(pdata->channel->tx_ring, M_AXGBE);
free(pdata->channel, M_AXGBE);
pdata->channel = NULL;
pdata->channel_count = 0;
}
static inline unsigned int xgbe_tx_avail_desc(struct xgbe_ring *ring)
{
return (ring->rdesc_count - (ring->cur - ring->dirty));
}
static inline unsigned int xgbe_rx_dirty_desc(struct xgbe_ring *ring)
{
return (ring->cur - ring->dirty);
}
static int xgbe_maybe_stop_tx_queue(struct xgbe_channel *channel,
struct xgbe_ring *ring, unsigned int count)
{
struct xgbe_prv_data *pdata = channel->pdata;
if (count > xgbe_tx_avail_desc(ring)) {
/* If we haven't notified the hardware because of xmit_more
* support, tell it now
*/
if (ring->tx.xmit_more)
pdata->hw_if.tx_start_xmit(channel, ring);
return EFBIG;
}
return 0;
}
static int xgbe_calc_rx_buf_size(struct ifnet *netdev, unsigned int mtu)
{
unsigned int rx_buf_size;
if (mtu > XGMAC_JUMBO_PACKET_MTU) {
return -EINVAL;
}
rx_buf_size = mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
/* Clamp the computed size between the minimum buffer size and PAGE_SIZE. */
rx_buf_size = MAX(rx_buf_size, XGBE_RX_MIN_BUF_SIZE);
rx_buf_size = MIN(rx_buf_size, PAGE_SIZE);
rx_buf_size = (rx_buf_size + XGBE_RX_BUF_ALIGN - 1) &
~(XGBE_RX_BUF_ALIGN - 1);
return rx_buf_size;
}
static void xgbe_enable_rx_tx_ints(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_channel *channel;
enum xgbe_int int_id;
unsigned int i;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
if (channel->tx_ring && channel->rx_ring)
int_id = XGMAC_INT_DMA_CH_SR_TI_RI;
else if (channel->tx_ring)
int_id = XGMAC_INT_DMA_CH_SR_TI;
else if (channel->rx_ring)
int_id = XGMAC_INT_DMA_CH_SR_RI;
else
continue;
hw_if->enable_int(channel, int_id);
}
}
static void xgbe_isr(void *data)
{
struct xgbe_prv_data *pdata = data;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_channel *channel;
unsigned int dma_isr, dma_ch_isr;
unsigned int mac_isr;
unsigned int i;
/* The DMA interrupt status register also reports MAC and MTL
* interrupts. So for polling mode, we just need to check for
* this register to be non-zero
*/
dma_isr = XGMAC_IOREAD(pdata, DMA_ISR);
if (!dma_isr)
return;
for (i = 0; i < pdata->channel_count; i++) {
if (!(dma_isr & (1 << i)))
continue;
channel = pdata->channel + i;
dma_ch_isr = XGMAC_DMA_IOREAD(channel, DMA_CH_SR);
/* The TI or RI interrupt bits may still be set even if using
* per channel DMA interrupts. Only poll all channels from here
* when per channel DMA interrupts are not in use.
*/
if (!pdata->per_channel_irq &&
(XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, TI) ||
XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, RI))) {
xgbe_all_poll(pdata, 16);
}
if (XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, RBU))
pdata->ext_stats.rx_buffer_unavailable++;
/* Restart the device on a Fatal Bus Error */
if (XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, FBE))
taskqueue_enqueue(taskqueue_thread,
&pdata->restart_work);
/* Clear all interrupt signals */
XGMAC_DMA_IOWRITE(channel, DMA_CH_SR, dma_ch_isr);
}
if (XGMAC_GET_BITS(dma_isr, DMA_ISR, MACIS)) {
mac_isr = XGMAC_IOREAD(pdata, MAC_ISR);
if (XGMAC_GET_BITS(mac_isr, MAC_ISR, MMCTXIS))
hw_if->tx_mmc_int(pdata);
if (XGMAC_GET_BITS(mac_isr, MAC_ISR, MMCRXIS))
hw_if->rx_mmc_int(pdata);
}
}
static void xgbe_dma_isr(void *data)
{
struct xgbe_channel *channel = data;
xgbe_one_poll(channel, 16);
}
static void xgbe_service(void *ctx, int pending)
{
struct xgbe_prv_data *pdata = ctx;
pdata->phy_if.phy_status(pdata);
}
static void xgbe_service_timer(void *data)
{
struct xgbe_prv_data *pdata = data;
DBGPR("--> xgbe_service_timer\n");
taskqueue_enqueue(pdata->dev_workqueue, &pdata->service_work);
callout_reset(&pdata->service_timer, hz, xgbe_service_timer, pdata);
DBGPR("<-- xgbe_service_timer\n");
}
static void xgbe_init_timers(struct xgbe_prv_data *pdata)
{
callout_init(&pdata->service_timer, 1);
}
static void xgbe_start_timers(struct xgbe_prv_data *pdata)
{
callout_reset(&pdata->service_timer, hz, xgbe_service_timer, pdata);
}
static void xgbe_stop_timers(struct xgbe_prv_data *pdata)
{
callout_drain(&pdata->service_timer);
}
void xgbe_get_all_hw_features(struct xgbe_prv_data *pdata)
{
unsigned int mac_hfr0, mac_hfr1, mac_hfr2;
struct xgbe_hw_features *hw_feat = &pdata->hw_feat;
DBGPR("-->xgbe_get_all_hw_features\n");
mac_hfr0 = XGMAC_IOREAD(pdata, MAC_HWF0R);
mac_hfr1 = XGMAC_IOREAD(pdata, MAC_HWF1R);
mac_hfr2 = XGMAC_IOREAD(pdata, MAC_HWF2R);
memset(hw_feat, 0, sizeof(*hw_feat));
hw_feat->version = XGMAC_IOREAD(pdata, MAC_VR);
/* Hardware feature register 0 */
hw_feat->gmii = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, GMIISEL);
hw_feat->vlhash = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, VLHASH);
hw_feat->sma = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, SMASEL);
hw_feat->rwk = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, RWKSEL);
hw_feat->mgk = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, MGKSEL);
hw_feat->mmc = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, MMCSEL);
hw_feat->aoe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, ARPOFFSEL);
hw_feat->ts = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TSSEL);
hw_feat->eee = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, EEESEL);
hw_feat->tx_coe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TXCOESEL);
hw_feat->rx_coe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, RXCOESEL);
hw_feat->addn_mac = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R,
ADDMACADRSEL);
hw_feat->ts_src = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TSSTSSEL);
hw_feat->sa_vlan_ins = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, SAVLANINS);
/* Hardware feature register 1 */
hw_feat->rx_fifo_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
RXFIFOSIZE);
hw_feat->tx_fifo_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
TXFIFOSIZE);
hw_feat->adv_ts_hi = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, ADVTHWORD);
hw_feat->dma_width = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, ADDR64);
hw_feat->dcb = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, DCBEN);
hw_feat->sph = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, SPHEN);
hw_feat->tso = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, TSOEN);
hw_feat->dma_debug = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, DBGMEMA);
hw_feat->rss = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, RSSEN);
hw_feat->tc_cnt = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, NUMTC);
hw_feat->hash_table_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
HASHTBLSZ);
hw_feat->l3l4_filter_num = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
L3L4FNUM);
/* Hardware feature register 2 */
hw_feat->rx_q_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, RXQCNT);
hw_feat->tx_q_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, TXQCNT);
hw_feat->rx_ch_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, RXCHCNT);
hw_feat->tx_ch_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, TXCHCNT);
hw_feat->pps_out_num = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, PPSOUTNUM);
hw_feat->aux_snap_num = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, AUXSNAPNUM);
/* Translate the Hash Table size into actual number */
switch (hw_feat->hash_table_size) {
case 0:
break;
case 1:
hw_feat->hash_table_size = 64;
break;
case 2:
hw_feat->hash_table_size = 128;
break;
case 3:
hw_feat->hash_table_size = 256;
break;
}
/* Translate the address width setting into actual number */
switch (hw_feat->dma_width) {
case 0:
hw_feat->dma_width = 32;
break;
case 1:
hw_feat->dma_width = 40;
break;
case 2:
hw_feat->dma_width = 48;
break;
default:
hw_feat->dma_width = 32;
}
/* The Queue, Channel and TC counts are zero based so increment them
* to get the actual number
*/
hw_feat->rx_q_cnt++;
hw_feat->tx_q_cnt++;
hw_feat->rx_ch_cnt++;
hw_feat->tx_ch_cnt++;
hw_feat->tc_cnt++;
DBGPR("<--xgbe_get_all_hw_features\n");
}
static int xgbe_request_irqs(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel;
unsigned int i;
int ret;
ret = bus_setup_intr(pdata->dev, pdata->dev_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_isr, pdata,
&pdata->dev_irq_tag);
if (ret) {
return ret;
}
if (!pdata->per_channel_irq)
return 0;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ret = bus_setup_intr(pdata->dev, channel->dma_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_dma_isr, channel,
&channel->dma_irq_tag);
if (ret != 0) {
goto err_irq;
}
}
return 0;
err_irq:
/* Using an unsigned int, 'i' will go to UINT_MAX and exit */
for (i--, channel--; i < pdata->channel_count; i--, channel--)
bus_teardown_intr(pdata->dev, channel->dma_irq_res,
channel->dma_irq_tag);
bus_teardown_intr(pdata->dev, pdata->dev_irq_res, pdata->dev_irq_tag);
return -ret;
}
static void xgbe_free_irqs(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel;
unsigned int i;
bus_teardown_intr(pdata->dev, pdata->dev_irq_res, pdata->dev_irq_tag);
if (!pdata->per_channel_irq)
return;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++)
bus_teardown_intr(pdata->dev, channel->dma_irq_res,
channel->dma_irq_tag);
}
void xgbe_init_tx_coalesce(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
DBGPR("-->xgbe_init_tx_coalesce\n");
pdata->tx_usecs = XGMAC_INIT_DMA_TX_USECS;
pdata->tx_frames = XGMAC_INIT_DMA_TX_FRAMES;
hw_if->config_tx_coalesce(pdata);
DBGPR("<--xgbe_init_tx_coalesce\n");
}
void xgbe_init_rx_coalesce(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
DBGPR("-->xgbe_init_rx_coalesce\n");
pdata->rx_riwt = hw_if->usec_to_riwt(pdata, XGMAC_INIT_DMA_RX_USECS);
pdata->rx_usecs = XGMAC_INIT_DMA_RX_USECS;
pdata->rx_frames = XGMAC_INIT_DMA_RX_FRAMES;
hw_if->config_rx_coalesce(pdata);
DBGPR("<--xgbe_init_rx_coalesce\n");
}
static void xgbe_free_tx_data(struct xgbe_prv_data *pdata)
{
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_ring_data *rdata;
unsigned int i, j;
DBGPR("-->xgbe_free_tx_data\n");
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ring = channel->tx_ring;
if (!ring)
break;
for (j = 0; j < ring->rdesc_count; j++) {
rdata = XGBE_GET_DESC_DATA(ring, j);
desc_if->unmap_rdata(pdata, rdata);
}
}
DBGPR("<--xgbe_free_tx_data\n");
}
static void xgbe_free_rx_data(struct xgbe_prv_data *pdata)
{
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_ring_data *rdata;
unsigned int i, j;
DBGPR("-->xgbe_free_rx_data\n");
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ring = channel->rx_ring;
if (!ring)
break;
for (j = 0; j < ring->rdesc_count; j++) {
rdata = XGBE_GET_DESC_DATA(ring, j);
desc_if->unmap_rdata(pdata, rdata);
}
}
DBGPR("<--xgbe_free_rx_data\n");
}
static int xgbe_phy_init(struct xgbe_prv_data *pdata)
{
pdata->phy_link = -1;
pdata->phy_speed = SPEED_UNKNOWN;
return pdata->phy_if.phy_reset(pdata);
}
static int xgbe_start(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_phy_if *phy_if = &pdata->phy_if;
int ret;
DBGPR("-->xgbe_start\n");
hw_if->init(pdata);
ret = phy_if->phy_start(pdata);
if (ret)
goto err_phy;
ret = xgbe_request_irqs(pdata);
if (ret)
goto err_napi;
hw_if->enable_tx(pdata);
hw_if->enable_rx(pdata);
xgbe_enable_rx_tx_ints(pdata);
xgbe_start_timers(pdata);
taskqueue_enqueue(pdata->dev_workqueue, &pdata->service_work);
DBGPR("<--xgbe_start\n");
return 0;
err_napi:
phy_if->phy_stop(pdata);
err_phy:
hw_if->exit(pdata);
return ret;
}
static void xgbe_stop(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_phy_if *phy_if = &pdata->phy_if;
DBGPR("-->xgbe_stop\n");
xgbe_stop_timers(pdata);
taskqueue_drain_all(pdata->dev_workqueue);
hw_if->disable_tx(pdata);
hw_if->disable_rx(pdata);
xgbe_free_irqs(pdata);
phy_if->phy_stop(pdata);
hw_if->exit(pdata);
DBGPR("<--xgbe_stop\n");
}
static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
{
DBGPR("-->xgbe_restart_dev\n");
/* If not running, "restart" will happen on open */
if ((pdata->netdev->if_drv_flags & IFF_DRV_RUNNING) == 0)
return;
xgbe_stop(pdata);
xgbe_free_tx_data(pdata);
xgbe_free_rx_data(pdata);
xgbe_start(pdata);
DBGPR("<--xgbe_restart_dev\n");
}
static void xgbe_restart(void *ctx, int pending)
{
struct xgbe_prv_data *pdata = ctx;
xgbe_restart_dev(pdata);
}
static void xgbe_packet_info(struct xgbe_prv_data *pdata,
struct xgbe_ring *ring, struct mbuf *m0,
struct xgbe_packet_data *packet)
{
struct mbuf *m;
unsigned int len;
packet->m = m0;
packet->rdesc_count = 0;
packet->tx_packets = 1;
packet->tx_bytes = m_length(m0, NULL);
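/* One Tx descriptor is needed per XGBE_TX_MAX_BUF_SIZE chunk of every mbuf in the chain. */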
for (m = m0; m != NULL; m = m->m_next) {
for (len = m->m_len; len != 0;) {
packet->rdesc_count++;
len -= MIN(len, XGBE_TX_MAX_BUF_SIZE);
}
}
}
int xgbe_open(struct ifnet *netdev)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
int ret;
DBGPR("-->xgbe_open\n");
/* Initialize the phy */
ret = xgbe_phy_init(pdata);
if (ret)
return ret;
/* Calculate the Rx buffer size before allocating rings */
ret = xgbe_calc_rx_buf_size(netdev, if_getmtu(netdev));
if (ret < 0) {
goto err_ptpclk;
}
pdata->rx_buf_size = ret;
/* Allocate the channel and ring structures */
ret = xgbe_alloc_channels(pdata);
if (ret) {
printf("xgbe_alloc_channels failed\n");
goto err_ptpclk;
}
/* Allocate the ring descriptors and buffers */
ret = desc_if->alloc_ring_resources(pdata);
if (ret) {
printf("desc_if->alloc_ring_resources failed\n");
goto err_channels;
}
TASK_INIT(&pdata->service_work, 0, xgbe_service, pdata);
TASK_INIT(&pdata->restart_work, 0, xgbe_restart, pdata);
xgbe_init_timers(pdata);
ret = xgbe_start(pdata);
if (ret)
goto err_rings;
clear_bit(XGBE_DOWN, &pdata->dev_state);
DBGPR("<--xgbe_open\n");
return 0;
err_rings:
desc_if->free_ring_resources(pdata);
err_channels:
xgbe_free_channels(pdata);
err_ptpclk:
return ret;
}
int xgbe_close(struct ifnet *netdev)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
DBGPR("-->xgbe_close\n");
/* Stop the device */
xgbe_stop(pdata);
/* Free the ring descriptors and buffers */
desc_if->free_ring_resources(pdata);
/* Free the channel and ring structures */
xgbe_free_channels(pdata);
set_bit(XGBE_DOWN, &pdata->dev_state);
DBGPR("<--xgbe_close\n");
return 0;
}
int xgbe_xmit(struct ifnet *ifp, struct mbuf *m)
{
struct xgbe_prv_data *pdata = ifp->if_softc;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_packet_data *packet;
int ret;
M_ASSERTPKTHDR(m);
MPASS(m->m_nextpkt == NULL);
if (__predict_false(test_bit(XGBE_DOWN, &pdata->dev_state) ||
!pdata->phy.link)) {
m_freem(m);
return (ENETDOWN);
}
channel = pdata->channel;
ring = channel->tx_ring;
packet = &ring->packet_data;
/* Calculate preliminary packet info */
memset(packet, 0, sizeof(*packet));
xgbe_packet_info(pdata, ring, m, packet);
/* Check that there are enough descriptors available */
ret = xgbe_maybe_stop_tx_queue(channel, ring, packet->rdesc_count);
if (ret)
goto tx_netdev_return;
if (!desc_if->map_tx_skb(channel, m)) {
goto tx_netdev_return;
}
/* Configure required descriptor fields for transmission */
hw_if->dev_xmit(channel);
return 0;
tx_netdev_return:
m_free(m);
return 0;
}
int xgbe_change_mtu(struct ifnet *netdev, int mtu)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
int ret;
DBGPR("-->xgbe_change_mtu\n");
ret = xgbe_calc_rx_buf_size(netdev, mtu);
if (ret < 0)
return -ret;
pdata->rx_buf_size = ret;
netdev->if_mtu = mtu;
xgbe_restart_dev(pdata);
DBGPR("<--xgbe_change_mtu\n");
return 0;
}
static void xgbe_rx_refresh(struct xgbe_channel *channel)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_ring *ring = channel->rx_ring;
struct xgbe_ring_data *rdata;
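/* Re-arm every descriptor consumed since the last refresh. */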
while (ring->dirty != ring->cur) {
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty);
/* Reset rdata values */
desc_if->unmap_rdata(pdata, rdata);
if (desc_if->map_rx_buffer(pdata, ring, rdata))
break;
hw_if->rx_desc_reset(pdata, rdata, ring->dirty);
ring->dirty++;
}
/* Make sure everything is written before the register write */
dsb(sy);
/* Update the Rx Tail Pointer Register with the address of
* the last cleaned entry */
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty - 1);
XGMAC_DMA_IOWRITE(channel, DMA_CH_RDTR_LO,
lower_32_bits(rdata->rdata_paddr));
}
static int xgbe_tx_poll(struct xgbe_channel *channel)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_ring *ring = channel->tx_ring;
struct xgbe_ring_data *rdata;
struct xgbe_ring_desc *rdesc;
int processed = 0;
unsigned int cur;
DBGPR("-->xgbe_tx_poll\n");
/* Nothing to do if there isn't a Tx ring for this channel */
if (!ring)
return 0;
cur = ring->cur;
/* Be sure we get ring->cur before accessing descriptor data */
dsb(sy);
while ((processed < XGBE_TX_DESC_MAX_PROC) &&
(ring->dirty != cur)) {
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty);
rdesc = rdata->rdesc;
if (!hw_if->tx_complete(rdesc))
break;
/* Make sure descriptor fields are read after reading the OWN
* bit */
dsb(sy);
/* Free the SKB and reset the descriptor for re-use */
desc_if->unmap_rdata(pdata, rdata);
hw_if->tx_desc_reset(rdata);
processed++;
ring->dirty++;
}
if (!processed)
return 0;
DBGPR("<--xgbe_tx_poll: processed=%d\n", processed);
return processed;
}
static int xgbe_rx_poll(struct xgbe_channel *channel, int budget)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_ring *ring = channel->rx_ring;
struct xgbe_ring_data *rdata;
struct xgbe_packet_data *packet;
struct ifnet *ifp = pdata->netdev;
struct mbuf *m;
- unsigned int incomplete, context_next, context;
+ unsigned int incomplete, context_next;
unsigned int received = 0;
int packet_count = 0;
DBGPR("-->xgbe_rx_poll: budget=%d\n", budget);
/* Nothing to do if there isn't a Rx ring for this channel */
if (!ring)
return 0;
incomplete = 0;
context_next = 0;
rdata = XGBE_GET_DESC_DATA(ring, ring->cur);
packet = &ring->packet_data;
while (packet_count < budget) {
DBGPR(" cur = %d\n", ring->cur);
read_again:
rdata = XGBE_GET_DESC_DATA(ring, ring->cur);
if (xgbe_rx_dirty_desc(ring) > (XGBE_RX_DESC_CNT >> 3))
xgbe_rx_refresh(channel);
if (hw_if->dev_read(channel))
break;
m = rdata->mb;
received++;
ring->cur++;
incomplete = XGMAC_GET_BITS(packet->attributes,
RX_PACKET_ATTRIBUTES,
INCOMPLETE);
context_next = XGMAC_GET_BITS(packet->attributes,
RX_PACKET_ATTRIBUTES,
CONTEXT_NEXT);
- context = XGMAC_GET_BITS(packet->attributes,
- RX_PACKET_ATTRIBUTES,
- CONTEXT);
/* Earlier error, just drain the remaining data */
if (incomplete || context_next) {
goto read_again;
}
if (packet->errors) {
rdata->mbuf_free = 1;
goto next_packet;
}
rdata->mb = NULL;
m->m_pkthdr.len = rdata->rx.hdr_len + rdata->rx.len;
if (rdata->rx.hdr_len != 0) {
m->m_len = rdata->rx.hdr_len;
m->m_next->m_len = rdata->rx.len;
} else {
m->m_len = rdata->rx.len;
m_freem(m->m_next);
m->m_next = NULL;
}
if_setrcvif(m, ifp);
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
ifp->if_input(ifp, m);
next_packet:
packet_count++;
}
DBGPR("<--xgbe_rx_poll: packet_count = %d\n", packet_count);
return packet_count;
}
static int xgbe_one_poll(struct xgbe_channel *channel, int budget)
{
int processed = 0;
DBGPR("-->xgbe_one_poll: budget=%d\n", budget);
/* Cleanup Tx ring first */
xgbe_tx_poll(channel);
/* Process Rx ring next */
processed = xgbe_rx_poll(channel, budget);
DBGPR("<--xgbe_one_poll: received = %d\n", processed);
return processed;
}
static int xgbe_all_poll(struct xgbe_prv_data *pdata, int budget)
{
struct xgbe_channel *channel;
int ring_budget;
int processed, last_processed;
unsigned int i;
DBGPR("-->xgbe_all_poll: budget=%d\n", budget);
processed = 0;
ring_budget = budget / pdata->rx_ring_count;
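/* Keep sweeping all channels until the budget is exhausted or no further progress is made. */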
do {
last_processed = processed;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
/* Cleanup Tx ring first */
xgbe_tx_poll(channel);
/* Process Rx ring next */
if (ring_budget > (budget - processed))
ring_budget = budget - processed;
processed += xgbe_rx_poll(channel, ring_budget);
}
} while ((processed < budget) && (processed != last_processed));
DBGPR("<--xgbe_all_poll: received = %d\n", processed);
return processed;
}
Index: head/sys/dev/axgbe/xgbe-mdio.c
===================================================================
--- head/sys/dev/axgbe/xgbe-mdio.c (revision 327172)
+++ head/sys/dev/axgbe/xgbe-mdio.c (revision 327173)
@@ -1,1180 +1,1174 @@
/*
* AMD 10Gb Ethernet driver
*
* This file is available to you under your choice of the following two
* licenses:
*
* License 1: GPLv2
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
*
* This file is free software; you may copy, redistribute and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or (at
* your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*
*
* License 2: Modified BSD
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Advanced Micro Devices, Inc. nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include "xgbe.h"
#include "xgbe-common.h"
static void xgbe_an_state_machine(struct xgbe_prv_data *pdata);
static void xgbe_an_enable_kr_training(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
reg |= XGBE_KR_TRAINING_ENABLE;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
}
static void xgbe_an_disable_kr_training(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
reg &= ~XGBE_KR_TRAINING_ENABLE;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
}
static void xgbe_pcs_power_cycle(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg |= MDIO_CTRL1_LPOWER;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
DELAY(75);
reg &= ~MDIO_CTRL1_LPOWER;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
}
static void xgbe_serdes_start_ratechange(struct xgbe_prv_data *pdata)
{
/* Assert Rx and Tx ratechange */
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, RATECHANGE, 1);
}
static void xgbe_serdes_complete_ratechange(struct xgbe_prv_data *pdata)
{
unsigned int wait;
u16 status;
/* Release Rx and Tx ratechange */
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, RATECHANGE, 0);
/* Wait for Rx and Tx ready */
wait = XGBE_RATECHANGE_COUNT;
while (wait--) {
DELAY(50);
status = XSIR0_IOREAD(pdata, SIR0_STATUS);
if (XSIR_GET_BITS(status, SIR0_STATUS, RX_READY) &&
XSIR_GET_BITS(status, SIR0_STATUS, TX_READY))
goto rx_reset;
}
rx_reset:
/* Perform Rx reset for the DFE changes */
XRXTX_IOWRITE_BITS(pdata, RXTX_REG6, RESETB_RXD, 0);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG6, RESETB_RXD, 1);
}
static void xgbe_xgmii_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Enable KR training */
xgbe_an_enable_kr_training(pdata);
/* Set MAC to 10G speed */
pdata->hw_if.set_xgmii_speed(pdata);
/* Set PCS to KR/10G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBR;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED10G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 10G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_10000_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_10000_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_10000_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_10000]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_10000]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_10000]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_gmii_2500_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Disable KR training */
xgbe_an_disable_kr_training(pdata);
/* Set MAC to 2.5G speed */
pdata->hw_if.set_gmii_2500_speed(pdata);
/* Set PCS to KX/1G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBX;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED1G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 2.5G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_2500_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_2500_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_2500_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_2500]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_2500]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_2500]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_gmii_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Disable KR training */
xgbe_an_disable_kr_training(pdata);
/* Set MAC to 1G speed */
pdata->hw_if.set_gmii_speed(pdata);
/* Set PCS to KX/1G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBX;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED1G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 1G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_1000_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_1000_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_1000_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_1000]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_1000]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_1000]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_cur_mode(struct xgbe_prv_data *pdata,
enum xgbe_mode *mode)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
if ((reg & MDIO_PCS_CTRL2_TYPE) == MDIO_PCS_CTRL2_10GBR)
*mode = XGBE_MODE_KR;
else
*mode = XGBE_MODE_KX;
}
static bool xgbe_in_kr_mode(struct xgbe_prv_data *pdata)
{
enum xgbe_mode mode;
xgbe_cur_mode(pdata, &mode);
return (mode == XGBE_MODE_KR);
}
static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
{
/* If we are in KR switch to KX, and vice-versa */
if (xgbe_in_kr_mode(pdata)) {
if (pdata->speed_set == XGBE_SPEEDSET_1000_10000)
xgbe_gmii_mode(pdata);
else
xgbe_gmii_2500_mode(pdata);
} else {
xgbe_xgmii_mode(pdata);
}
}
static void xgbe_set_mode(struct xgbe_prv_data *pdata,
enum xgbe_mode mode)
{
enum xgbe_mode cur_mode;
xgbe_cur_mode(pdata, &cur_mode);
if (mode != cur_mode)
xgbe_switch_mode(pdata);
}
static bool xgbe_use_xgmii_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_10000)
return true;
}
return false;
}
static bool xgbe_use_gmii_2500_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_2500baseX_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_2500)
return true;
}
return false;
}
static bool xgbe_use_gmii_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_1000baseKX_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_1000)
return true;
}
return false;
}
static void xgbe_set_an(struct xgbe_prv_data *pdata, bool enable, bool restart)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
reg &= ~MDIO_AN_CTRL1_ENABLE;
if (enable)
reg |= MDIO_AN_CTRL1_ENABLE;
if (restart)
reg |= MDIO_AN_CTRL1_RESTART;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_CTRL1, reg);
}
static void xgbe_restart_an(struct xgbe_prv_data *pdata)
{
xgbe_set_an(pdata, true, true);
}
static void xgbe_disable_an(struct xgbe_prv_data *pdata)
{
xgbe_set_an(pdata, false, false);
}
static enum xgbe_an xgbe_an_tx_training(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int ad_reg, lp_reg, reg;
*state = XGBE_RX_COMPLETE;
/* If we're not in KR mode then we're done */
if (!xgbe_in_kr_mode(pdata))
return XGBE_AN_PAGE_RECEIVED;
/* Enable/Disable FEC */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 2);
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL);
reg &= ~(MDIO_PMA_10GBR_FECABLE_ABLE | MDIO_PMA_10GBR_FECABLE_ERRABLE);
if ((ad_reg & 0xc000) && (lp_reg & 0xc000))
reg |= pdata->fec_ability;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
/* Start KR training */
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
if (reg & XGBE_KR_TRAINING_ENABLE) {
XSIR0_IOWRITE_BITS(pdata, SIR0_KR_RT_1, RESET, 1);
reg |= XGBE_KR_TRAINING_START;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
reg);
XSIR0_IOWRITE_BITS(pdata, SIR0_KR_RT_1, RESET, 0);
}
return XGBE_AN_PAGE_RECEIVED;
}
static enum xgbe_an xgbe_an_tx_xnp(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
u16 msg;
*state = XGBE_RX_XNP;
msg = XGBE_XNP_MCF_NULL_MESSAGE;
msg |= XGBE_XNP_MP_FORMATTED;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP + 2, 0);
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP + 1, 0);
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP, msg);
return XGBE_AN_PAGE_RECEIVED;
}
static enum xgbe_an xgbe_an_rx_bpa(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int link_support;
unsigned int reg, ad_reg, lp_reg;
/* Read Base Ability register 2 first */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 1);
/* Check for a supported mode, otherwise restart in a different one */
link_support = xgbe_in_kr_mode(pdata) ? 0x80 : 0x20;
if (!(reg & link_support))
return XGBE_AN_INCOMPAT_LINK;
/* Check Extended Next Page support */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA);
return ((ad_reg & XGBE_XNP_NP_EXCHANGE) ||
(lp_reg & XGBE_XNP_NP_EXCHANGE))
? xgbe_an_tx_xnp(pdata, state)
: xgbe_an_tx_training(pdata, state);
}
static enum xgbe_an xgbe_an_rx_xnp(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int ad_reg, lp_reg;
/* Check Extended Next Page support */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_XNP);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPX);
return ((ad_reg & XGBE_XNP_NP_EXCHANGE) ||
(lp_reg & XGBE_XNP_NP_EXCHANGE))
? xgbe_an_tx_xnp(pdata, state)
: xgbe_an_tx_training(pdata, state);
}
static enum xgbe_an xgbe_an_page_received(struct xgbe_prv_data *pdata)
{
enum xgbe_rx *state;
unsigned long an_timeout;
enum xgbe_an ret;
if (!pdata->an_start) {
pdata->an_start = ticks;
} else {
an_timeout = pdata->an_start +
((uint64_t)XGBE_AN_MS_TIMEOUT * (uint64_t)hz) / 1000ull;
if ((int)(ticks - an_timeout) > 0) {
/* Auto-negotiation timed out, reset state */
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
pdata->an_start = ticks;
}
}
state = xgbe_in_kr_mode(pdata) ? &pdata->kr_state
: &pdata->kx_state;
switch (*state) {
case XGBE_RX_BPA:
ret = xgbe_an_rx_bpa(pdata, state);
break;
case XGBE_RX_XNP:
ret = xgbe_an_rx_xnp(pdata, state);
break;
default:
ret = XGBE_AN_ERROR;
}
return ret;
}
static enum xgbe_an xgbe_an_incompat_link(struct xgbe_prv_data *pdata)
{
/* Be sure we aren't looping trying to negotiate */
if (xgbe_in_kr_mode(pdata)) {
pdata->kr_state = XGBE_RX_ERROR;
if (!(pdata->phy.advertising & ADVERTISED_1000baseKX_Full) &&
!(pdata->phy.advertising & ADVERTISED_2500baseX_Full))
return XGBE_AN_NO_LINK;
if (pdata->kx_state != XGBE_RX_BPA)
return XGBE_AN_NO_LINK;
} else {
pdata->kx_state = XGBE_RX_ERROR;
if (!(pdata->phy.advertising & ADVERTISED_10000baseKR_Full))
return XGBE_AN_NO_LINK;
if (pdata->kr_state != XGBE_RX_BPA)
return XGBE_AN_NO_LINK;
}
xgbe_disable_an(pdata);
xgbe_switch_mode(pdata);
xgbe_restart_an(pdata);
return XGBE_AN_INCOMPAT_LINK;
}
static void xgbe_an_isr(void *data)
{
struct xgbe_prv_data *pdata = (struct xgbe_prv_data *)data;
/* Disable AN interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
/* Save the interrupt(s) that fired */
pdata->an_int = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_INT);
if (pdata->an_int) {
/* Clear the interrupt(s) that fired and process them */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, ~pdata->an_int);
xgbe_an_state_machine(pdata);
} else {
/* Enable AN interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK,
XGBE_AN_INT_MASK);
}
}
static void xgbe_an_state_machine(struct xgbe_prv_data *pdata)
{
enum xgbe_an cur_state = pdata->an_state;
sx_xlock(&pdata->an_mutex);
if (!pdata->an_int)
goto out;
next_int:
if (pdata->an_int & XGBE_AN_PG_RCV) {
pdata->an_state = XGBE_AN_PAGE_RECEIVED;
pdata->an_int &= ~XGBE_AN_PG_RCV;
} else if (pdata->an_int & XGBE_AN_INC_LINK) {
pdata->an_state = XGBE_AN_INCOMPAT_LINK;
pdata->an_int &= ~XGBE_AN_INC_LINK;
} else if (pdata->an_int & XGBE_AN_INT_CMPLT) {
pdata->an_state = XGBE_AN_COMPLETE;
pdata->an_int &= ~XGBE_AN_INT_CMPLT;
} else {
pdata->an_state = XGBE_AN_ERROR;
}
pdata->an_result = pdata->an_state;
again:
cur_state = pdata->an_state;
switch (pdata->an_state) {
case XGBE_AN_READY:
pdata->an_supported = 0;
break;
case XGBE_AN_PAGE_RECEIVED:
pdata->an_state = xgbe_an_page_received(pdata);
pdata->an_supported++;
break;
case XGBE_AN_INCOMPAT_LINK:
pdata->an_supported = 0;
pdata->parallel_detect = 0;
pdata->an_state = xgbe_an_incompat_link(pdata);
break;
case XGBE_AN_COMPLETE:
pdata->parallel_detect = pdata->an_supported ? 0 : 1;
break;
case XGBE_AN_NO_LINK:
break;
default:
pdata->an_state = XGBE_AN_ERROR;
}
if (pdata->an_state == XGBE_AN_NO_LINK) {
pdata->an_int = 0;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
} else if (pdata->an_state == XGBE_AN_ERROR) {
pdata->an_int = 0;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
}
if (pdata->an_state >= XGBE_AN_COMPLETE) {
pdata->an_result = pdata->an_state;
pdata->an_state = XGBE_AN_READY;
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
pdata->an_start = 0;
}
if (cur_state != pdata->an_state)
goto again;
if (pdata->an_int)
goto next_int;
out:
/* Enable AN interrupts on the way out */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, XGBE_AN_INT_MASK);
sx_xunlock(&pdata->an_mutex);
}
static void xgbe_an_init(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Set up Advertisement register 3 first */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
reg &= ~0xc000;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2, reg);
/* Set up Advertisement register 2 next */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1);
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full)
reg |= 0x80;
else
reg &= ~0x80;
if ((pdata->phy.advertising & ADVERTISED_1000baseKX_Full) ||
(pdata->phy.advertising & ADVERTISED_2500baseX_Full))
reg |= 0x20;
else
reg &= ~0x20;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1, reg);
/* Set up Advertisement register 1 last */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
if (pdata->phy.advertising & ADVERTISED_Pause)
reg |= 0x400;
else
reg &= ~0x400;
if (pdata->phy.advertising & ADVERTISED_Asym_Pause)
reg |= 0x800;
else
reg &= ~0x800;
/* We don't intend to perform XNP */
reg &= ~XGBE_XNP_NP_EXCHANGE;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE, reg);
}
static void xgbe_phy_adjust_link(struct xgbe_prv_data *pdata)
{
- int new_state = 0;
if (pdata->phy.link) {
/* Flow control support */
pdata->pause_autoneg = pdata->phy.pause_autoneg;
if (pdata->tx_pause != pdata->phy.tx_pause) {
- new_state = 1;
pdata->hw_if.config_tx_flow_control(pdata);
pdata->tx_pause = pdata->phy.tx_pause;
}
if (pdata->rx_pause != pdata->phy.rx_pause) {
- new_state = 1;
pdata->hw_if.config_rx_flow_control(pdata);
pdata->rx_pause = pdata->phy.rx_pause;
}
/* Speed support */
if (pdata->phy_speed != pdata->phy.speed) {
- new_state = 1;
pdata->phy_speed = pdata->phy.speed;
}
if (pdata->phy_link != pdata->phy.link) {
- new_state = 1;
pdata->phy_link = pdata->phy.link;
}
} else if (pdata->phy_link) {
- new_state = 1;
pdata->phy_link = 0;
pdata->phy_speed = SPEED_UNKNOWN;
}
}
static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
{
/* Disable auto-negotiation */
xgbe_disable_an(pdata);
/* Validate/Set specified speed */
switch (pdata->phy.speed) {
case SPEED_10000:
xgbe_set_mode(pdata, XGBE_MODE_KR);
break;
case SPEED_2500:
case SPEED_1000:
xgbe_set_mode(pdata, XGBE_MODE_KX);
break;
default:
return -EINVAL;
}
/* Validate duplex mode */
if (pdata->phy.duplex != DUPLEX_FULL)
return -EINVAL;
return 0;
}
static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
{
set_bit(XGBE_LINK_INIT, &pdata->dev_state);
pdata->link_check = ticks;
if (pdata->phy.autoneg != AUTONEG_ENABLE)
return xgbe_phy_config_fixed(pdata);
/* Disable auto-negotiation interrupt */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
/* Clear any auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
/* Start auto-negotiation in a supported mode */
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full) {
xgbe_set_mode(pdata, XGBE_MODE_KR);
} else if ((pdata->phy.advertising & ADVERTISED_1000baseKX_Full) ||
(pdata->phy.advertising & ADVERTISED_2500baseX_Full)) {
xgbe_set_mode(pdata, XGBE_MODE_KX);
} else {
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
return -EINVAL;
}
/* Disable and stop any in progress auto-negotiation */
xgbe_disable_an(pdata);
/* Clear any auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
pdata->an_result = XGBE_AN_READY;
pdata->an_state = XGBE_AN_READY;
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
/* Re-enable auto-negotiation interrupt */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
/* Set up advertisement registers based on current settings */
xgbe_an_init(pdata);
/* Enable and start auto-negotiation */
xgbe_restart_an(pdata);
return 0;
}
static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
{
int ret;
sx_xlock(&pdata->an_mutex);
ret = __xgbe_phy_config_aneg(pdata);
if (ret)
set_bit(XGBE_LINK_ERR, &pdata->dev_state);
else
clear_bit(XGBE_LINK_ERR, &pdata->dev_state);
sx_unlock(&pdata->an_mutex);
return ret;
}
static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
{
return (pdata->an_result == XGBE_AN_COMPLETE);
}
static void xgbe_check_link_timeout(struct xgbe_prv_data *pdata)
{
unsigned long link_timeout;
link_timeout = pdata->link_check + (XGBE_LINK_TIMEOUT * hz);
if ((int)(ticks - link_timeout) >= 0) {
xgbe_phy_config_aneg(pdata);
}
}
static void xgbe_phy_status_force(struct xgbe_prv_data *pdata)
{
if (xgbe_in_kr_mode(pdata)) {
pdata->phy.speed = SPEED_10000;
} else {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.speed = SPEED_1000;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.speed = SPEED_2500;
break;
}
}
pdata->phy.duplex = DUPLEX_FULL;
}
static void xgbe_phy_status_aneg(struct xgbe_prv_data *pdata)
{
unsigned int ad_reg, lp_reg;
pdata->phy.lp_advertising = 0;
if ((pdata->phy.autoneg != AUTONEG_ENABLE) || pdata->parallel_detect)
return xgbe_phy_status_force(pdata);
pdata->phy.lp_advertising |= ADVERTISED_Autoneg;
pdata->phy.lp_advertising |= ADVERTISED_Backplane;
/* Compare Advertisement and Link Partner register 1 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA);
if (lp_reg & 0x400)
pdata->phy.lp_advertising |= ADVERTISED_Pause;
if (lp_reg & 0x800)
pdata->phy.lp_advertising |= ADVERTISED_Asym_Pause;
if (pdata->phy.pause_autoneg) {
/* Set flow control based on auto-negotiation result */
pdata->phy.tx_pause = 0;
pdata->phy.rx_pause = 0;
if (ad_reg & lp_reg & 0x400) {
pdata->phy.tx_pause = 1;
pdata->phy.rx_pause = 1;
} else if (ad_reg & lp_reg & 0x800) {
if (ad_reg & 0x400)
pdata->phy.rx_pause = 1;
else if (lp_reg & 0x400)
pdata->phy.tx_pause = 1;
}
}
/* Compare Advertisement and Link Partner register 2 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 1);
if (lp_reg & 0x80)
pdata->phy.lp_advertising |= ADVERTISED_10000baseKR_Full;
if (lp_reg & 0x20) {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.lp_advertising |= ADVERTISED_1000baseKX_Full;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.lp_advertising |= ADVERTISED_2500baseX_Full;
break;
}
}
ad_reg &= lp_reg;
if (ad_reg & 0x80) {
pdata->phy.speed = SPEED_10000;
xgbe_set_mode(pdata, XGBE_MODE_KR);
} else if (ad_reg & 0x20) {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.speed = SPEED_1000;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.speed = SPEED_2500;
break;
}
xgbe_set_mode(pdata, XGBE_MODE_KX);
} else {
pdata->phy.speed = SPEED_UNKNOWN;
}
/* Compare Advertisement and Link Partner register 3 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 2);
}
static void xgbe_phy_status(struct xgbe_prv_data *pdata)
{
unsigned int reg, link_aneg;
if (test_bit(XGBE_LINK_ERR, &pdata->dev_state)) {
pdata->phy.link = 0;
goto adjust_link;
}
link_aneg = (pdata->phy.autoneg == AUTONEG_ENABLE);
/* Get the link status. Link status is latched low, so read
* once to clear and then read again to get current state
*/
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1);
pdata->phy.link = (reg & MDIO_STAT1_LSTATUS) ? 1 : 0;
if (pdata->phy.link) {
if (link_aneg && !xgbe_phy_aneg_done(pdata)) {
xgbe_check_link_timeout(pdata);
return;
}
xgbe_phy_status_aneg(pdata);
if (test_bit(XGBE_LINK_INIT, &pdata->dev_state))
clear_bit(XGBE_LINK_INIT, &pdata->dev_state);
} else {
if (test_bit(XGBE_LINK_INIT, &pdata->dev_state)) {
xgbe_check_link_timeout(pdata);
if (link_aneg)
return;
}
xgbe_phy_status_aneg(pdata);
}
adjust_link:
xgbe_phy_adjust_link(pdata);
}
static void xgbe_phy_stop(struct xgbe_prv_data *pdata)
{
/* Disable auto-negotiation */
xgbe_disable_an(pdata);
/* Disable auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
bus_teardown_intr(pdata->dev, pdata->an_irq_res, pdata->an_irq_tag);
pdata->phy.link = 0;
xgbe_phy_adjust_link(pdata);
}
static int xgbe_phy_start(struct xgbe_prv_data *pdata)
{
int ret;
ret = bus_setup_intr(pdata->dev, pdata->an_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_an_isr, pdata,
&pdata->an_irq_tag);
if (ret) {
return -ret;
}
/* Set initial mode - call the mode setting routines
directly to ensure we are properly configured
*/
if (xgbe_use_xgmii_mode(pdata)) {
xgbe_xgmii_mode(pdata);
} else if (xgbe_use_gmii_mode(pdata)) {
xgbe_gmii_mode(pdata);
} else if (xgbe_use_gmii_2500_mode(pdata)) {
xgbe_gmii_2500_mode(pdata);
} else {
ret = -EINVAL;
goto err_irq;
}
/* Set up advertisement registers based on current settings */
xgbe_an_init(pdata);
/* Enable auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
return xgbe_phy_config_aneg(pdata);
err_irq:
bus_teardown_intr(pdata->dev, pdata->an_irq_res, pdata->an_irq_tag);
return ret;
}
static int xgbe_phy_reset(struct xgbe_prv_data *pdata)
{
unsigned int count, reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg |= MDIO_CTRL1_RESET;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
count = 50;
do {
DELAY(20);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
} while ((reg & MDIO_CTRL1_RESET) && --count);
if (reg & MDIO_CTRL1_RESET)
return -ETIMEDOUT;
/* Disable auto-negotiation for now */
xgbe_disable_an(pdata);
/* Clear auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
return 0;
}
static void xgbe_phy_init(struct xgbe_prv_data *pdata)
{
sx_init(&pdata->an_mutex, "axgbe AN lock");
pdata->mdio_mmd = MDIO_MMD_PCS;
/* Initialize supported features */
pdata->phy.supported = SUPPORTED_Autoneg;
pdata->phy.supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
pdata->phy.supported |= SUPPORTED_Backplane;
pdata->phy.supported |= SUPPORTED_10000baseKR_Full;
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.supported |= SUPPORTED_1000baseKX_Full;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.supported |= SUPPORTED_2500baseX_Full;
break;
}
pdata->fec_ability = XMDIO_READ(pdata, MDIO_MMD_PMAPMD,
MDIO_PMA_10GBR_FECABLE);
pdata->fec_ability &= (MDIO_PMA_10GBR_FECABLE_ABLE |
MDIO_PMA_10GBR_FECABLE_ERRABLE);
if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
pdata->phy.supported |= SUPPORTED_10000baseR_FEC;
pdata->phy.advertising = pdata->phy.supported;
pdata->phy.address = 0;
pdata->phy.autoneg = AUTONEG_ENABLE;
pdata->phy.speed = SPEED_UNKNOWN;
pdata->phy.duplex = DUPLEX_UNKNOWN;
pdata->phy.link = 0;
pdata->phy.pause_autoneg = pdata->pause_autoneg;
pdata->phy.tx_pause = pdata->tx_pause;
pdata->phy.rx_pause = pdata->rx_pause;
/* Fix up Flow Control advertising */
pdata->phy.advertising &= ~ADVERTISED_Pause;
pdata->phy.advertising &= ~ADVERTISED_Asym_Pause;
if (pdata->rx_pause) {
pdata->phy.advertising |= ADVERTISED_Pause;
pdata->phy.advertising |= ADVERTISED_Asym_Pause;
}
if (pdata->tx_pause)
pdata->phy.advertising ^= ADVERTISED_Asym_Pause;
}
void xgbe_init_function_ptrs_phy(struct xgbe_phy_if *phy_if)
{
phy_if->phy_init = xgbe_phy_init;
phy_if->phy_reset = xgbe_phy_reset;
phy_if->phy_start = xgbe_phy_start;
phy_if->phy_stop = xgbe_phy_stop;
phy_if->phy_status = xgbe_phy_status;
phy_if->phy_config_aneg = xgbe_phy_config_aneg;
}
Index: head/sys/dev/e1000/igb_txrx.c
===================================================================
--- head/sys/dev/e1000/igb_txrx.c (revision 327172)
+++ head/sys/dev/e1000/igb_txrx.c (revision 327173)
@@ -1,584 +1,584 @@
/*-
* Copyright (c) 2016 Matthew Macy <mmacy@mattmacy.io>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#include "if_em.h"
#ifdef RSS
#include <net/rss_config.h>
#include <netinet/in_rss.h>
#endif
#ifdef VERBOSE_DEBUG
#define DPRINTF device_printf
#else
#define DPRINTF(...)
#endif
/*********************************************************************
* Local Function prototypes
*********************************************************************/
static int igb_isc_txd_encap(void *arg, if_pkt_info_t pi);
static void igb_isc_txd_flush(void *arg, uint16_t txqid, qidx_t pidx);
static int igb_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear);
static void igb_isc_rxd_refill(void *arg, if_rxd_update_t iru);
static void igb_isc_rxd_flush(void *arg, uint16_t rxqid, uint8_t flid __unused, qidx_t pidx);
static int igb_isc_rxd_available(void *arg, uint16_t rxqid, qidx_t idx, qidx_t budget);
static int igb_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri);
static int igb_tx_ctx_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status);
static int igb_tso_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status);
static void igb_rx_checksum(u32 staterr, if_rxd_info_t ri, u32 ptype);
static int igb_determine_rsstype(u16 pkt_info);
extern void igb_if_enable_intr(if_ctx_t ctx);
extern int em_intr(void *arg);
struct if_txrx igb_txrx = {
igb_isc_txd_encap,
igb_isc_txd_flush,
igb_isc_txd_credits_update,
igb_isc_rxd_available,
igb_isc_rxd_pkt_get,
igb_isc_rxd_refill,
igb_isc_rxd_flush,
em_intr
};
extern if_shared_ctx_t em_sctx;
/**********************************************************************
*
* Setup work for hardware segmentation offload (TSO) on
* adapters using advanced tx descriptors
*
**********************************************************************/
static int
igb_tso_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status)
{
struct e1000_adv_tx_context_desc *TXD;
struct adapter *adapter = txr->adapter;
u32 type_tucmd_mlhl = 0, vlan_macip_lens = 0;
u32 mss_l4len_idx = 0;
u32 paylen;
switch(pi->ipi_etype) {
case ETHERTYPE_IPV6:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
break;
case ETHERTYPE_IP:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
/* Tell transmit desc to also do IPv4 checksum. */
*olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
break;
default:
panic("%s: CSUM_TSO but no supported IP version (0x%04x)",
__func__, ntohs(pi->ipi_etype));
break;
}
TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[pi->ipi_pidx];
/* This is used in the transmit desc in encap */
paylen = pi->ipi_len - pi->ipi_ehdrlen - pi->ipi_ip_hlen - pi->ipi_tcp_hlen;
/* VLAN MACLEN IPLEN */
if (pi->ipi_mflags & M_VLANTAG) {
vlan_macip_lens |= (pi->ipi_vtag << E1000_ADVTXD_VLAN_SHIFT);
}
vlan_macip_lens |= pi->ipi_ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= pi->ipi_ip_hlen;
TXD->vlan_macip_lens = htole32(vlan_macip_lens);
/* ADV DTYPE TUCMD */
type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
/* MSS L4LEN IDX */
mss_l4len_idx |= (pi->ipi_tso_segsz << E1000_ADVTXD_MSS_SHIFT);
mss_l4len_idx |= (pi->ipi_tcp_hlen << E1000_ADVTXD_L4LEN_SHIFT);
/* 82575 needs the queue index added */
if (adapter->hw.mac.type == e1000_82575)
mss_l4len_idx |= txr->me << 4;
TXD->mss_l4len_idx = htole32(mss_l4len_idx);
TXD->seqnum_seed = htole32(0);
*cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
*olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
*olinfo_status |= paylen << E1000_ADVTXD_PAYLEN_SHIFT;
return (1);
}
/*********************************************************************
*
* Advanced Context Descriptor setup for VLAN, CSUM or TSO
*
**********************************************************************/
static int
igb_tx_ctx_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status)
{
struct e1000_adv_tx_context_desc *TXD;
struct adapter *adapter = txr->adapter;
u32 vlan_macip_lens, type_tucmd_mlhl;
u32 mss_l4len_idx;
mss_l4len_idx = vlan_macip_lens = type_tucmd_mlhl = 0;
int offload = TRUE;
/* First check if TSO is to be used */
if (pi->ipi_csum_flags & CSUM_TSO)
return (igb_tso_setup(txr, pi, cmd_type_len, olinfo_status));
/* Indicate the whole packet as payload when not doing TSO */
*olinfo_status |= pi->ipi_len << E1000_ADVTXD_PAYLEN_SHIFT;
/* Now ready a context descriptor */
TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[pi->ipi_pidx];
/*
** In advanced descriptors the vlan tag must
** be placed into the context descriptor. Hence
** we need to make one even if not doing offloads.
*/
if (pi->ipi_mflags & M_VLANTAG) {
vlan_macip_lens |= (pi->ipi_vtag << E1000_ADVTXD_VLAN_SHIFT);
} else if ((pi->ipi_csum_flags & IGB_CSUM_OFFLOAD) == 0) {
return (0);
}
/* Set the ether header length */
vlan_macip_lens |= pi->ipi_ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
switch(pi->ipi_etype) {
case ETHERTYPE_IP:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
break;
case ETHERTYPE_IPV6:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
break;
default:
offload = FALSE;
break;
}
vlan_macip_lens |= pi->ipi_ip_hlen;
type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
switch (pi->ipi_ipproto) {
case IPPROTO_TCP:
if (pi->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
break;
case IPPROTO_UDP:
if (pi->ipi_csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
break;
case IPPROTO_SCTP:
if (pi->ipi_csum_flags & (CSUM_IP_SCTP | CSUM_IP6_SCTP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
default:
offload = FALSE;
break;
}
if (offload) /* For the TX descriptor setup */
*olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
/* 82575 needs the queue index added */
if (adapter->hw.mac.type == e1000_82575)
mss_l4len_idx = txr->me << 4;
/* Now copy bits into descriptor */
TXD->vlan_macip_lens = htole32(vlan_macip_lens);
TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
TXD->seqnum_seed = htole32(0);
TXD->mss_l4len_idx = htole32(mss_l4len_idx);
return (1);
}
static int
igb_isc_txd_encap(void *arg, if_pkt_info_t pi)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
struct em_tx_queue *que = &sc->tx_queues[pi->ipi_qsidx];
struct tx_ring *txr = &que->txr;
int nsegs = pi->ipi_nsegs;
bus_dma_segment_t *segs = pi->ipi_segs;
union e1000_adv_tx_desc *txd = NULL;
- int i, j, first, pidx_last;
+ int i, j, pidx_last;
u32 olinfo_status, cmd_type_len, txd_flags;
qidx_t ntxd;
pidx_last = olinfo_status = 0;
/* Basic descriptor defines */
cmd_type_len = (E1000_ADVTXD_DTYP_DATA |
E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT);
if (pi->ipi_mflags & M_VLANTAG)
cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
- first = i = pi->ipi_pidx;
+ i = pi->ipi_pidx;
ntxd = scctx->isc_ntxd[0];
txd_flags = pi->ipi_flags & IPI_TX_INTR ? E1000_ADVTXD_DCMD_RS : 0;
/* Consume the first descriptor */
i += igb_tx_ctx_setup(txr, pi, &cmd_type_len, &olinfo_status);
if (i == scctx->isc_ntxd[0])
i = 0;
/* 82575 needs the queue index added */
if (sc->hw.mac.type == e1000_82575)
olinfo_status |= txr->me << 4;
for (j = 0; j < nsegs; j++) {
bus_size_t seglen;
bus_addr_t segaddr;
txd = (union e1000_adv_tx_desc *)&txr->tx_base[i];
seglen = segs[j].ds_len;
segaddr = htole64(segs[j].ds_addr);
txd->read.buffer_addr = segaddr;
txd->read.cmd_type_len = htole32(E1000_TXD_CMD_IFCS |
cmd_type_len | seglen);
txd->read.olinfo_status = htole32(olinfo_status);
pidx_last = i;
if (++i == scctx->isc_ntxd[0]) {
i = 0;
}
}
if (txd_flags) {
txr->tx_rsq[txr->tx_rs_pidx] = pidx_last;
txr->tx_rs_pidx = (txr->tx_rs_pidx+1) & (ntxd-1);
MPASS(txr->tx_rs_pidx != txr->tx_rs_cidx);
}
txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | txd_flags);
pi->ipi_new_pidx = i;
return (0);
}
static void
igb_isc_txd_flush(void *arg, uint16_t txqid, qidx_t pidx)
{
struct adapter *adapter = arg;
struct em_tx_queue *que = &adapter->tx_queues[txqid];
struct tx_ring *txr = &que->txr;
E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), pidx);
}
static int
igb_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear)
{
struct adapter *adapter = arg;
if_softc_ctx_t scctx = adapter->shared;
struct em_tx_queue *que = &adapter->tx_queues[txqid];
struct tx_ring *txr = &que->txr;
qidx_t processed = 0;
int updated;
qidx_t cur, prev, ntxd, rs_cidx;
int32_t delta;
uint8_t status;
rs_cidx = txr->tx_rs_cidx;
if (rs_cidx == txr->tx_rs_pidx)
return (0);
cur = txr->tx_rsq[rs_cidx];
status = ((union e1000_adv_tx_desc *)&txr->tx_base[cur])->wb.status;
updated = !!(status & E1000_TXD_STAT_DD);
if (!clear || !updated)
return (updated);
prev = txr->tx_cidx_processed;
ntxd = scctx->isc_ntxd[0];
do {
delta = (int32_t)cur - (int32_t)prev;
MPASS(prev == 0 || delta != 0);
if (delta < 0)
delta += ntxd;
processed += delta;
prev = cur;
rs_cidx = (rs_cidx + 1) & (ntxd-1);
if (rs_cidx == txr->tx_rs_pidx)
break;
cur = txr->tx_rsq[rs_cidx];
status = ((union e1000_adv_tx_desc *)&txr->tx_base[cur])->wb.status;
} while ((status & E1000_TXD_STAT_DD));
txr->tx_rs_cidx = rs_cidx;
txr->tx_cidx_processed = prev;
return (processed);
}
static void
igb_isc_rxd_refill(void *arg, if_rxd_update_t iru)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
uint16_t rxqid = iru->iru_qsidx;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
union e1000_adv_rx_desc *rxd;
struct rx_ring *rxr = &que->rxr;
uint64_t *paddrs;
uint32_t next_pidx, pidx;
uint16_t count;
int i;
paddrs = iru->iru_paddrs;
pidx = iru->iru_pidx;
count = iru->iru_count;
for (i = 0, next_pidx = pidx; i < count; i++) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[next_pidx];
rxd->read.pkt_addr = htole64(paddrs[i]);
if (++next_pidx == scctx->isc_nrxd[0])
next_pidx = 0;
}
}
static void
igb_isc_rxd_flush(void *arg, uint16_t rxqid, uint8_t flid __unused, qidx_t pidx)
{
struct adapter *sc = arg;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
struct rx_ring *rxr = &que->rxr;
E1000_WRITE_REG(&sc->hw, E1000_RDT(rxr->me), pidx);
}
static int
igb_isc_rxd_available(void *arg, uint16_t rxqid, qidx_t idx, qidx_t budget)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
struct rx_ring *rxr = &que->rxr;
union e1000_adv_rx_desc *rxd;
u32 staterr = 0;
int cnt, i, iter;
if (budget == 1) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[idx];
staterr = le32toh(rxd->wb.upper.status_error);
return (staterr & E1000_RXD_STAT_DD);
}
for (iter = cnt = 0, i = idx; iter < scctx->isc_nrxd[0] && iter <= budget;) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[i];
staterr = le32toh(rxd->wb.upper.status_error);
if ((staterr & E1000_RXD_STAT_DD) == 0)
break;
if (++i == scctx->isc_nrxd[0]) {
i = 0;
}
if (staterr & E1000_RXD_STAT_EOP)
cnt++;
iter++;
}
return (cnt);
}
/****************************************************************
* This routine sends data that has been DMA'd into host memory
* to the upper layer and initializes the ri structure.
*
* Returns 0 upon success, errno on failure
***************************************************************/
static int
igb_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
{
struct adapter *adapter = arg;
if_softc_ctx_t scctx = adapter->shared;
struct em_rx_queue *que = &adapter->rx_queues[ri->iri_qsidx];
struct rx_ring *rxr = &que->rxr;
struct ifnet *ifp = iflib_get_ifp(adapter->ctx);
union e1000_adv_rx_desc *rxd;
u16 pkt_info, len;
u16 vtag = 0;
u32 ptype;
u32 staterr = 0;
bool eop;
int i = 0;
int cidx = ri->iri_cidx;
do {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[cidx];
staterr = le32toh(rxd->wb.upper.status_error);
pkt_info = le16toh(rxd->wb.lower.lo_dword.hs_rss.pkt_info);
MPASS ((staterr & E1000_RXD_STAT_DD) != 0);
len = le16toh(rxd->wb.upper.length);
ptype = le32toh(rxd->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK;
ri->iri_len += len;
rxr->rx_bytes += ri->iri_len;
rxd->wb.upper.status_error = 0;
eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP);
if (((adapter->hw.mac.type == e1000_i350) ||
(adapter->hw.mac.type == e1000_i354)) &&
(staterr & E1000_RXDEXT_STATERR_LB))
vtag = be16toh(rxd->wb.upper.vlan);
else
vtag = le16toh(rxd->wb.upper.vlan);
/* Make sure bad packets are discarded */
if (eop && ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0)) {
adapter->dropped_pkts++;
++rxr->rx_discarded;
return (EBADMSG);
}
ri->iri_frags[i].irf_flid = 0;
ri->iri_frags[i].irf_idx = cidx;
ri->iri_frags[i].irf_len = len;
if (++cidx == scctx->isc_nrxd[0])
cidx = 0;
#ifdef notyet
if (rxr->hdr_split == TRUE) {
ri->iri_frags[i].irf_flid = 1;
ri->iri_frags[i].irf_idx = cidx;
if (++cidx == scctx->isc_nrxd[0])
cidx = 0;
}
#endif
i++;
} while (!eop);
rxr->rx_packets++;
if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
igb_rx_checksum(staterr, ri, ptype);
if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
(staterr & E1000_RXD_STAT_VP) != 0) {
ri->iri_vtag = vtag;
ri->iri_flags |= M_VLANTAG;
}
ri->iri_flowid =
le32toh(rxd->wb.lower.hi_dword.rss);
ri->iri_rsstype = igb_determine_rsstype(pkt_info);
ri->iri_nfrags = i;
return (0);
}
/*********************************************************************
*
* Verify that the hardware indicated that the checksum is valid.
* Inform the stack about the status of the checksum so that the
* stack doesn't spend time verifying it.
*
*********************************************************************/
static void
igb_rx_checksum(u32 staterr, if_rxd_info_t ri, u32 ptype)
{
u16 status = (u16)staterr;
u8 errors = (u8) (staterr >> 24);
bool sctp = FALSE;
/* Ignore Checksum bit is set */
if (status & E1000_RXD_STAT_IXSM) {
ri->iri_csum_flags = 0;
return;
}
if ((ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 &&
(ptype & E1000_RXDADV_PKTTYPE_SCTP) != 0)
sctp = 1;
else
sctp = 0;
if (status & E1000_RXD_STAT_IPCS) {
/* Did it pass? */
if (!(errors & E1000_RXD_ERR_IPE)) {
/* IP Checksum Good */
ri->iri_csum_flags = CSUM_IP_CHECKED;
ri->iri_csum_flags |= CSUM_IP_VALID;
} else
ri->iri_csum_flags = 0;
}
if (status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
if (sctp) /* reassign */
type = CSUM_SCTP_VALID;
/* Did it pass? */
if (!(errors & E1000_RXD_ERR_TCPE)) {
ri->iri_csum_flags |= type;
if (sctp == 0)
ri->iri_csum_data = htons(0xffff);
}
}
return;
}
/********************************************************************
*
* Parse the packet type to determine the appropriate hash
*
******************************************************************/
static int
igb_determine_rsstype(u16 pkt_info)
{
switch (pkt_info & E1000_RXDADV_RSSTYPE_MASK) {
case E1000_RXDADV_RSSTYPE_IPV4_TCP:
return M_HASHTYPE_RSS_TCP_IPV4;
case E1000_RXDADV_RSSTYPE_IPV4:
return M_HASHTYPE_RSS_IPV4;
case E1000_RXDADV_RSSTYPE_IPV6_TCP:
return M_HASHTYPE_RSS_TCP_IPV6;
case E1000_RXDADV_RSSTYPE_IPV6_EX:
return M_HASHTYPE_RSS_IPV6_EX;
case E1000_RXDADV_RSSTYPE_IPV6:
return M_HASHTYPE_RSS_IPV6;
case E1000_RXDADV_RSSTYPE_IPV6_TCP_EX:
return M_HASHTYPE_RSS_TCP_IPV6_EX;
default:
return M_HASHTYPE_OPAQUE;
}
}
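One detail worth noting in igb_isc_txd_credits_update() above is how it counts completed descriptors across a ring wrap: it takes the signed difference between the previous and current report-status indices and corrects a negative result by adding the ring size. A small stand-alone sketch of that arithmetic (assuming, as the driver does with its (ntxd - 1) masking, that the ring size is a power of two) could be:
#include <stdint.h>
static inline int32_t
ring_delta(uint16_t prev, uint16_t cur, uint16_t ntxd)
{
	int32_t delta = (int32_t)cur - (int32_t)prev;
	if (delta < 0)		/* the current index wrapped past the end of the ring */
		delta += ntxd;
	return (delta);
}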
Index: head/sys/dev/extres/clk/clk_bus.c
===================================================================
--- head/sys/dev/extres/clk/clk_bus.c (revision 327172)
+++ head/sys/dev/extres/clk/clk_bus.c (revision 327173)
@@ -1,93 +1,91 @@
/*-
* Copyright 2016 Michal Meloun <mmel@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <dev/fdt/simplebus.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus_subr.h>
struct ofw_clkbus_softc {
struct simplebus_softc simplebus_sc;
};
static int
ofw_clkbus_probe(device_t dev)
{
const char *name;
name = ofw_bus_get_name(dev);
if (name == NULL || strcmp(name, "clocks") != 0)
return (ENXIO);
device_set_desc(dev, "OFW clocks bus");
return (BUS_PROBE_GENERIC);
}
static int
ofw_clkbus_attach(device_t dev)
{
- struct ofw_clkbus_softc *sc;
phandle_t node, child;
device_t cdev;
- sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
simplebus_init(dev, node);
for (child = OF_child(node); child > 0; child = OF_peer(child)) {
cdev = simplebus_add_device(dev, child, 0, NULL, -1, NULL);
if (cdev != NULL)
device_probe_and_attach(cdev);
}
return (bus_generic_attach(dev));
}
static device_method_t ofw_clkbus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ofw_clkbus_probe),
DEVMETHOD(device_attach, ofw_clkbus_attach),
DEVMETHOD_END
};
DEFINE_CLASS_1(ofw_clkbus, ofw_clkbus_driver, ofw_clkbus_methods,
sizeof(struct ofw_clkbus_softc), simplebus_driver);
static devclass_t ofw_clkbus_devclass;
EARLY_DRIVER_MODULE(ofw_clkbus, simplebus, ofw_clkbus_driver,
ofw_clkbus_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
MODULE_VERSION(ofw_clkbus, 1);
Index: head/sys/dev/extres/regulator/regulator_bus.c
===================================================================
--- head/sys/dev/extres/regulator/regulator_bus.c (revision 327172)
+++ head/sys/dev/extres/regulator/regulator_bus.c (revision 327173)
@@ -1,89 +1,87 @@
/*-
* Copyright 2016 Michal Meloun <mmel@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <dev/fdt/simplebus.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus_subr.h>
struct ofw_regulator_bus_softc {
struct simplebus_softc simplebus_sc;
};
static int
ofw_regulator_bus_probe(device_t dev)
{
const char *name;
name = ofw_bus_get_name(dev);
if (name == NULL || strcmp(name, "regulators") != 0)
return (ENXIO);
device_set_desc(dev, "OFW regulators bus");
return (0);
}
static int
ofw_regulator_bus_attach(device_t dev)
{
- struct ofw_regulator_bus_softc *sc;
phandle_t node, child;
- sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
simplebus_init(dev, node);
for (child = OF_child(node); child > 0; child = OF_peer(child)) {
simplebus_add_device(dev, child, 0, NULL, -1, NULL);
}
return (bus_generic_attach(dev));
}
static device_method_t ofw_regulator_bus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ofw_regulator_bus_probe),
DEVMETHOD(device_attach, ofw_regulator_bus_attach),
DEVMETHOD_END
};
DEFINE_CLASS_1(ofw_regulator_bus, ofw_regulator_bus_driver,
ofw_regulator_bus_methods, sizeof(struct ofw_regulator_bus_softc),
simplebus_driver);
static devclass_t ofw_regulator_bus_devclass;
EARLY_DRIVER_MODULE(ofw_regulator_bus, simplebus, ofw_regulator_bus_driver,
ofw_regulator_bus_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
MODULE_VERSION(ofw_regulator_bus, 1);
Index: head/sys/dev/fdt/fdt_common.c
===================================================================
--- head/sys/dev/fdt/fdt_common.c (revision 327172)
+++ head/sys/dev/fdt/fdt_common.c (revision 327173)
@@ -1,745 +1,737 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009-2014 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under sponsorship from
* the FreeBSD Foundation.
* This software was developed by Semihalf under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/limits.h>
#include <sys/sysctl.h>
#include <machine/resource.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/openfirm.h>
#include "ofw_bus_if.h"
#ifdef DEBUG
#define debugf(fmt, args...) do { printf("%s(): ", __func__); \
printf(fmt,##args); } while (0)
#else
#define debugf(fmt, args...)
#endif
#define FDT_COMPAT_LEN 255
#define FDT_TYPE_LEN 64
#define FDT_REG_CELLS 4
#define FDT_RANGES_SIZE 48
SYSCTL_NODE(_hw, OID_AUTO, fdt, CTLFLAG_RD, 0, "Flattened Device Tree");
vm_paddr_t fdt_immr_pa;
vm_offset_t fdt_immr_va;
vm_offset_t fdt_immr_size;
struct fdt_ic_list fdt_ic_list_head = SLIST_HEAD_INITIALIZER(fdt_ic_list_head);
static int fdt_is_compatible(phandle_t, const char *);
static int
fdt_get_range_by_busaddr(phandle_t node, u_long addr, u_long *base,
u_long *size)
{
pcell_t ranges[32], *rangesptr;
pcell_t addr_cells, size_cells, par_addr_cells;
u_long bus_addr, par_bus_addr, pbase, psize;
int err, i, len, tuple_size, tuples;
if (node == 0) {
*base = 0;
*size = ULONG_MAX;
return (0);
}
if ((fdt_addrsize_cells(node, &addr_cells, &size_cells)) != 0)
return (ENXIO);
/*
* Process 'ranges' property.
*/
par_addr_cells = fdt_parent_addr_cells(node);
if (par_addr_cells > 2) {
return (ERANGE);
}
len = OF_getproplen(node, "ranges");
if (len < 0)
return (-1);
if (len > sizeof(ranges))
return (ENOMEM);
if (len == 0) {
return (fdt_get_range_by_busaddr(OF_parent(node), addr,
base, size));
}
if (OF_getprop(node, "ranges", ranges, sizeof(ranges)) <= 0)
return (EINVAL);
tuple_size = addr_cells + par_addr_cells + size_cells;
tuples = len / (tuple_size * sizeof(cell_t));
if (par_addr_cells > 2 || addr_cells > 2 || size_cells > 2)
return (ERANGE);
*base = 0;
*size = 0;
for (i = 0; i < tuples; i++) {
rangesptr = &ranges[i * tuple_size];
bus_addr = fdt_data_get((void *)rangesptr, addr_cells);
if (bus_addr != addr)
continue;
rangesptr += addr_cells;
par_bus_addr = fdt_data_get((void *)rangesptr, par_addr_cells);
rangesptr += par_addr_cells;
err = fdt_get_range_by_busaddr(OF_parent(node), par_bus_addr,
&pbase, &psize);
if (err > 0)
return (err);
if (err == 0)
*base = pbase;
else
*base = par_bus_addr;
*size = fdt_data_get((void *)rangesptr, size_cells);
return (0);
}
return (EINVAL);
}
int
fdt_get_range(phandle_t node, int range_id, u_long *base, u_long *size)
{
pcell_t ranges[FDT_RANGES_SIZE], *rangesptr;
pcell_t addr_cells, size_cells, par_addr_cells;
u_long par_bus_addr, pbase, psize;
- int err, len, tuple_size, tuples;
+ int err, len;
if ((fdt_addrsize_cells(node, &addr_cells, &size_cells)) != 0)
return (ENXIO);
/*
* Process 'ranges' property.
*/
par_addr_cells = fdt_parent_addr_cells(node);
if (par_addr_cells > 2)
return (ERANGE);
len = OF_getproplen(node, "ranges");
if (len > sizeof(ranges))
return (ENOMEM);
if (len == 0) {
*base = 0;
*size = ULONG_MAX;
return (0);
}
if (!(range_id < len))
return (ERANGE);
if (OF_getprop(node, "ranges", ranges, sizeof(ranges)) <= 0)
return (EINVAL);
- tuple_size = sizeof(pcell_t) * (addr_cells + par_addr_cells +
- size_cells);
- tuples = len / tuple_size;
-
if (par_addr_cells > 2 || addr_cells > 2 || size_cells > 2)
return (ERANGE);
*base = 0;
*size = 0;
rangesptr = &ranges[range_id];
*base = fdt_data_get((void *)rangesptr, addr_cells);
rangesptr += addr_cells;
par_bus_addr = fdt_data_get((void *)rangesptr, par_addr_cells);
rangesptr += par_addr_cells;
err = fdt_get_range_by_busaddr(OF_parent(node), par_bus_addr,
&pbase, &psize);
if (err == 0)
*base += pbase;
else
*base += par_bus_addr;
*size = fdt_data_get((void *)rangesptr, size_cells);
return (0);
}
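/*
 * A minimal standalone sketch (not part of this file) of what a "ranges"
 * tuple expresses: <child-bus-address parent-bus-address size>, so the
 * translation fdt_get_range() above performs is an offset into the parent
 * window.  The cell widths (1/1/1) and all values below are hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	/* ranges = <0x00000000 0xf1000000 0x00100000>; host byte order. */
	const unsigned long child_base = 0x00000000;
	const unsigned long parent_base = 0xf1000000;
	const unsigned long size = 0x00100000;
	unsigned long child_addr = 0x72000;	/* some child bus address */

	if (child_addr - child_base < size)
		printf("child 0x%lx -> parent 0x%lx\n", child_addr,
		    parent_base + (child_addr - child_base));
	return (0);
}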
int
fdt_immr_addr(vm_offset_t immr_va)
{
phandle_t node;
u_long base, size;
int r;
/*
* Try to access the SOC node directly i.e. through /aliases/.
*/
if ((node = OF_finddevice("soc")) != 0)
if (fdt_is_compatible(node, "simple-bus"))
goto moveon;
/*
* Find the node the long way.
*/
if ((node = OF_finddevice("/")) == 0)
return (ENXIO);
if ((node = fdt_find_compatible(node, "simple-bus", 0)) == 0)
return (ENXIO);
moveon:
if ((r = fdt_get_range(node, 0, &base, &size)) == 0) {
fdt_immr_pa = base;
fdt_immr_va = immr_va;
fdt_immr_size = size;
}
return (r);
}
/*
* This routine is an early-usage version of ofw_bus_is_compatible(), for
* cases when the ofw_bus I/F is not available (like early console routines).
* Note the buffer has to be on the stack since malloc() is usually not
* available in such cases either.
*/
static int
fdt_is_compatible(phandle_t node, const char *compatstr)
{
char buf[FDT_COMPAT_LEN];
char *compat;
int len, onelen, l, rv;
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
compat = (char *)&buf;
bzero(compat, FDT_COMPAT_LEN);
if (OF_getprop(node, "compatible", compat, FDT_COMPAT_LEN) < 0)
return (0);
onelen = strlen(compatstr);
rv = 0;
while (len > 0) {
if (strncasecmp(compat, compatstr, onelen) == 0) {
/* Found it. */
rv = 1;
break;
}
/* Slide to the next sub-string. */
l = strlen(compat) + 1;
compat += l;
len -= l;
}
return (rv);
}
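/*
 * A minimal standalone sketch (not part of this file), assuming a
 * NUL-separated "compatible" list as described above, of how the
 * sub-string walk in fdt_is_compatible() finds a match.  The sample
 * property contents are hypothetical and the comparison is simplified
 * to an exact (case-insensitive) match.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>

static int
compat_list_matches(const char *buf, int len, const char *compatstr)
{
	int l;

	while (len > 0) {
		if (strcasecmp(buf, compatstr) == 0)
			return (1);
		l = strlen(buf) + 1;	/* skip this sub-string and its NUL */
		buf += l;
		len -= l;
	}
	return (0);
}

int
main(void)
{
	/* Two entries: "vendor,soc-uart" followed by "ns16550a". */
	const char compat[] = "vendor,soc-uart\0ns16550a";

	printf("%d\n", compat_list_matches(compat, sizeof(compat), "ns16550a"));
	return (0);
}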
int
fdt_is_compatible_strict(phandle_t node, const char *compatible)
{
char compat[FDT_COMPAT_LEN];
if (OF_getproplen(node, "compatible") <= 0)
return (0);
if (OF_getprop(node, "compatible", compat, FDT_COMPAT_LEN) < 0)
return (0);
if (strncasecmp(compat, compatible, FDT_COMPAT_LEN) == 0)
/* This fits. */
return (1);
return (0);
}
phandle_t
fdt_find_compatible(phandle_t start, const char *compat, int strict)
{
phandle_t child;
/*
* Traverse all children of 'start' node, and find first with
* matching 'compatible' property.
*/
for (child = OF_child(start); child != 0; child = OF_peer(child))
if (fdt_is_compatible(child, compat)) {
if (strict)
if (!fdt_is_compatible_strict(child, compat))
continue;
return (child);
}
return (0);
}
phandle_t
fdt_depth_search_compatible(phandle_t start, const char *compat, int strict)
{
phandle_t child, node;
/*
* Depth-search all descendants of 'start' node, and find first with
* matching 'compatible' property.
*/
for (node = OF_child(start); node != 0; node = OF_peer(node)) {
if (fdt_is_compatible(node, compat) &&
(strict == 0 || fdt_is_compatible_strict(node, compat))) {
return (node);
}
child = fdt_depth_search_compatible(node, compat, strict);
if (child != 0)
return (child);
}
return (0);
}
int
fdt_is_enabled(phandle_t node)
{
char *stat;
int ena, len;
len = OF_getprop_alloc(node, "status", sizeof(char),
(void **)&stat);
if (len <= 0)
/* It is OK if no 'status' property. */
return (1);
/* Anything other than 'okay' means disabled. */
ena = 0;
if (strncmp((char *)stat, "okay", len) == 0)
ena = 1;
OF_prop_free(stat);
return (ena);
}
int
fdt_is_type(phandle_t node, const char *typestr)
{
char type[FDT_TYPE_LEN];
if (OF_getproplen(node, "device_type") <= 0)
return (0);
if (OF_getprop(node, "device_type", type, FDT_TYPE_LEN) < 0)
return (0);
if (strncasecmp(type, typestr, FDT_TYPE_LEN) == 0)
/* This fits. */
return (1);
return (0);
}
int
fdt_parent_addr_cells(phandle_t node)
{
pcell_t addr_cells;
/* Find out #address-cells of the superior bus. */
if (OF_searchprop(OF_parent(node), "#address-cells", &addr_cells,
sizeof(addr_cells)) <= 0)
return (2);
return ((int)fdt32_to_cpu(addr_cells));
}
int
fdt_pm_is_enabled(phandle_t node)
{
int ret;
ret = 1;
#if defined(SOC_MV_KIRKWOOD) || defined(SOC_MV_DISCOVERY)
ret = fdt_pm(node);
#endif
return (ret);
}
u_long
fdt_data_get(void *data, int cells)
{
if (cells == 1)
return (fdt32_to_cpu(*((uint32_t *)data)));
return (fdt64_to_cpu(*((uint64_t *)data)));
}
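/*
 * A minimal standalone sketch (not part of this file): FDT cells are
 * stored big-endian, so fdt_data_get() above treats one cell as a 32-bit
 * big-endian word and two cells as a 64-bit big-endian word.  The sample
 * bytes below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
cells_to_u64(const uint8_t *p, int cells)
{
	uint64_t v;
	int i;

	v = 0;
	for (i = 0; i < cells * 4; i++)
		v = (v << 8) | p[i];	/* big-endian byte order */
	return (v);
}

int
main(void)
{
	/* A two-cell (#address-cells = 2) encoding of 0x0000000140000000. */
	const uint8_t reg[] = { 0x00, 0x00, 0x00, 0x01,
				0x40, 0x00, 0x00, 0x00 };

	printf("1 cell:  0x%llx\n", (unsigned long long)cells_to_u64(reg, 1));
	printf("2 cells: 0x%llx\n", (unsigned long long)cells_to_u64(reg, 2));
	return (0);
}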
int
fdt_addrsize_cells(phandle_t node, int *addr_cells, int *size_cells)
{
pcell_t cell;
int cell_size;
/*
* Retrieve #{address,size}-cells.
*/
cell_size = sizeof(cell);
if (OF_getencprop(node, "#address-cells", &cell, cell_size) < cell_size)
cell = 2;
*addr_cells = (int)cell;
if (OF_getencprop(node, "#size-cells", &cell, cell_size) < cell_size)
cell = 1;
*size_cells = (int)cell;
if (*addr_cells > 3 || *size_cells > 2)
return (ERANGE);
return (0);
}
int
fdt_data_to_res(pcell_t *data, int addr_cells, int size_cells, u_long *start,
u_long *count)
{
/* Address portion. */
if (addr_cells > 2)
return (ERANGE);
*start = fdt_data_get((void *)data, addr_cells);
data += addr_cells;
/* Size portion. */
if (size_cells > 2)
return (ERANGE);
*count = fdt_data_get((void *)data, size_cells);
return (0);
}
int
fdt_regsize(phandle_t node, u_long *base, u_long *size)
{
pcell_t reg[4];
int addr_cells, len, size_cells;
if (fdt_addrsize_cells(OF_parent(node), &addr_cells, &size_cells))
return (ENXIO);
if ((sizeof(pcell_t) * (addr_cells + size_cells)) > sizeof(reg))
return (ENOMEM);
len = OF_getprop(node, "reg", &reg, sizeof(reg));
if (len <= 0)
return (EINVAL);
*base = fdt_data_get(&reg[0], addr_cells);
*size = fdt_data_get(&reg[addr_cells], size_cells);
return (0);
}
int
fdt_reg_to_rl(phandle_t node, struct resource_list *rl)
{
u_long end, count, start;
pcell_t *reg, *regptr;
pcell_t addr_cells, size_cells;
int tuple_size, tuples;
int i, rv;
long busaddr, bussize;
if (fdt_addrsize_cells(OF_parent(node), &addr_cells, &size_cells) != 0)
return (ENXIO);
if (fdt_get_range(OF_parent(node), 0, &busaddr, &bussize)) {
busaddr = 0;
bussize = 0;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
tuples = OF_getprop_alloc(node, "reg", tuple_size, (void **)&reg);
debugf("addr_cells = %d, size_cells = %d\n", addr_cells, size_cells);
debugf("tuples = %d, tuple size = %d\n", tuples, tuple_size);
if (tuples <= 0)
/* No 'reg' property in this node. */
return (0);
regptr = reg;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(reg, addr_cells, size_cells, &start,
&count);
if (rv != 0) {
resource_list_free(rl);
goto out;
}
reg += addr_cells + size_cells;
/* Calculate address range relative to base. */
start += busaddr;
end = start + count - 1;
debugf("reg addr start = %lx, end = %lx, count = %lx\n", start,
end, count);
resource_list_add(rl, SYS_RES_MEMORY, i, start, end,
count);
}
rv = 0;
out:
OF_prop_free(regptr);
return (rv);
}
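/*
 * A minimal standalone sketch (not part of this file) of the arithmetic
 * fdt_reg_to_rl() above performs: each "reg" tuple is rebased onto the
 * parent bus window returned by fdt_get_range() before it is added to
 * the resource list.  All numbers below are hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long busaddr = 0xf1000000;	/* from fdt_get_range() */
	const unsigned long start = 0x72000, count = 0x4000; /* one reg tuple */
	unsigned long abs_start, abs_end;

	abs_start = busaddr + start;
	abs_end = abs_start + count - 1;
	printf("SYS_RES_MEMORY 0x%lx-0x%lx (0x%lx bytes)\n",
	    abs_start, abs_end, count);
	return (0);
}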
int
fdt_get_phyaddr(phandle_t node, device_t dev, int *phy_addr, void **phy_sc)
{
phandle_t phy_node;
pcell_t phy_handle, phy_reg;
uint32_t i;
device_t parent, child;
if (OF_getencprop(node, "phy-handle", (void *)&phy_handle,
sizeof(phy_handle)) <= 0)
return (ENXIO);
phy_node = OF_node_from_xref(phy_handle);
if (OF_getencprop(phy_node, "reg", (void *)&phy_reg,
sizeof(phy_reg)) <= 0)
return (ENXIO);
*phy_addr = phy_reg;
/*
* Search for softc used to communicate with phy.
*/
/*
* Step 1: Search for ancestor of the phy-node with a "phy-handle"
* property set.
*/
phy_node = OF_parent(phy_node);
while (phy_node != 0) {
if (OF_getprop(phy_node, "phy-handle", (void *)&phy_handle,
sizeof(phy_handle)) > 0)
break;
phy_node = OF_parent(phy_node);
}
if (phy_node == 0)
return (ENXIO);
/*
* Step 2: For each device with the same parent and name as ours,
* compare its node with the one found in step 1, the ancestor of the
* PHY node (stored in phy_node).
*/
parent = device_get_parent(dev);
i = 0;
child = device_find_child(parent, device_get_name(dev), i);
while (child != NULL) {
if (ofw_bus_get_node(child) == phy_node)
break;
i++;
child = device_find_child(parent, device_get_name(dev), i);
}
if (child == NULL)
return (ENXIO);
/*
* Use softc of the device found.
*/
*phy_sc = (void *)device_get_softc(child);
return (0);
}
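/*
 * A minimal standalone sketch (not part of this file) of the "walk up
 * until an ancestor carries the property" step used by fdt_get_phyaddr()
 * above, modelled on a hypothetical parent-linked node structure; the
 * node names and the has_phy_handle flag stand in for the OF_parent()
 * and OF_getprop() lookups.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	const char	*name;
	struct node	*parent;
	bool		 has_phy_handle;
};

static struct node *
find_ancestor_with_phy_handle(struct node *n)
{
	for (n = n->parent; n != NULL; n = n->parent)
		if (n->has_phy_handle)
			return (n);
	return (NULL);
}

int
main(void)
{
	struct node root = { "/", NULL, false };
	struct node eth = { "ethernet@f1072000", &root, true };
	struct node mdio = { "mdio", &eth, false };
	struct node phy = { "phy@0", &mdio, false };
	struct node *hit;

	hit = find_ancestor_with_phy_handle(&phy);
	printf("%s\n", hit != NULL ? hit->name : "not found");
	return (0);
}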
int
fdt_get_reserved_regions(struct mem_region *mr, int *mrcnt)
{
pcell_t reserve[FDT_REG_CELLS * FDT_MEM_REGIONS];
pcell_t *reservep;
phandle_t memory, root;
- uint32_t memory_size;
int addr_cells, size_cells;
- int i, max_size, res_len, rv, tuple_size, tuples;
+ int i, res_len, rv, tuple_size, tuples;
- max_size = sizeof(reserve);
root = OF_finddevice("/");
memory = OF_finddevice("/memory");
if (memory == -1) {
rv = ENXIO;
goto out;
}
if ((rv = fdt_addrsize_cells(OF_parent(memory), &addr_cells,
&size_cells)) != 0)
goto out;
if (addr_cells > 2) {
rv = ERANGE;
goto out;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
res_len = OF_getproplen(root, "memreserve");
if (res_len <= 0 || res_len > sizeof(reserve)) {
rv = ERANGE;
goto out;
}
if (OF_getprop(root, "memreserve", reserve, res_len) <= 0) {
rv = ENXIO;
goto out;
}
- memory_size = 0;
tuples = res_len / tuple_size;
reservep = (pcell_t *)&reserve;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(reservep, addr_cells, size_cells,
(u_long *)&mr[i].mr_start, (u_long *)&mr[i].mr_size);
if (rv != 0)
goto out;
reservep += addr_cells + size_cells;
}
*mrcnt = i;
rv = 0;
out:
return (rv);
}
int
fdt_get_mem_regions(struct mem_region *mr, int *mrcnt, uint64_t *memsize)
{
pcell_t reg[FDT_REG_CELLS * FDT_MEM_REGIONS];
pcell_t *regp;
phandle_t memory;
uint64_t memory_size;
int addr_cells, size_cells;
- int i, max_size, reg_len, rv, tuple_size, tuples;
+ int i, reg_len, rv, tuple_size, tuples;
- max_size = sizeof(reg);
memory = OF_finddevice("/memory");
if (memory == -1) {
rv = ENXIO;
goto out;
}
if ((rv = fdt_addrsize_cells(OF_parent(memory), &addr_cells,
&size_cells)) != 0)
goto out;
if (addr_cells > 2) {
rv = ERANGE;
goto out;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
reg_len = OF_getproplen(memory, "reg");
if (reg_len <= 0 || reg_len > sizeof(reg)) {
rv = ERANGE;
goto out;
}
if (OF_getprop(memory, "reg", reg, reg_len) <= 0) {
rv = ENXIO;
goto out;
}
memory_size = 0;
tuples = reg_len / tuple_size;
regp = (pcell_t *)&reg;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(regp, addr_cells, size_cells,
(u_long *)&mr[i].mr_start, (u_long *)&mr[i].mr_size);
if (rv != 0)
goto out;
regp += addr_cells + size_cells;
memory_size += mr[i].mr_size;
}
if (memory_size == 0) {
rv = ERANGE;
goto out;
}
*mrcnt = i;
if (memsize != NULL)
*memsize = memory_size;
rv = 0;
out:
return (rv);
}
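/*
 * A minimal standalone sketch (not part of this file) of how
 * fdt_get_mem_regions() above turns /memory "reg" tuples into mem_region
 * entries plus a total size.  The cell widths and bank layout below
 * (two hypothetical 512 MB banks) are assumptions for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct mem_region_sketch {
	unsigned long mr_start;
	unsigned long mr_size;
};

int
main(void)
{
	/* reg = <0x40000000 0x20000000 0x80000000 0x20000000>; host order. */
	const uint32_t reg[] =
	    { 0x40000000, 0x20000000, 0x80000000, 0x20000000 };
	const int addr_cells = 1, size_cells = 1;
	const int tuples =
	    sizeof(reg) / sizeof(reg[0]) / (addr_cells + size_cells);
	struct mem_region_sketch mr[8];
	unsigned long memory_size;
	int i;

	memory_size = 0;
	for (i = 0; i < tuples; i++) {
		mr[i].mr_start = reg[i * (addr_cells + size_cells)];
		mr[i].mr_size = reg[i * (addr_cells + size_cells) + addr_cells];
		memory_size += mr[i].mr_size;
	}
	printf("%d regions, total 0x%lx bytes\n", i, memory_size);
	return (0);
}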
int
fdt_get_unit(device_t dev)
{
const char * name;
name = ofw_bus_get_name(dev);
name = strchr(name, '@') + 1;
return (strtol(name, NULL, 0));
}
int
fdt_get_chosen_bootargs(char *bootargs, size_t max_size)
{
phandle_t chosen;
chosen = OF_finddevice("/chosen");
if (chosen == -1)
return (ENXIO);
if (OF_getprop(chosen, "bootargs", bootargs, max_size) == -1)
return (ENXIO);
return (0);
}
Index: head/sys/dev/mii/rdcphy.c
===================================================================
--- head/sys/dev/mii/rdcphy.c (revision 327172)
+++ head/sys/dev/mii/rdcphy.c (revision 327173)
@@ -1,236 +1,234 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010, Pyun YongHyeon <yongari@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Driver for the RDC Semiconductor R6040 10/100 PHY.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/bus.h>
#include <net/if.h>
#include <net/if_media.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include "miidevs.h"
#include <dev/mii/rdcphyreg.h>
#include "miibus_if.h"
static device_probe_t rdcphy_probe;
static device_attach_t rdcphy_attach;
struct rdcphy_softc {
struct mii_softc mii_sc;
int mii_link_tick;
#define RDCPHY_MANNEG_TICK 3
};
static device_method_t rdcphy_methods[] = {
/* device interface */
DEVMETHOD(device_probe, rdcphy_probe),
DEVMETHOD(device_attach, rdcphy_attach),
DEVMETHOD(device_detach, mii_phy_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD_END
};
static devclass_t rdcphy_devclass;
static driver_t rdcphy_driver = {
"rdcphy",
rdcphy_methods,
sizeof(struct rdcphy_softc)
};
DRIVER_MODULE(rdcphy, miibus, rdcphy_driver, rdcphy_devclass, 0, 0);
static int rdcphy_service(struct mii_softc *, struct mii_data *, int);
static void rdcphy_status(struct mii_softc *);
static const struct mii_phydesc rdcphys[] = {
MII_PHY_DESC(RDC, R6040),
MII_PHY_END
};
static const struct mii_phy_funcs rdcphy_funcs = {
rdcphy_service,
rdcphy_status,
mii_phy_reset
};
static int
rdcphy_probe(device_t dev)
{
return (mii_phy_dev_probe(dev, rdcphys, BUS_PROBE_DEFAULT));
}
static int
rdcphy_attach(device_t dev)
{
mii_phy_dev_attach(dev, MIIF_NOMANPAUSE, &rdcphy_funcs, 1);
return (0);
}
static int
rdcphy_service(struct mii_softc *sc, struct mii_data *mii, int cmd)
{
struct rdcphy_softc *rsc;
struct ifmedia_entry *ife;
rsc = (struct rdcphy_softc *)sc;
ife = mii->mii_media.ifm_cur;
switch (cmd) {
case MII_POLLSTAT:
break;
case MII_MEDIACHG:
mii_phy_setmedia(sc);
switch (IFM_SUBTYPE(ife->ifm_media)) {
case IFM_100_TX:
case IFM_10_T:
/*
* Report a fake link-loss event to the parent
* driver. This stops the parent driver's MAC
* and makes it possible to reconfigure the MAC
* after link establishment completes. Note that
* the parent MAC seems to require a restart
* whenever the underlying PHY configuration
* changes, even if the resolved speed/duplex
* did not change at all.
*/
mii->mii_media_status = 0;
mii->mii_media_active = IFM_ETHER | IFM_NONE;
rsc->mii_link_tick = RDCPHY_MANNEG_TICK;
/* Immediately report link down. */
mii_phy_update(sc, MII_MEDIACHG);
return (0);
default:
break;
}
break;
case MII_TICK:
if (mii_phy_tick(sc) == EJUSTRETURN)
return (0);
if (IFM_SUBTYPE(ife->ifm_media) != IFM_AUTO) {
/*
* It seems the PHY hardware does not correctly
* report link status changes when manual link
* configuration is in progress. It is also
* possible for the PHY to complete establishing
* a link within one second, such that mii(4)
* does not notice the link change. To work
* around the issue, emulate a lost-link event
* and wait for 3 seconds while manual link
* configuration is in progress; 3 seconds is
* long enough to absorb transient link flips.
*/
if (rsc->mii_link_tick > 0) {
rsc->mii_link_tick--;
return (0);
}
}
break;
}
/* Update the media status. */
PHY_STATUS(sc);
/* Callback if something changed. */
mii_phy_update(sc, cmd);
return (0);
}
static void
rdcphy_status(struct mii_softc *sc)
{
struct mii_data *mii;
- struct ifmedia_entry *ife;
int bmsr, bmcr, physts;
mii = sc->mii_pdata;
- ife = mii->mii_media.ifm_cur;
mii->mii_media_status = IFM_AVALID;
mii->mii_media_active = IFM_ETHER;
bmsr = PHY_READ(sc, MII_BMSR) | PHY_READ(sc, MII_BMSR);
physts = PHY_READ(sc, MII_RDCPHY_STATUS);
if ((physts & STATUS_LINK_UP) != 0)
mii->mii_media_status |= IFM_ACTIVE;
bmcr = PHY_READ(sc, MII_BMCR);
if ((bmcr & BMCR_ISO) != 0) {
mii->mii_media_active |= IFM_NONE;
mii->mii_media_status = 0;
return;
}
if ((bmcr & BMCR_LOOP) != 0)
mii->mii_media_active |= IFM_LOOP;
if ((bmcr & BMCR_AUTOEN) != 0) {
if ((bmsr & BMSR_ACOMP) == 0) {
/* Erg, still trying, I guess... */
mii->mii_media_active |= IFM_NONE;
return;
}
}
switch (physts & STATUS_SPEED_MASK) {
case STATUS_SPEED_100:
mii->mii_media_active |= IFM_100_TX;
break;
case STATUS_SPEED_10:
mii->mii_media_active |= IFM_10_T;
break;
default:
mii->mii_media_active |= IFM_NONE;
return;
}
if ((physts & STATUS_FULL_DUPLEX) != 0)
mii->mii_media_active |= IFM_FDX | mii_phy_flowstatus(sc);
else
mii->mii_media_active |= IFM_HDX;
}
Index: head/sys/dev/mmc/host/dwmmc.c
===================================================================
--- head/sys/dev/mmc/host/dwmmc.c (revision 327172)
+++ head/sys/dev/mmc/host/dwmmc.c (revision 327173)
@@ -1,1188 +1,1186 @@
/*-
* Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
* All rights reserved.
*
* This software was developed by SRI International and the University of
* Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
* ("CTSRD"), as part of the DARPA CRASH research programme.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Synopsys DesignWare Mobile Storage Host Controller
* Chapter 14, Altera Cyclone V Device Handbook (CV-5V2 2014.07.22)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/rman.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#include <dev/mmc/host/dwmmc_reg.h>
#include <dev/mmc/host/dwmmc_var.h>
#include "opt_mmccam.h"
#include "mmcbr_if.h"
#define dprintf(x, arg...)
#define READ4(_sc, _reg) \
bus_read_4((_sc)->res[0], _reg)
#define WRITE4(_sc, _reg, _val) \
bus_write_4((_sc)->res[0], _reg, _val)
#define DIV_ROUND_UP(n, d) howmany(n, d)
#define DWMMC_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define DWMMC_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define DWMMC_LOCK_INIT(_sc) \
mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->dev), \
"dwmmc", MTX_DEF)
#define DWMMC_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx);
#define DWMMC_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED);
#define DWMMC_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
#define PENDING_CMD 0x01
#define PENDING_STOP 0x02
#define CARD_INIT_DONE 0x04
#define DWMMC_DATA_ERR_FLAGS (SDMMC_INTMASK_DRT | SDMMC_INTMASK_DCRC \
|SDMMC_INTMASK_HTO | SDMMC_INTMASK_SBE \
|SDMMC_INTMASK_EBE)
#define DWMMC_CMD_ERR_FLAGS (SDMMC_INTMASK_RTO | SDMMC_INTMASK_RCRC \
|SDMMC_INTMASK_RE)
#define DWMMC_ERR_FLAGS (DWMMC_DATA_ERR_FLAGS | DWMMC_CMD_ERR_FLAGS \
|SDMMC_INTMASK_HLE)
#define DES0_DIC (1 << 1)
#define DES0_LD (1 << 2)
#define DES0_FS (1 << 3)
#define DES0_CH (1 << 4)
#define DES0_ER (1 << 5)
#define DES0_CES (1 << 30)
#define DES0_OWN (1 << 31)
#define DES1_BS1_MASK 0xfff
#define DES1_BS1_SHIFT 0
struct idmac_desc {
uint32_t des0; /* control */
uint32_t des1; /* bufsize */
uint32_t des2; /* buf1 phys addr */
uint32_t des3; /* buf2 phys addr or next descr */
};
#define DESC_MAX 256
#define DESC_SIZE (sizeof(struct idmac_desc) * DESC_MAX)
#define DEF_MSIZE 0x2 /* Burst size of multiple transaction */
static void dwmmc_next_operation(struct dwmmc_softc *);
static int dwmmc_setup_bus(struct dwmmc_softc *, int);
static int dma_done(struct dwmmc_softc *, struct mmc_command *);
static int dma_stop(struct dwmmc_softc *);
static void pio_read(struct dwmmc_softc *, struct mmc_command *);
static void pio_write(struct dwmmc_softc *, struct mmc_command *);
static struct resource_spec dwmmc_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ -1, 0 }
};
#define HWTYPE_MASK (0x0000ffff)
#define HWFLAG_MASK (0xffff << 16)
static struct ofw_compat_data compat_data[] = {
{"altr,socfpga-dw-mshc", HWTYPE_ALTERA},
{"samsung,exynos5420-dw-mshc", HWTYPE_EXYNOS},
{"rockchip,rk2928-dw-mshc", HWTYPE_ROCKCHIP},
{NULL, HWTYPE_NONE},
};
static void
dwmmc_get1paddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
if (error != 0)
return;
*(bus_addr_t *)arg = segs[0].ds_addr;
}
static void
dwmmc_ring_setup(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct dwmmc_softc *sc;
int idx;
if (error != 0)
return;
sc = arg;
dprintf("nsegs %d seg0len %lu\n", nsegs, segs[0].ds_len);
for (idx = 0; idx < nsegs; idx++) {
sc->desc_ring[idx].des0 = (DES0_OWN | DES0_DIC | DES0_CH);
sc->desc_ring[idx].des1 = segs[idx].ds_len;
sc->desc_ring[idx].des2 = segs[idx].ds_addr;
if (idx == 0)
sc->desc_ring[idx].des0 |= DES0_FS;
if (idx == (nsegs - 1)) {
sc->desc_ring[idx].des0 &= ~(DES0_DIC | DES0_CH);
sc->desc_ring[idx].des0 |= DES0_LD;
}
}
}
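/*
 * A minimal standalone sketch (not part of this file) of the per-segment
 * control bits dwmmc_ring_setup() above programs: the first segment gets
 * FS, the last gets LD (and drops DIC/CH), and middle segments keep
 * OWN|DIC|CH.  The bit values mirror the DES0_* defines above; the
 * three-segment transfer is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_DES0_DIC	(1U << 1)
#define	SK_DES0_LD	(1U << 2)
#define	SK_DES0_FS	(1U << 3)
#define	SK_DES0_CH	(1U << 4)
#define	SK_DES0_OWN	(1U << 31)

int
main(void)
{
	uint32_t des0;
	int idx, nsegs = 3;

	for (idx = 0; idx < nsegs; idx++) {
		des0 = SK_DES0_OWN | SK_DES0_DIC | SK_DES0_CH;
		if (idx == 0)
			des0 |= SK_DES0_FS;
		if (idx == nsegs - 1) {
			des0 &= ~(SK_DES0_DIC | SK_DES0_CH);
			des0 |= SK_DES0_LD;
		}
		printf("seg %d des0 0x%08x\n", idx, (unsigned int)des0);
	}
	return (0);
}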
static int
dwmmc_ctrl_reset(struct dwmmc_softc *sc, int reset_bits)
{
int reg;
int i;
reg = READ4(sc, SDMMC_CTRL);
reg |= (reset_bits);
WRITE4(sc, SDMMC_CTRL, reg);
/* Wait for reset to complete */
for (i = 0; i < 100; i++) {
if (!(READ4(sc, SDMMC_CTRL) & reset_bits))
return (0);
DELAY(10);
}
device_printf(sc->dev, "Reset failed\n");
return (1);
}
static int
dma_setup(struct dwmmc_softc *sc)
{
int error;
int nidx;
int idx;
/*
* Set up TX descriptor ring, descriptors, and dma maps.
*/
error = bus_dma_tag_create(
bus_get_dma_tag(sc->dev), /* Parent tag. */
4096, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
DESC_SIZE, 1, /* maxsize, nsegments */
DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->desc_tag);
if (error != 0) {
device_printf(sc->dev,
"could not create ring DMA tag.\n");
return (1);
}
error = bus_dmamem_alloc(sc->desc_tag, (void**)&sc->desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO,
&sc->desc_map);
if (error != 0) {
device_printf(sc->dev,
"could not allocate descriptor ring.\n");
return (1);
}
error = bus_dmamap_load(sc->desc_tag, sc->desc_map,
sc->desc_ring, DESC_SIZE, dwmmc_get1paddr,
&sc->desc_ring_paddr, 0);
if (error != 0) {
device_printf(sc->dev,
"could not load descriptor ring map.\n");
return (1);
}
for (idx = 0; idx < sc->desc_count; idx++) {
sc->desc_ring[idx].des0 = DES0_CH;
sc->desc_ring[idx].des1 = 0;
nidx = (idx + 1) % sc->desc_count;
sc->desc_ring[idx].des3 = sc->desc_ring_paddr + \
(nidx * sizeof(struct idmac_desc));
}
error = bus_dma_tag_create(
bus_get_dma_tag(sc->dev), /* Parent tag. */
4096, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sc->desc_count * MMC_SECTOR_SIZE, /* maxsize */
sc->desc_count, /* nsegments */
MMC_SECTOR_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->buf_tag);
if (error != 0) {
device_printf(sc->dev,
"could not create ring DMA tag.\n");
return (1);
}
error = bus_dmamap_create(sc->buf_tag, 0,
&sc->buf_map);
if (error != 0) {
device_printf(sc->dev,
"could not create TX buffer DMA map.\n");
return (1);
}
return (0);
}
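/*
 * A minimal standalone sketch (not part of this file) of the descriptor
 * chaining dma_setup() above builds: each des3 holds the bus address of
 * the next descriptor and the last entry wraps to the first, forming a
 * ring.  The base address and descriptor count below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_DESC_COUNT	4
#define	SKETCH_DESC_SIZE	16	/* sizeof(struct idmac_desc) */

int
main(void)
{
	const uint32_t ring_paddr = 0x80100000;	/* hypothetical DMA address */
	uint32_t des3[SKETCH_DESC_COUNT];
	int idx, nidx;

	for (idx = 0; idx < SKETCH_DESC_COUNT; idx++) {
		nidx = (idx + 1) % SKETCH_DESC_COUNT;
		des3[idx] = ring_paddr + nidx * SKETCH_DESC_SIZE;
		printf("desc %d -> next at 0x%08x\n", idx,
		    (unsigned int)des3[idx]);
	}
	return (0);
}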
static void
dwmmc_cmd_done(struct dwmmc_softc *sc)
{
struct mmc_command *cmd;
cmd = sc->curcmd;
if (cmd == NULL)
return;
if (cmd->flags & MMC_RSP_PRESENT) {
if (cmd->flags & MMC_RSP_136) {
cmd->resp[3] = READ4(sc, SDMMC_RESP0);
cmd->resp[2] = READ4(sc, SDMMC_RESP1);
cmd->resp[1] = READ4(sc, SDMMC_RESP2);
cmd->resp[0] = READ4(sc, SDMMC_RESP3);
} else {
cmd->resp[3] = 0;
cmd->resp[2] = 0;
cmd->resp[1] = 0;
cmd->resp[0] = READ4(sc, SDMMC_RESP0);
}
}
}
static void
dwmmc_tasklet(struct dwmmc_softc *sc)
{
struct mmc_command *cmd;
cmd = sc->curcmd;
if (cmd == NULL)
return;
if (!sc->cmd_done)
return;
if (cmd->error != MMC_ERR_NONE || !cmd->data) {
dwmmc_next_operation(sc);
} else if (cmd->data && sc->dto_rcvd) {
if ((cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK ||
cmd->opcode == MMC_READ_MULTIPLE_BLOCK) &&
sc->use_auto_stop) {
if (sc->acd_rcvd)
dwmmc_next_operation(sc);
} else {
dwmmc_next_operation(sc);
}
}
}
static void
dwmmc_intr(void *arg)
{
struct mmc_command *cmd;
struct dwmmc_softc *sc;
uint32_t reg;
sc = arg;
DWMMC_LOCK(sc);
cmd = sc->curcmd;
/* First handle SDMMC controller interrupts */
reg = READ4(sc, SDMMC_MINTSTS);
if (reg) {
dprintf("%s 0x%08x\n", __func__, reg);
if (reg & DWMMC_CMD_ERR_FLAGS) {
WRITE4(sc, SDMMC_RINTSTS, DWMMC_CMD_ERR_FLAGS);
dprintf("cmd err 0x%08x cmd 0x%08x\n",
reg, cmd->opcode);
cmd->error = MMC_ERR_TIMEOUT;
}
if (reg & DWMMC_DATA_ERR_FLAGS) {
WRITE4(sc, SDMMC_RINTSTS, DWMMC_DATA_ERR_FLAGS);
dprintf("data err 0x%08x cmd 0x%08x\n",
reg, cmd->opcode);
cmd->error = MMC_ERR_FAILED;
if (!sc->use_pio) {
dma_done(sc, cmd);
dma_stop(sc);
}
}
if (reg & SDMMC_INTMASK_CMD_DONE) {
dwmmc_cmd_done(sc);
sc->cmd_done = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_CMD_DONE);
}
if (reg & SDMMC_INTMASK_ACD) {
sc->acd_rcvd = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_ACD);
}
if (reg & SDMMC_INTMASK_DTO) {
sc->dto_rcvd = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_DTO);
}
if (reg & SDMMC_INTMASK_CD) {
/* XXX: Handle card detect */
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_CD);
}
}
if (sc->use_pio) {
if (reg & (SDMMC_INTMASK_RXDR|SDMMC_INTMASK_DTO)) {
pio_read(sc, cmd);
}
if (reg & (SDMMC_INTMASK_TXDR|SDMMC_INTMASK_DTO)) {
pio_write(sc, cmd);
}
} else {
/* Now handle DMA interrupts */
reg = READ4(sc, SDMMC_IDSTS);
if (reg) {
dprintf("dma intr 0x%08x\n", reg);
if (reg & (SDMMC_IDINTEN_TI | SDMMC_IDINTEN_RI)) {
WRITE4(sc, SDMMC_IDSTS, (SDMMC_IDINTEN_TI |
SDMMC_IDINTEN_RI));
WRITE4(sc, SDMMC_IDSTS, SDMMC_IDINTEN_NI);
dma_done(sc, cmd);
}
}
}
dwmmc_tasklet(sc);
DWMMC_UNLOCK(sc);
}
static int
parse_fdt(struct dwmmc_softc *sc)
{
pcell_t dts_value[3];
phandle_t node;
int len;
if ((node = ofw_bus_get_node(sc->dev)) == -1)
return (ENXIO);
/* fifo-depth */
if ((len = OF_getproplen(node, "fifo-depth")) > 0) {
OF_getencprop(node, "fifo-depth", dts_value, len);
sc->fifo_depth = dts_value[0];
}
/* num-slots */
sc->num_slots = 1;
if ((len = OF_getproplen(node, "num-slots")) > 0) {
OF_getencprop(node, "num-slots", dts_value, len);
sc->num_slots = dts_value[0];
}
/*
* We need some platform-specific code to know
* what clock is supplied to our device.
* For now, rely on the value specified in the FDT.
*/
if (sc->bus_hz == 0) {
if ((len = OF_getproplen(node, "bus-frequency")) <= 0)
return (ENXIO);
OF_getencprop(node, "bus-frequency", dts_value, len);
sc->bus_hz = dts_value[0];
}
/*
* Platform-specific stuff
* XXX: Move to separate file
*/
if ((sc->hwtype & HWTYPE_MASK) != HWTYPE_EXYNOS)
return (0);
if ((len = OF_getproplen(node, "samsung,dw-mshc-ciu-div")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-ciu-div", dts_value, len);
sc->sdr_timing = (dts_value[0] << SDMMC_CLKSEL_DIVIDER_SHIFT);
sc->ddr_timing = (dts_value[0] << SDMMC_CLKSEL_DIVIDER_SHIFT);
if ((len = OF_getproplen(node, "samsung,dw-mshc-sdr-timing")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-sdr-timing", dts_value, len);
sc->sdr_timing |= ((dts_value[0] << SDMMC_CLKSEL_SAMPLE_SHIFT) |
(dts_value[1] << SDMMC_CLKSEL_DRIVE_SHIFT));
if ((len = OF_getproplen(node, "samsung,dw-mshc-ddr-timing")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-ddr-timing", dts_value, len);
sc->ddr_timing |= ((dts_value[0] << SDMMC_CLKSEL_SAMPLE_SHIFT) |
(dts_value[1] << SDMMC_CLKSEL_DRIVE_SHIFT));
return (0);
}
static int
dwmmc_probe(device_t dev)
{
uintptr_t hwtype;
if (!ofw_bus_status_okay(dev))
return (ENXIO);
hwtype = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
if (hwtype == HWTYPE_NONE)
return (ENXIO);
device_set_desc(dev, "Synopsys DesignWare Mobile "
"Storage Host Controller");
return (BUS_PROBE_DEFAULT);
}
int
dwmmc_attach(device_t dev)
{
struct dwmmc_softc *sc;
int error;
int slot;
sc = device_get_softc(dev);
sc->dev = dev;
if (sc->hwtype == HWTYPE_NONE) {
sc->hwtype =
ofw_bus_search_compatible(dev, compat_data)->ocd_data;
}
/* Why not use Auto Stop? It saves hundreds of interrupts per second. */
sc->use_auto_stop = 1;
error = parse_fdt(sc);
if (error != 0) {
device_printf(dev, "Can't get FDT property.\n");
return (ENXIO);
}
DWMMC_LOCK_INIT(sc);
if (bus_alloc_resources(dev, dwmmc_spec, sc->res)) {
device_printf(dev, "could not allocate resources\n");
return (ENXIO);
}
/* Setup interrupt handler. */
error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_NET | INTR_MPSAFE,
NULL, dwmmc_intr, sc, &sc->intr_cookie);
if (error != 0) {
device_printf(dev, "could not setup interrupt handler.\n");
return (ENXIO);
}
device_printf(dev, "Hardware version ID is %04x\n",
READ4(sc, SDMMC_VERID) & 0xffff);
if (sc->desc_count == 0)
sc->desc_count = DESC_MAX;
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_ROCKCHIP) {
sc->use_pio = 1;
sc->pwren_inverted = 1;
} else if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_EXYNOS) {
WRITE4(sc, EMMCP_MPSBEGIN0, 0);
WRITE4(sc, EMMCP_SEND0, 0);
WRITE4(sc, EMMCP_CTRL0, (MPSCTRL_SECURE_READ_BIT |
MPSCTRL_SECURE_WRITE_BIT |
MPSCTRL_NON_SECURE_READ_BIT |
MPSCTRL_NON_SECURE_WRITE_BIT |
MPSCTRL_VALID));
}
/* XXX: we support operation for slot index 0 only */
slot = 0;
if (sc->pwren_inverted) {
WRITE4(sc, SDMMC_PWREN, (0 << slot));
} else {
WRITE4(sc, SDMMC_PWREN, (1 << slot));
}
/* Reset all */
if (dwmmc_ctrl_reset(sc, (SDMMC_CTRL_RESET |
SDMMC_CTRL_FIFO_RESET |
SDMMC_CTRL_DMA_RESET)))
return (ENXIO);
dwmmc_setup_bus(sc, sc->host.f_min);
if (sc->fifo_depth == 0) {
sc->fifo_depth = 1 +
((READ4(sc, SDMMC_FIFOTH) >> SDMMC_FIFOTH_RXWMARK_S) & 0xfff);
device_printf(dev, "No fifo-depth, using FIFOTH %x\n",
sc->fifo_depth);
}
if (!sc->use_pio) {
if (dma_setup(sc))
return (ENXIO);
/* Install desc base */
WRITE4(sc, SDMMC_DBADDR, sc->desc_ring_paddr);
/* Enable DMA interrupts */
WRITE4(sc, SDMMC_IDSTS, SDMMC_IDINTEN_MASK);
WRITE4(sc, SDMMC_IDINTEN, (SDMMC_IDINTEN_NI |
SDMMC_IDINTEN_RI |
SDMMC_IDINTEN_TI));
}
/* Clear and disable interrupts for a while */
WRITE4(sc, SDMMC_RINTSTS, 0xffffffff);
WRITE4(sc, SDMMC_INTMASK, 0);
/* Maximum timeout */
WRITE4(sc, SDMMC_TMOUT, 0xffffffff);
/* Enable interrupts */
WRITE4(sc, SDMMC_RINTSTS, 0xffffffff);
WRITE4(sc, SDMMC_INTMASK, (SDMMC_INTMASK_CMD_DONE |
SDMMC_INTMASK_DTO |
SDMMC_INTMASK_ACD |
SDMMC_INTMASK_TXDR |
SDMMC_INTMASK_RXDR |
DWMMC_ERR_FLAGS |
SDMMC_INTMASK_CD));
WRITE4(sc, SDMMC_CTRL, SDMMC_CTRL_INT_ENABLE);
sc->host.f_min = 400000;
sc->host.f_max = min(200000000, sc->bus_hz);
sc->host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340;
sc->host.caps = MMC_CAP_4_BIT_DATA;
device_add_child(dev, "mmc", -1);
return (bus_generic_attach(dev));
}
static int
dwmmc_setup_bus(struct dwmmc_softc *sc, int freq)
{
int tout;
int div;
if (freq == 0) {
WRITE4(sc, SDMMC_CLKENA, 0);
WRITE4(sc, SDMMC_CMD, (SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START));
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed update clk\n");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
return (0);
}
WRITE4(sc, SDMMC_CLKENA, 0);
WRITE4(sc, SDMMC_CLKSRC, 0);
div = (sc->bus_hz != freq) ? DIV_ROUND_UP(sc->bus_hz, 2 * freq) : 0;
WRITE4(sc, SDMMC_CLKDIV, div);
WRITE4(sc, SDMMC_CMD, (SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START));
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed to update clk");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
WRITE4(sc, SDMMC_CLKENA, (SDMMC_CLKENA_CCLK_EN | SDMMC_CLKENA_LP));
WRITE4(sc, SDMMC_CMD, SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START);
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed to enable clk\n");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
return (0);
}
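/*
 * A minimal standalone sketch (not part of this file) of the divider
 * computation in dwmmc_setup_bus() above: the card clock ends up at
 * bus_hz / (2 * CLKDIV), so a hypothetical 50 MHz controller clock needs
 * CLKDIV = 63 to reach the 400 kHz identification frequency.
 */
#include <stdio.h>

#define	SK_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	const unsigned int bus_hz = 50000000;	/* hypothetical sc->bus_hz */
	const unsigned int freq = 400000;	/* requested card clock */
	unsigned int div;

	div = (bus_hz != freq) ? SK_DIV_ROUND_UP(bus_hz, 2 * freq) : 0;
	printf("CLKDIV %u -> card clock %u Hz\n", div,
	    div != 0 ? bus_hz / (2 * div) : bus_hz);
	return (0);
}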
static int
dwmmc_update_ios(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
struct mmc_ios *ios;
sc = device_get_softc(brdev);
ios = &sc->host.ios;
dprintf("Setting up clk %u bus_width %d\n",
ios->clock, ios->bus_width);
dwmmc_setup_bus(sc, ios->clock);
if (ios->bus_width == bus_width_8)
WRITE4(sc, SDMMC_CTYPE, SDMMC_CTYPE_8BIT);
else if (ios->bus_width == bus_width_4)
WRITE4(sc, SDMMC_CTYPE, SDMMC_CTYPE_4BIT);
else
WRITE4(sc, SDMMC_CTYPE, 0);
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_EXYNOS) {
/* XXX: take care of DDR vs. SDR use here */
WRITE4(sc, SDMMC_CLKSEL, sc->sdr_timing);
}
/*
* XXX: take care about DDR bit
*
* reg = READ4(sc, SDMMC_UHS_REG);
* reg |= (SDMMC_UHS_REG_DDR);
* WRITE4(sc, SDMMC_UHS_REG, reg);
*/
return (0);
}
static int
dma_done(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
data = cmd->data;
if (data->flags & MMC_DATA_WRITE)
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_POSTWRITE);
else
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_POSTREAD);
bus_dmamap_sync(sc->desc_tag, sc->desc_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->buf_tag, sc->buf_map);
return (0);
}
static int
dma_stop(struct dwmmc_softc *sc)
{
int reg;
reg = READ4(sc, SDMMC_CTRL);
reg &= ~(SDMMC_CTRL_USE_IDMAC);
reg |= (SDMMC_CTRL_DMA_RESET);
WRITE4(sc, SDMMC_CTRL, reg);
reg = READ4(sc, SDMMC_BMOD);
reg &= ~(SDMMC_BMOD_DE | SDMMC_BMOD_FB);
reg |= (SDMMC_BMOD_SWR);
WRITE4(sc, SDMMC_BMOD, reg);
return (0);
}
static int
dma_prepare(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
- int len;
int err;
int reg;
data = cmd->data;
- len = data->len;
reg = READ4(sc, SDMMC_INTMASK);
reg &= ~(SDMMC_INTMASK_TXDR | SDMMC_INTMASK_RXDR);
WRITE4(sc, SDMMC_INTMASK, reg);
err = bus_dmamap_load(sc->buf_tag, sc->buf_map,
data->data, data->len, dwmmc_ring_setup,
sc, BUS_DMA_NOWAIT);
if (err != 0)
panic("dmamap_load failed\n");
/* Ensure the device can see the desc */
bus_dmamap_sync(sc->desc_tag, sc->desc_map,
BUS_DMASYNC_PREWRITE);
if (data->flags & MMC_DATA_WRITE)
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_PREWRITE);
else
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_PREREAD);
reg = (DEF_MSIZE << SDMMC_FIFOTH_MSIZE_S);
reg |= ((sc->fifo_depth / 2) - 1) << SDMMC_FIFOTH_RXWMARK_S;
reg |= (sc->fifo_depth / 2) << SDMMC_FIFOTH_TXWMARK_S;
WRITE4(sc, SDMMC_FIFOTH, reg);
wmb();
reg = READ4(sc, SDMMC_CTRL);
reg |= (SDMMC_CTRL_USE_IDMAC | SDMMC_CTRL_DMA_ENABLE);
WRITE4(sc, SDMMC_CTRL, reg);
wmb();
reg = READ4(sc, SDMMC_BMOD);
reg |= (SDMMC_BMOD_DE | SDMMC_BMOD_FB);
WRITE4(sc, SDMMC_BMOD, reg);
/* Start */
WRITE4(sc, SDMMC_PLDMND, 1);
return (0);
}
static int
pio_prepare(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
int reg;
data = cmd->data;
data->xfer_len = 0;
reg = (DEF_MSIZE << SDMMC_FIFOTH_MSIZE_S);
reg |= ((sc->fifo_depth / 2) - 1) << SDMMC_FIFOTH_RXWMARK_S;
reg |= (sc->fifo_depth / 2) << SDMMC_FIFOTH_TXWMARK_S;
WRITE4(sc, SDMMC_FIFOTH, reg);
wmb();
return (0);
}
static void
pio_read(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t *p, status;
if (cmd == NULL || cmd->data == NULL)
return;
data = cmd->data;
if ((data->flags & MMC_DATA_READ) == 0)
return;
KASSERT((data->xfer_len & 3) == 0, ("xfer_len not aligned"));
p = (uint32_t *)data->data + (data->xfer_len >> 2);
while (data->xfer_len < data->len) {
status = READ4(sc, SDMMC_STATUS);
if (status & SDMMC_STATUS_FIFO_EMPTY)
break;
*p++ = READ4(sc, SDMMC_DATA);
data->xfer_len += 4;
}
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_RXDR);
}
static void
pio_write(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t *p, status;
if (cmd == NULL || cmd->data == NULL)
return;
data = cmd->data;
if ((data->flags & MMC_DATA_WRITE) == 0)
return;
KASSERT((data->xfer_len & 3) == 0, ("xfer_len not aligned"));
p = (uint32_t *)data->data + (data->xfer_len >> 2);
while (data->xfer_len < data->len) {
status = READ4(sc, SDMMC_STATUS);
if (status & SDMMC_STATUS_FIFO_FULL)
break;
WRITE4(sc, SDMMC_DATA, *p++);
data->xfer_len += 4;
}
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_TXDR);
}
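/*
 * A minimal standalone sketch (not part of this file) of the PIO pattern
 * used by pio_read()/pio_write() above: words move until the FIFO runs
 * dry (or full) and data->xfer_len keeps the position, so one transfer
 * may span several RXDR/TXDR interrupts.  The FIFO model below is
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	SK_FIFO_WORDS	4	/* words the "FIFO" hands out per interrupt */

static size_t
sketch_pio_read(uint8_t *dst, size_t len, size_t xfer_len, uint32_t word)
{
	size_t avail;

	/* Copy up to SK_FIFO_WORDS words, resuming at xfer_len. */
	for (avail = SK_FIFO_WORDS; avail > 0 && xfer_len < len; avail--) {
		memcpy(dst + xfer_len, &word, sizeof(word));
		xfer_len += sizeof(word);
	}
	return (xfer_len);
}

int
main(void)
{
	uint8_t buf[40];
	size_t xfer_len, ints;

	xfer_len = 0;
	for (ints = 0; xfer_len < sizeof(buf); ints++)
		xfer_len = sketch_pio_read(buf, sizeof(buf), xfer_len,
		    0xdeadbeef);
	printf("%zu bytes in %zu interrupts\n", xfer_len, ints);
	return (0);
}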
static void
dwmmc_start_cmd(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t blksz;
uint32_t cmdr;
sc->curcmd = cmd;
data = cmd->data;
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_ROCKCHIP)
dwmmc_setup_bus(sc, sc->host.ios.clock);
/* XXX Upper layers don't always set this */
cmd->mrq = sc->req;
/* Begin setting up command register. */
cmdr = cmd->opcode;
dprintf("cmd->opcode 0x%08x\n", cmd->opcode);
if (cmd->opcode == MMC_STOP_TRANSMISSION ||
cmd->opcode == MMC_GO_IDLE_STATE ||
cmd->opcode == MMC_GO_INACTIVE_STATE)
cmdr |= SDMMC_CMD_STOP_ABORT;
else if (cmd->opcode != MMC_SEND_STATUS && data)
cmdr |= SDMMC_CMD_WAIT_PRVDATA;
/* Set up response handling. */
if (MMC_RSP(cmd->flags) != MMC_RSP_NONE) {
cmdr |= SDMMC_CMD_RESP_EXP;
if (cmd->flags & MMC_RSP_136)
cmdr |= SDMMC_CMD_RESP_LONG;
}
if (cmd->flags & MMC_RSP_CRC)
cmdr |= SDMMC_CMD_RESP_CRC;
/*
* XXX: Not all platforms want this.
*/
cmdr |= SDMMC_CMD_USE_HOLD_REG;
if ((sc->flags & CARD_INIT_DONE) == 0) {
sc->flags |= (CARD_INIT_DONE);
cmdr |= SDMMC_CMD_SEND_INIT;
}
if (data) {
if ((cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK ||
cmd->opcode == MMC_READ_MULTIPLE_BLOCK) &&
sc->use_auto_stop)
cmdr |= SDMMC_CMD_SEND_ASTOP;
cmdr |= SDMMC_CMD_DATA_EXP;
if (data->flags & MMC_DATA_STREAM)
cmdr |= SDMMC_CMD_MODE_STREAM;
if (data->flags & MMC_DATA_WRITE)
cmdr |= SDMMC_CMD_DATA_WRITE;
WRITE4(sc, SDMMC_TMOUT, 0xffffffff);
WRITE4(sc, SDMMC_BYTCNT, data->len);
blksz = (data->len < MMC_SECTOR_SIZE) ? \
data->len : MMC_SECTOR_SIZE;
WRITE4(sc, SDMMC_BLKSIZ, blksz);
if (sc->use_pio) {
pio_prepare(sc, cmd);
} else {
dma_prepare(sc, cmd);
}
wmb();
}
dprintf("cmdr 0x%08x\n", cmdr);
WRITE4(sc, SDMMC_CMDARG, cmd->arg);
wmb();
WRITE4(sc, SDMMC_CMD, cmdr | SDMMC_CMD_START);
};
static void
dwmmc_next_operation(struct dwmmc_softc *sc)
{
struct mmc_request *req;
req = sc->req;
if (req == NULL)
return;
sc->acd_rcvd = 0;
sc->dto_rcvd = 0;
sc->cmd_done = 0;
/*
* XXX: Wait while the card is still busy.
* We need this to prevent data timeouts,
* mostly caused by a multi-block write command
* followed by a single-block read.
*/
while (READ4(sc, SDMMC_STATUS) & (SDMMC_STATUS_DATA_BUSY))
continue;
if (sc->flags & PENDING_CMD) {
sc->flags &= ~PENDING_CMD;
dwmmc_start_cmd(sc, req->cmd);
return;
} else if (sc->flags & PENDING_STOP && !sc->use_auto_stop) {
sc->flags &= ~PENDING_STOP;
dwmmc_start_cmd(sc, req->stop);
return;
}
sc->req = NULL;
sc->curcmd = NULL;
req->done(req);
}
static int
dwmmc_request(device_t brdev, device_t reqdev, struct mmc_request *req)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
dprintf("%s\n", __func__);
DWMMC_LOCK(sc);
if (sc->req != NULL) {
DWMMC_UNLOCK(sc);
return (EBUSY);
}
sc->req = req;
sc->flags |= PENDING_CMD;
if (sc->req->stop)
sc->flags |= PENDING_STOP;
dwmmc_next_operation(sc);
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_get_ro(device_t brdev, device_t reqdev)
{
dprintf("%s\n", __func__);
return (0);
}
static int
dwmmc_acquire_host(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
DWMMC_LOCK(sc);
while (sc->bus_busy)
msleep(sc, &sc->sc_mtx, PZERO, "dwmmcah", hz / 5);
sc->bus_busy++;
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_release_host(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
DWMMC_LOCK(sc);
sc->bus_busy--;
wakeup(sc);
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_read_ivar(device_t bus, device_t child, int which, uintptr_t *result)
{
struct dwmmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
*(int *)result = sc->host.ios.bus_mode;
break;
case MMCBR_IVAR_BUS_WIDTH:
*(int *)result = sc->host.ios.bus_width;
break;
case MMCBR_IVAR_CHIP_SELECT:
*(int *)result = sc->host.ios.chip_select;
break;
case MMCBR_IVAR_CLOCK:
*(int *)result = sc->host.ios.clock;
break;
case MMCBR_IVAR_F_MIN:
*(int *)result = sc->host.f_min;
break;
case MMCBR_IVAR_F_MAX:
*(int *)result = sc->host.f_max;
break;
case MMCBR_IVAR_HOST_OCR:
*(int *)result = sc->host.host_ocr;
break;
case MMCBR_IVAR_MODE:
*(int *)result = sc->host.mode;
break;
case MMCBR_IVAR_OCR:
*(int *)result = sc->host.ocr;
break;
case MMCBR_IVAR_POWER_MODE:
*(int *)result = sc->host.ios.power_mode;
break;
case MMCBR_IVAR_VDD:
*(int *)result = sc->host.ios.vdd;
break;
case MMCBR_IVAR_CAPS:
sc->host.caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA;
*(int *)result = sc->host.caps;
break;
case MMCBR_IVAR_MAX_DATA:
*(int *)result = sc->desc_count;
}
return (0);
}
static int
dwmmc_write_ivar(device_t bus, device_t child, int which, uintptr_t value)
{
struct dwmmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
sc->host.ios.bus_mode = value;
break;
case MMCBR_IVAR_BUS_WIDTH:
sc->host.ios.bus_width = value;
break;
case MMCBR_IVAR_CHIP_SELECT:
sc->host.ios.chip_select = value;
break;
case MMCBR_IVAR_CLOCK:
sc->host.ios.clock = value;
break;
case MMCBR_IVAR_MODE:
sc->host.mode = value;
break;
case MMCBR_IVAR_OCR:
sc->host.ocr = value;
break;
case MMCBR_IVAR_POWER_MODE:
sc->host.ios.power_mode = value;
break;
case MMCBR_IVAR_VDD:
sc->host.ios.vdd = value;
break;
/* These are read-only */
case MMCBR_IVAR_CAPS:
case MMCBR_IVAR_HOST_OCR:
case MMCBR_IVAR_F_MIN:
case MMCBR_IVAR_F_MAX:
case MMCBR_IVAR_MAX_DATA:
return (EINVAL);
}
return (0);
}
static device_method_t dwmmc_methods[] = {
DEVMETHOD(device_probe, dwmmc_probe),
DEVMETHOD(device_attach, dwmmc_attach),
/* Bus interface */
DEVMETHOD(bus_read_ivar, dwmmc_read_ivar),
DEVMETHOD(bus_write_ivar, dwmmc_write_ivar),
/* mmcbr_if */
DEVMETHOD(mmcbr_update_ios, dwmmc_update_ios),
DEVMETHOD(mmcbr_request, dwmmc_request),
DEVMETHOD(mmcbr_get_ro, dwmmc_get_ro),
DEVMETHOD(mmcbr_acquire_host, dwmmc_acquire_host),
DEVMETHOD(mmcbr_release_host, dwmmc_release_host),
DEVMETHOD_END
};
driver_t dwmmc_driver = {
"dwmmc",
dwmmc_methods,
sizeof(struct dwmmc_softc),
};
static devclass_t dwmmc_devclass;
DRIVER_MODULE(dwmmc, simplebus, dwmmc_driver, dwmmc_devclass, NULL, NULL);
DRIVER_MODULE(dwmmc, ofwbus, dwmmc_driver, dwmmc_devclass, NULL, NULL);
#ifndef MMCCAM
MMC_DECLARE_BRIDGE(dwmmc);
#endif
Index: head/sys/dev/mmc/mmc.c
===================================================================
--- head/sys/dev/mmc/mmc.c (revision 327172)
+++ head/sys/dev/mmc/mmc.c (revision 327173)
@@ -1,2586 +1,2583 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Bernd Walter. All rights reserved.
* Copyright (c) 2006 M. Warner Losh. All rights reserved.
* Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Portions of this software may have been developed with reference to
* the SD Simplified Specification. The following disclaimer may apply:
*
* The following conditions apply to the release of the simplified
* specification ("Simplified Specification") by the SD Card Association and
* the SD Group. The Simplified Specification is a subset of the complete SD
* Specification which is owned by the SD Card Association and the SD
* Group. This Simplified Specification is provided on a non-confidential
* basis subject to the disclaimers below. Any implementation of the
* Simplified Specification may require a license from the SD Card
* Association, SD Group, SD-3C LLC or other third parties.
*
* Disclaimers:
*
* The information contained in the Simplified Specification is presented only
* as a standard specification for SD Cards and SD Host/Ancillary products and
* is provided "AS-IS" without any representations or warranties of any
* kind. No responsibility is assumed by the SD Group, SD-3C LLC or the SD
* Card Association for any damages, any infringements of patents or other
* right of the SD Group, SD-3C LLC, the SD Card Association or any third
* parties, which may result from its use. No license is granted by
* implication, estoppel or otherwise under any patent or other rights of the
* SD Group, SD-3C LLC, the SD Card Association or any third party. Nothing
* herein shall be construed as an obligation by the SD Group, the SD-3C LLC
* or the SD Card Association to disclose or distribute any technical
* information, know-how or other confidential information to any third party.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmc_private.h>
#include <dev/mmc/mmc_subr.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/mmc/mmcvar.h>
#include "mmcbr_if.h"
#include "mmcbus_if.h"
CTASSERT(bus_timing_max <= sizeof(uint32_t) * NBBY);
/*
* Per-card data
*/
struct mmc_ivars {
uint32_t raw_cid[4]; /* Raw bits of the CID */
uint32_t raw_csd[4]; /* Raw bits of the CSD */
uint32_t raw_scr[2]; /* Raw bits of the SCR */
uint8_t raw_ext_csd[MMC_EXTCSD_SIZE]; /* Raw bits of the EXT_CSD */
uint32_t raw_sd_status[16]; /* Raw bits of the SD_STATUS */
uint16_t rca;
u_char read_only; /* True when the device is read-only */
u_char high_cap; /* High Capacity device (block addressed) */
enum mmc_card_mode mode;
enum mmc_bus_width bus_width; /* Bus width to use */
struct mmc_cid cid; /* cid decoded */
struct mmc_csd csd; /* csd decoded */
struct mmc_scr scr; /* scr decoded */
struct mmc_sd_status sd_status; /* SD_STATUS decoded */
uint32_t sec_count; /* Card capacity in 512byte blocks */
uint32_t timings; /* Mask of bus timings supported */
uint32_t vccq_120; /* Mask of bus timings at VCCQ of 1.2 V */
uint32_t vccq_180; /* Mask of bus timings at VCCQ of 1.8 V */
uint32_t tran_speed; /* Max speed in normal mode */
uint32_t hs_tran_speed; /* Max speed in high speed mode */
uint32_t erase_sector; /* Card native erase sector size */
uint32_t cmd6_time; /* Generic switch timeout [us] */
uint32_t quirks; /* Quirks as per mmc_quirk->quirks */
char card_id_string[64];/* Formatted CID info (serial, MFG, etc) */
char card_sn_string[16];/* Formatted serial # for disk->d_ident */
};
#define CMD_RETRIES 3
static const struct mmc_quirk mmc_quirks[] = {
/*
* For some SanDisk iNAND devices, the CMD38 argument needs to be
* provided in EXT_CSD[113].
*/
{ 0x2, 0x100, "SEM02G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM04G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM08G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM16G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM32G", MMC_QUIRK_INAND_CMD38 },
/*
* Disable TRIM for Kingston eMMCs where a firmware bug can lead to
* unrecoverable data corruption.
*/
{ 0x70, MMC_QUIRK_OID_ANY, "V10008", MMC_QUIRK_BROKEN_TRIM },
{ 0x70, MMC_QUIRK_OID_ANY, "V10016", MMC_QUIRK_BROKEN_TRIM },
{ 0x0, 0x0, NULL, 0x0 }
};
static SYSCTL_NODE(_hw, OID_AUTO, mmc, CTLFLAG_RD, NULL, "mmc driver");
static int mmc_debug;
SYSCTL_INT(_hw_mmc, OID_AUTO, debug, CTLFLAG_RWTUN, &mmc_debug, 0,
"Debug level");
/* bus entry points */
static int mmc_acquire_bus(device_t busdev, device_t dev);
static int mmc_attach(device_t dev);
static int mmc_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen);
static int mmc_detach(device_t dev);
static int mmc_probe(device_t dev);
static int mmc_read_ivar(device_t bus, device_t child, int which,
uintptr_t *result);
static int mmc_release_bus(device_t busdev, device_t dev);
static int mmc_resume(device_t dev);
static void mmc_retune_pause(device_t busdev, device_t dev, bool retune);
static void mmc_retune_unpause(device_t busdev, device_t dev);
static int mmc_suspend(device_t dev);
static int mmc_wait_for_request(device_t busdev, device_t dev,
struct mmc_request *req);
static int mmc_write_ivar(device_t bus, device_t child, int which,
uintptr_t value);
#define MMC_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define MMC_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define MMC_LOCK_INIT(_sc) \
mtx_init(&(_sc)->sc_mtx, device_get_nameunit((_sc)->dev), \
"mmc", MTX_DEF)
#define MMC_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->sc_mtx);
#define MMC_ASSERT_LOCKED(_sc) mtx_assert(&(_sc)->sc_mtx, MA_OWNED);
#define MMC_ASSERT_UNLOCKED(_sc) mtx_assert(&(_sc)->sc_mtx, MA_NOTOWNED);
static int mmc_all_send_cid(struct mmc_softc *sc, uint32_t *rawcid);
static void mmc_app_decode_scr(uint32_t *raw_scr, struct mmc_scr *scr);
static void mmc_app_decode_sd_status(uint32_t *raw_sd_status,
struct mmc_sd_status *sd_status);
static int mmc_app_sd_status(struct mmc_softc *sc, uint16_t rca,
uint32_t *rawsdstatus);
static int mmc_app_send_scr(struct mmc_softc *sc, uint16_t rca,
uint32_t *rawscr);
static int mmc_calculate_clock(struct mmc_softc *sc);
static void mmc_decode_cid_mmc(uint32_t *raw_cid, struct mmc_cid *cid,
bool is_4_41p);
static void mmc_decode_cid_sd(uint32_t *raw_cid, struct mmc_cid *cid);
static void mmc_decode_csd_mmc(uint32_t *raw_csd, struct mmc_csd *csd);
static int mmc_decode_csd_sd(uint32_t *raw_csd, struct mmc_csd *csd);
static void mmc_delayed_attach(void *xsc);
static int mmc_delete_cards(struct mmc_softc *sc, bool final);
static void mmc_discover_cards(struct mmc_softc *sc);
static void mmc_format_card_id_string(struct mmc_ivars *ivar);
static void mmc_go_discovery(struct mmc_softc *sc);
static uint32_t mmc_get_bits(uint32_t *bits, int bit_len, int start,
int size);
static int mmc_highest_voltage(uint32_t ocr);
static bool mmc_host_timing(device_t dev, enum mmc_bus_timing timing);
static void mmc_idle_cards(struct mmc_softc *sc);
static void mmc_ms_delay(int ms);
static void mmc_log_card(device_t dev, struct mmc_ivars *ivar, int newcard);
static void mmc_power_down(struct mmc_softc *sc);
static void mmc_power_up(struct mmc_softc *sc);
static void mmc_rescan_cards(struct mmc_softc *sc);
static int mmc_retune(device_t busdev, device_t dev, bool reset);
static void mmc_scan(struct mmc_softc *sc);
static int mmc_sd_switch(struct mmc_softc *sc, uint8_t mode, uint8_t grp,
uint8_t value, uint8_t *res);
static int mmc_select_card(struct mmc_softc *sc, uint16_t rca);
static uint32_t mmc_select_vdd(struct mmc_softc *sc, uint32_t ocr);
static int mmc_send_app_op_cond(struct mmc_softc *sc, uint32_t ocr,
uint32_t *rocr);
static int mmc_send_csd(struct mmc_softc *sc, uint16_t rca, uint32_t *rawcsd);
static int mmc_send_if_cond(struct mmc_softc *sc, uint8_t vhs);
static int mmc_send_op_cond(struct mmc_softc *sc, uint32_t ocr,
uint32_t *rocr);
static int mmc_send_relative_addr(struct mmc_softc *sc, uint32_t *resp);
static int mmc_set_blocklen(struct mmc_softc *sc, uint32_t len);
static int mmc_set_card_bus_width(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_set_power_class(struct mmc_softc *sc, struct mmc_ivars *ivar);
static int mmc_set_relative_addr(struct mmc_softc *sc, uint16_t resp);
static int mmc_set_timing(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_set_vccq(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_switch_to_hs200(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock);
static int mmc_switch_to_hs400(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t max_dtr, enum mmc_bus_timing max_timing);
static int mmc_test_bus_width(struct mmc_softc *sc);
static uint32_t mmc_timing_to_dtr(struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static const char *mmc_timing_to_string(enum mmc_bus_timing timing);
static void mmc_update_child_list(struct mmc_softc *sc);
static int mmc_wait_for_command(struct mmc_softc *sc, uint32_t opcode,
uint32_t arg, uint32_t flags, uint32_t *resp, int retries);
static int mmc_wait_for_req(struct mmc_softc *sc, struct mmc_request *req);
static void mmc_wakeup(struct mmc_request *req);
static void
mmc_ms_delay(int ms)
{
DELAY(1000 * ms); /* XXX BAD */
}
static int
mmc_probe(device_t dev)
{
device_set_desc(dev, "MMC/SD bus");
return (0);
}
static int
mmc_attach(device_t dev)
{
struct mmc_softc *sc;
sc = device_get_softc(dev);
sc->dev = dev;
MMC_LOCK_INIT(sc);
/* We'll probe and attach our children later, but before / mount */
sc->config_intrhook.ich_func = mmc_delayed_attach;
sc->config_intrhook.ich_arg = sc;
if (config_intrhook_establish(&sc->config_intrhook) != 0)
device_printf(dev, "config_intrhook_establish failed\n");
return (0);
}
static int
mmc_detach(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
int err;
err = mmc_delete_cards(sc, true);
if (err != 0)
return (err);
mmc_power_down(sc);
MMC_LOCK_DESTROY(sc);
return (0);
}
static int
mmc_suspend(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
int err;
err = bus_generic_suspend(dev);
if (err != 0)
return (err);
/*
* We power down with the bus acquired here, mainly so that no device
* is selected any longer and sc->last_rca gets set to 0. Otherwise,
* the deselect as part of the bus acquisition in mmc_scan() may fail
* during resume, as the bus isn't powered up again until later, in
* mmc_go_discovery().
*/
err = mmc_acquire_bus(dev, dev);
if (err != 0)
return (err);
mmc_power_down(sc);
err = mmc_release_bus(dev, dev);
return (err);
}
static int
mmc_resume(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
mmc_scan(sc);
return (bus_generic_resume(dev));
}
static int
mmc_acquire_bus(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err;
uint16_t rca;
enum mmc_bus_timing timing;
err = MMCBR_ACQUIRE_HOST(device_get_parent(busdev), busdev);
if (err)
return (err);
sc = device_get_softc(busdev);
MMC_LOCK(sc);
if (sc->owner)
panic("mmc: host bridge didn't serialize us.");
sc->owner = dev;
MMC_UNLOCK(sc);
if (busdev != dev) {
/*
* Keep track of the last rca that we've selected. If
* we're asked to do it again, don't. We never
* unselect unless the bus code itself wants the mmc
* bus, and constantly reselecting causes problems.
*/
ivar = device_get_ivars(dev);
rca = ivar->rca;
if (sc->last_rca != rca) {
if (mmc_select_card(sc, rca) != MMC_ERR_NONE) {
device_printf(busdev, "Card at relative "
"address %d failed to select\n", rca);
return (ENXIO);
}
sc->last_rca = rca;
timing = mmcbr_get_timing(busdev);
/*
* For eMMC modes, setting/updating bus width and VCCQ
* is only really necessary if there actually is more
* than one device on the bus, as generally that already
* had to be done by mmc_calculate_clock() or one of
* its callees. Moreover, setting the bus width anew
* can trigger re-tuning (via a CRC error on the next
* CMD), even when not switching between devices and the
* previously selected one is still tuned. Obviously,
* we need to re-tune the host controller if devices
* are actually switched, though.
*/
if (timing >= bus_timing_mmc_ddr52 &&
sc->child_count == 1)
return (0);
/* Prepare bus width for the new card. */
if (bootverbose || mmc_debug) {
device_printf(busdev,
"setting bus width to %d bits %s timing\n",
(ivar->bus_width == bus_width_4) ? 4 :
(ivar->bus_width == bus_width_8) ? 8 : 1,
mmc_timing_to_string(timing));
}
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(busdev, "Card at relative "
"address %d failed to set bus width\n",
rca);
return (ENXIO);
}
mmcbr_set_bus_width(busdev, ivar->bus_width);
mmcbr_update_ios(busdev);
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(busdev, "Failed to set VCCQ "
"for card at relative address %d\n", rca);
return (ENXIO);
}
if (timing >= bus_timing_mmc_hs200 &&
mmc_retune(busdev, dev, true) != 0) {
device_printf(busdev, "Card at relative "
"address %d failed to re-tune\n", rca);
return (ENXIO);
}
}
} else {
/*
* If there's a card selected, stand down.
*/
if (sc->last_rca != 0) {
if (mmc_select_card(sc, 0) != MMC_ERR_NONE)
return (ENXIO);
sc->last_rca = 0;
}
}
return (0);
}
static int
mmc_release_bus(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
int err;
sc = device_get_softc(busdev);
MMC_LOCK(sc);
if (!sc->owner)
panic("mmc: releasing unowned bus.");
if (sc->owner != dev)
panic("mmc: you don't own the bus. game over.");
MMC_UNLOCK(sc);
err = MMCBR_RELEASE_HOST(device_get_parent(busdev), busdev);
if (err)
return (err);
MMC_LOCK(sc);
sc->owner = NULL;
MMC_UNLOCK(sc);
return (0);
}
static uint32_t
mmc_select_vdd(struct mmc_softc *sc, uint32_t ocr)
{
return (ocr & MMC_OCR_VOLTAGE);
}
static int
mmc_highest_voltage(uint32_t ocr)
{
int i;
for (i = MMC_OCR_MAX_VOLTAGE_SHIFT;
i >= MMC_OCR_MIN_VOLTAGE_SHIFT; i--)
if (ocr & (1 << i))
return (i);
return (-1);
}
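/*
* Illustration (assuming the standard OCR bit layout): a supported
* window of 2.7-3.6 V corresponds to ocr = 0x00ff8000, whose highest
* set voltage bit is bit 23, so mmc_highest_voltage() returns 23;
* mmc_power_up() below feeds that value to mmcbr_set_vdd().
*/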
static void
mmc_wakeup(struct mmc_request *req)
{
struct mmc_softc *sc;
sc = (struct mmc_softc *)req->done_data;
MMC_LOCK(sc);
req->flags |= MMC_REQ_DONE;
MMC_UNLOCK(sc);
wakeup(req);
}
static int
mmc_wait_for_req(struct mmc_softc *sc, struct mmc_request *req)
{
req->done = mmc_wakeup;
req->done_data = sc;
if (__predict_false(mmc_debug > 1)) {
device_printf(sc->dev, "REQUEST: CMD%d arg %#x flags %#x",
req->cmd->opcode, req->cmd->arg, req->cmd->flags);
if (req->cmd->data) {
printf(" data %d\n", (int)req->cmd->data->len);
} else
printf("\n");
}
MMCBR_REQUEST(device_get_parent(sc->dev), sc->dev, req);
MMC_LOCK(sc);
while ((req->flags & MMC_REQ_DONE) == 0)
msleep(req, &sc->sc_mtx, 0, "mmcreq", 0);
MMC_UNLOCK(sc);
if (__predict_false(mmc_debug > 2 || (mmc_debug > 0 &&
req->cmd->error != MMC_ERR_NONE)))
device_printf(sc->dev, "CMD%d RESULT: %d\n",
req->cmd->opcode, req->cmd->error);
return (0);
}
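/*
* MMCBR_REQUEST() above only hands the request to the bridge; the
* bridge's completion path calls mmc_wakeup(), which sets
* MMC_REQ_DONE under the softc mutex and wakes the msleep(), so the
* request appears synchronous to the caller.
*/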
static int
mmc_wait_for_request(device_t busdev, device_t dev, struct mmc_request *req)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err, i;
enum mmc_retune_req retune_req;
sc = device_get_softc(busdev);
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
/*
* Unless no device is selected or re-tuning is already ongoing,
* execute re-tuning if a) the bridge is requesting to do so and
* re-tuning hasn't been otherwise paused, or b) if a child asked
* to be re-tuned prior to pausing (see also mmc_retune_pause()).
*/
if (__predict_false(sc->last_rca != 0 && sc->retune_ongoing == 0 &&
(((retune_req = mmcbr_get_retune_req(busdev)) != retune_req_none &&
sc->retune_paused == 0) || sc->retune_needed == 1))) {
if (__predict_false(mmc_debug > 1)) {
device_printf(busdev,
"Re-tuning with%s circuit reset required\n",
retune_req == retune_req_reset ? "" : "out");
}
if (device_get_parent(dev) == busdev)
ivar = device_get_ivars(dev);
else {
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (ivar->rca == sc->last_rca)
break;
}
if (ivar->rca != sc->last_rca)
return (EINVAL);
}
sc->retune_ongoing = 1;
err = mmc_retune(busdev, dev, retune_req == retune_req_reset);
sc->retune_ongoing = 0;
switch (err) {
case MMC_ERR_NONE:
case MMC_ERR_FAILED: /* Re-tune error but still might work */
break;
case MMC_ERR_BADCRC: /* Switch failure on HS400 recovery */
return (ENXIO);
case MMC_ERR_INVALID: /* Driver implementation b0rken */
default: /* Unknown error, should not happen */
return (EINVAL);
}
sc->retune_needed = 0;
}
return (mmc_wait_for_req(sc, req));
}
static int
mmc_wait_for_command(struct mmc_softc *sc, uint32_t opcode,
uint32_t arg, uint32_t flags, uint32_t *resp, int retries)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = opcode;
cmd.arg = arg;
cmd.flags = flags;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, retries);
if (err)
return (err);
if (resp) {
if (flags & MMC_RSP_136)
memcpy(resp, cmd.resp, 4 * sizeof(uint32_t));
else
*resp = cmd.resp[0];
}
return (0);
}
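/*
* This is the convenience wrapper for simple, data-less commands;
* mmc_select_card() below, for instance, issues MMC_SELECT_CARD
* through it with arg = rca << 16 and no response copy-out.
*/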
static void
mmc_idle_cards(struct mmc_softc *sc)
{
device_t dev;
struct mmc_command cmd;
dev = sc->dev;
mmcbr_set_chip_select(dev, cs_high);
mmcbr_update_ios(dev);
mmc_ms_delay(1);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_GO_IDLE_STATE;
cmd.arg = 0;
cmd.flags = MMC_RSP_NONE | MMC_CMD_BC;
cmd.data = NULL;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
mmc_ms_delay(1);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_update_ios(dev);
mmc_ms_delay(1);
}
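/*
* mmc_idle_cards() issues CMD0 (GO_IDLE_STATE) with chip select
* driven high; for SD cards this presumably also avoids latching
* SPI mode, which is only entered when CS is held low during CMD0.
*/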
static int
mmc_send_app_op_cond(struct mmc_softc *sc, uint32_t ocr, uint32_t *rocr)
{
struct mmc_command cmd;
int err = MMC_ERR_NONE, i;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SD_SEND_OP_COND;
cmd.arg = ocr;
cmd.flags = MMC_RSP_R3 | MMC_CMD_BCR;
cmd.data = NULL;
for (i = 0; i < 1000; i++) {
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, 0, &cmd,
CMD_RETRIES);
if (err != MMC_ERR_NONE)
break;
if ((cmd.resp[0] & MMC_OCR_CARD_BUSY) ||
(ocr & MMC_OCR_VOLTAGE) == 0)
break;
err = MMC_ERR_TIMEOUT;
mmc_ms_delay(10);
}
if (rocr && err == MMC_ERR_NONE)
*rocr = cmd.resp[0];
return (err);
}
static int
mmc_send_op_cond(struct mmc_softc *sc, uint32_t ocr, uint32_t *rocr)
{
struct mmc_command cmd;
int err = MMC_ERR_NONE, i;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SEND_OP_COND;
cmd.arg = ocr;
cmd.flags = MMC_RSP_R3 | MMC_CMD_BCR;
cmd.data = NULL;
for (i = 0; i < 1000; i++) {
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
if (err != MMC_ERR_NONE)
break;
if ((cmd.resp[0] & MMC_OCR_CARD_BUSY) ||
(ocr & MMC_OCR_VOLTAGE) == 0)
break;
err = MMC_ERR_TIMEOUT;
mmc_ms_delay(10);
}
if (rocr && err == MMC_ERR_NONE)
*rocr = cmd.resp[0];
return (err);
}
static int
mmc_send_if_cond(struct mmc_softc *sc, uint8_t vhs)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SD_SEND_IF_COND;
cmd.arg = (vhs << 8) + 0xAA;
cmd.flags = MMC_RSP_R7 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static void
mmc_power_up(struct mmc_softc *sc)
{
device_t dev;
enum mmc_vccq vccq;
dev = sc->dev;
mmcbr_set_vdd(dev, mmc_highest_voltage(mmcbr_get_host_ocr(dev)));
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_set_bus_width(dev, bus_width_1);
mmcbr_set_power_mode(dev, power_up);
mmcbr_set_clock(dev, 0);
mmcbr_update_ios(dev);
for (vccq = vccq_330; ; vccq--) {
mmcbr_set_vccq(dev, vccq);
if (mmcbr_switch_vccq(dev) == 0 || vccq == vccq_120)
break;
}
mmc_ms_delay(1);
mmcbr_set_clock(dev, SD_MMC_CARD_ID_FREQUENCY);
mmcbr_set_timing(dev, bus_timing_normal);
mmcbr_set_power_mode(dev, power_on);
mmcbr_update_ios(dev);
mmc_ms_delay(2);
}
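/*
* The VCCQ loop in mmc_power_up() walks the signaling voltage down
* from 3.3 V and settles on the first level the host bridge accepts
* (presumably 3.3 V, then 1.8 V, then 1.2 V, given the enum order
* implied by the vccq_120 termination condition).
*/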
static void
mmc_power_down(struct mmc_softc *sc)
{
device_t dev = sc->dev;
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_set_bus_width(dev, bus_width_1);
mmcbr_set_power_mode(dev, power_off);
mmcbr_set_clock(dev, 0);
mmcbr_set_timing(dev, bus_timing_normal);
mmcbr_update_ios(dev);
}
static int
mmc_select_card(struct mmc_softc *sc, uint16_t rca)
{
int err, flags;
flags = (rca ? MMC_RSP_R1B : MMC_RSP_NONE) | MMC_CMD_AC;
sc->retune_paused++;
err = mmc_wait_for_command(sc, MMC_SELECT_CARD, (uint32_t)rca << 16,
flags, NULL, CMD_RETRIES);
sc->retune_paused--;
return (err);
}
static int
mmc_sd_switch(struct mmc_softc *sc, uint8_t mode, uint8_t grp, uint8_t value,
uint8_t *res)
{
int err;
struct mmc_command cmd;
struct mmc_data data;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(res, 0, 64);
cmd.opcode = SD_SWITCH_FUNC;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = mode << 31; /* 0 - check, 1 - set */
cmd.arg |= 0x00FFFFFF;
cmd.arg &= ~(0xF << (grp * 4));
cmd.arg |= value << (grp * 4);
cmd.data = &data;
data.data = res;
data.len = 64;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
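/*
* The CMD6 argument built above leaves every function group at 0xf
* ("no change") except the requested one. Assuming group 1 is
* encoded as grp 0 and the high-speed function as value 1 (as the
* callers below imply), a check (mode 0) for high speed would use
* arg 0x00fffff1 and the corresponding set (mode 1) 0x80fffff1.
*/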
static int
mmc_set_card_bus_width(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
struct mmc_command cmd;
int err;
uint8_t value;
if (mmcbr_get_mode(sc->dev) == mode_sd) {
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SET_CLR_CARD_DETECT;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
cmd.arg = SD_CLR_CARD_DETECT;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, ivar->rca, &cmd,
CMD_RETRIES);
if (err != 0)
return (err);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SET_BUS_WIDTH;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
switch (ivar->bus_width) {
case bus_width_1:
cmd.arg = SD_BUS_WIDTH_1;
break;
case bus_width_4:
cmd.arg = SD_BUS_WIDTH_4;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, ivar->rca, &cmd,
CMD_RETRIES);
} else {
switch (ivar->bus_width) {
case bus_width_1:
if (timing == bus_timing_mmc_hs400 ||
timing == bus_timing_mmc_hs400es)
return (MMC_ERR_INVALID);
value = EXT_CSD_BUS_WIDTH_1;
break;
case bus_width_4:
switch (timing) {
case bus_timing_mmc_ddr52:
value = EXT_CSD_BUS_WIDTH_4_DDR;
break;
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
return (MMC_ERR_INVALID);
default:
value = EXT_CSD_BUS_WIDTH_4;
break;
}
break;
case bus_width_8:
value = 0;
switch (timing) {
case bus_timing_mmc_hs400es:
value = EXT_CSD_BUS_WIDTH_ES;
/* FALLTHROUGH */
case bus_timing_mmc_ddr52:
case bus_timing_mmc_hs400:
value |= EXT_CSD_BUS_WIDTH_8_DDR;
break;
default:
value = EXT_CSD_BUS_WIDTH_8;
break;
}
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL, EXT_CSD_BUS_WIDTH, value,
ivar->cmd6_time, true);
}
return (err);
}
static int
mmc_set_power_class(struct mmc_softc *sc, struct mmc_ivars *ivar)
{
device_t dev;
const uint8_t *ext_csd;
uint32_t clock;
uint8_t value;
dev = sc->dev;
if (mmcbr_get_mode(dev) != mode_mmc || ivar->csd.spec_vers < 4)
return (MMC_ERR_NONE);
value = 0;
ext_csd = ivar->raw_ext_csd;
clock = mmcbr_get_clock(dev);
switch (1 << mmcbr_get_vdd(dev)) {
case MMC_OCR_LOW_VOLTAGE:
if (clock <= MMC_TYPE_HS_26_MAX)
value = ext_csd[EXT_CSD_PWR_CL_26_195];
else if (clock <= MMC_TYPE_HS_52_MAX) {
if (mmcbr_get_timing(dev) >= bus_timing_mmc_ddr52 &&
ivar->bus_width >= bus_width_4)
value = ext_csd[EXT_CSD_PWR_CL_52_195_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_52_195];
} else if (clock <= MMC_TYPE_HS200_HS400ES_MAX)
value = ext_csd[EXT_CSD_PWR_CL_200_195];
break;
case MMC_OCR_270_280:
case MMC_OCR_280_290:
case MMC_OCR_290_300:
case MMC_OCR_300_310:
case MMC_OCR_310_320:
case MMC_OCR_320_330:
case MMC_OCR_330_340:
case MMC_OCR_340_350:
case MMC_OCR_350_360:
if (clock <= MMC_TYPE_HS_26_MAX)
value = ext_csd[EXT_CSD_PWR_CL_26_360];
else if (clock <= MMC_TYPE_HS_52_MAX) {
if (mmcbr_get_timing(dev) == bus_timing_mmc_ddr52 &&
ivar->bus_width >= bus_width_4)
value = ext_csd[EXT_CSD_PWR_CL_52_360_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_52_360];
} else if (clock <= MMC_TYPE_HS200_HS400ES_MAX) {
if (ivar->bus_width == bus_width_8)
value = ext_csd[EXT_CSD_PWR_CL_200_360_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_200_360];
}
break;
default:
device_printf(dev, "No power class support for VDD 0x%x\n",
1 << mmcbr_get_vdd(dev));
return (MMC_ERR_INVALID);
}
if (ivar->bus_width == bus_width_8)
value = (value & EXT_CSD_POWER_CLASS_8BIT_MASK) >>
EXT_CSD_POWER_CLASS_8BIT_SHIFT;
else
value = (value & EXT_CSD_POWER_CLASS_4BIT_MASK) >>
EXT_CSD_POWER_CLASS_4BIT_SHIFT;
if (value == 0)
return (MMC_ERR_NONE);
return (mmc_switch(dev, dev, ivar->rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_POWER_CLASS, value, ivar->cmd6_time, true));
}
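/*
* The PWR_CL_* bytes read above pack two power classes into one
* register. Assuming the usual JEDEC layout (low nibble for buses up
* to 4 bits, high nibble for 8-bit buses, matching the masks and
* shifts used here), a value of 0x24 would yield power class 2 for an
* 8-bit bus and power class 4 otherwise; a resulting class of 0 means
* the default and no switch is issued.
*/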
static int
mmc_set_timing(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
u_char switch_res[64];
uint8_t value;
int err;
if (mmcbr_get_mode(sc->dev) == mode_sd) {
switch (timing) {
case bus_timing_normal:
value = SD_SWITCH_NORMAL_MODE;
break;
case bus_timing_hs:
value = SD_SWITCH_HS_MODE;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_sd_switch(sc, SD_SWITCH_MODE_SET, SD_SWITCH_GROUP1,
value, switch_res);
if (err != MMC_ERR_NONE)
return (err);
if ((switch_res[16] & 0xf) != value)
return (MMC_ERR_FAILED);
mmcbr_set_timing(sc->dev, timing);
mmcbr_update_ios(sc->dev);
} else {
switch (timing) {
case bus_timing_normal:
value = EXT_CSD_HS_TIMING_BC;
break;
case bus_timing_hs:
case bus_timing_mmc_ddr52:
value = EXT_CSD_HS_TIMING_HS;
break;
case bus_timing_mmc_hs200:
value = EXT_CSD_HS_TIMING_HS200;
break;
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
value = EXT_CSD_HS_TIMING_HS400;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL, EXT_CSD_HS_TIMING, value,
ivar->cmd6_time, false);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_timing(sc->dev, timing);
mmcbr_update_ios(sc->dev);
err = mmc_switch_status(sc->dev, sc->dev, ivar->rca,
ivar->cmd6_time);
}
return (err);
}
static int
mmc_set_vccq(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
if (isset(&ivar->vccq_120, timing))
mmcbr_set_vccq(sc->dev, vccq_120);
else if (isset(&ivar->vccq_180, timing))
mmcbr_set_vccq(sc->dev, vccq_180);
else
mmcbr_set_vccq(sc->dev, vccq_330);
if (mmcbr_switch_vccq(sc->dev) != 0)
return (MMC_ERR_INVALID);
else
return (MMC_ERR_NONE);
}
static const uint8_t p8[8] = {
0x55, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
static const uint8_t p8ok[8] = {
0xAA, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
static const uint8_t p4[4] = {
0x5A, 0x00, 0x00, 0x00
};
static const uint8_t p4ok[4] = {
0xA5, 0x00, 0x00, 0x00
};
static int
mmc_test_bus_width(struct mmc_softc *sc)
{
struct mmc_command cmd;
struct mmc_data data;
uint8_t buf[8];
int err;
if (mmcbr_get_caps(sc->dev) & MMC_CAP_8_BIT_DATA) {
mmcbr_set_bus_width(sc->dev, bus_width_8);
mmcbr_update_ios(sc->dev);
sc->squelched++; /* Errors are expected, squelch reporting. */
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_W;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = __DECONST(void *, p8);
data.len = 8;
data.flags = MMC_DATA_WRITE;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_R;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = buf;
data.len = 8;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
sc->squelched--;
mmcbr_set_bus_width(sc->dev, bus_width_1);
mmcbr_update_ios(sc->dev);
if (err == MMC_ERR_NONE && memcmp(buf, p8ok, 8) == 0)
return (bus_width_8);
}
if (mmcbr_get_caps(sc->dev) & MMC_CAP_4_BIT_DATA) {
mmcbr_set_bus_width(sc->dev, bus_width_4);
mmcbr_update_ios(sc->dev);
sc->squelched++; /* Errors are expected, squelch reporting. */
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_W;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = __DECONST(void *, p4);
data.len = 4;
data.flags = MMC_DATA_WRITE;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_R;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = buf;
data.len = 4;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
sc->squelched--;
mmcbr_set_bus_width(sc->dev, bus_width_1);
mmcbr_update_ios(sc->dev);
if (err == MMC_ERR_NONE && memcmp(buf, p4ok, 4) == 0)
return (bus_width_4);
}
return (bus_width_1);
}
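/*
* With MMC_BUSTEST_W/MMC_BUSTEST_R the card is expected to return the
* complement of the written pattern on the exercised data lines,
* which is why the expected patterns above (p8ok/p4ok) are the
* bitwise inverse of the transmitted ones (0x55/0xaa and 0x5a/0xa5).
*/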
static uint32_t
mmc_get_bits(uint32_t *bits, int bit_len, int start, int size)
{
const int i = (bit_len / 32) - (start / 32) - 1;
const int shift = start & 31;
uint32_t retval = bits[i] >> shift;
if (size + shift > 32)
retval |= bits[i - 1] << (32 - shift);
return (retval & ((1llu << size) - 1));
}
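/*
* Worked example: with the usual response-word ordering (most
* significant word first), mmc_get_bits(raw_cid, 128, 120, 8) yields
* i = 0 and shift = 24, i.e. (raw_cid[0] >> 24) & 0xff, which is the
* CID manufacturer ID (MID) field in bits [127:120] as used by the
* CID decoders below.
*/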
static void
mmc_decode_cid_sd(uint32_t *raw_cid, struct mmc_cid *cid)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 16);
for (i = 0; i < 5; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[5] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 56, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 24, 32);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 12, 8) + 2000;
cid->mdt_month = mmc_get_bits(raw_cid, 128, 8, 4);
}
static void
mmc_decode_cid_mmc(uint32_t *raw_cid, struct mmc_cid *cid, bool is_4_41p)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 8);
for (i = 0; i < 6; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[6] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 48, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 16, 32);
cid->mdt_month = mmc_get_bits(raw_cid, 128, 12, 4);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 8, 4);
if (is_4_41p)
cid->mdt_year += 2013;
else
cid->mdt_year += 1997;
}
static void
mmc_format_card_id_string(struct mmc_ivars *ivar)
{
char oidstr[8];
uint8_t c1;
uint8_t c2;
/*
* Format a card ID string for use by the mmcsd driver, it's what
* appears between the <> in the following:
* mmcsd0: 968MB <SD SD01G 8.0 SN 2686905 MFG 08/2008 by 3 TN> at mmc0
* 22.5MHz/4bit/128-block
*
* Also format just the card serial number, which the mmcsd driver will
* use as the disk->d_ident string.
*
* The card_id_string in mmc_ivars is currently allocated as 64 bytes,
* and our max formatted length is currently 55 bytes if every field
* contains the largest value.
*
* Sometimes the oid is two printable ascii chars; when it's not,
* format it as 0xnnnn instead.
*/
c1 = (ivar->cid.oid >> 8) & 0x0ff;
c2 = ivar->cid.oid & 0x0ff;
if (c1 > 0x1f && c1 < 0x7f && c2 > 0x1f && c2 < 0x7f)
snprintf(oidstr, sizeof(oidstr), "%c%c", c1, c2);
else
snprintf(oidstr, sizeof(oidstr), "0x%04x", ivar->cid.oid);
snprintf(ivar->card_sn_string, sizeof(ivar->card_sn_string),
"%08X", ivar->cid.psn);
snprintf(ivar->card_id_string, sizeof(ivar->card_id_string),
"%s%s %s %d.%d SN %08X MFG %02d/%04d by %d %s",
ivar->mode == mode_sd ? "SD" : "MMC", ivar->high_cap ? "HC" : "",
ivar->cid.pnm, ivar->cid.prv >> 4, ivar->cid.prv & 0x0f,
ivar->cid.psn, ivar->cid.mdt_month, ivar->cid.mdt_year,
ivar->cid.mid, oidstr);
}
static const int exp[8] = {
1, 10, 100, 1000, 10000, 100000, 1000000, 10000000
};
static const int mant[16] = {
0, 10, 12, 13, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80
};
static const int cur_min[8] = {
500, 1000, 5000, 10000, 25000, 35000, 60000, 100000
};
static const int cur_max[8] = {
1000, 5000, 10000, 25000, 35000, 45000, 800000, 200000
};
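/*
* The tables above implement the mantissa/exponent encodings of the
* CSD TAAC, TRAN_SPEED and VDD_*_CURR_* fields (mantissa values are
* stored scaled by 10). For example, the common SD TRAN_SPEED byte
* 0x32 decodes as e = 2 and m = 6, so tran_speed becomes
* exp[2] * 10000 * mant[6] = 100 * 10000 * 25, i.e. 25 MHz.
*/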
static int
mmc_decode_csd_sd(uint32_t *raw_csd, struct mmc_csd *csd)
{
int v;
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = v = mmc_get_bits(raw_csd, 128, 126, 2);
if (v == 0) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min =
cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max =
cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min =
cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max =
cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
return (MMC_ERR_NONE);
} else if (v == 1) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->capacity = ((uint64_t)mmc_get_bits(raw_csd, 128, 48, 22) +
1) * 512 * 1024;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
return (MMC_ERR_NONE);
}
return (MMC_ERR_INVALID);
}
static void
mmc_decode_csd_mmc(uint32_t *raw_csd, struct mmc_csd *csd)
{
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = mmc_get_bits(raw_csd, 128, 126, 2);
csd->spec_vers = mmc_get_bits(raw_csd, 128, 122, 4);
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = 0;
csd->erase_sector = (mmc_get_bits(raw_csd, 128, 42, 5) + 1) *
(mmc_get_bits(raw_csd, 128, 37, 5) + 1);
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 5);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
}
static void
mmc_app_decode_scr(uint32_t *raw_scr, struct mmc_scr *scr)
{
unsigned int scr_struct;
memset(scr, 0, sizeof(*scr));
scr_struct = mmc_get_bits(raw_scr, 64, 60, 4);
if (scr_struct != 0) {
printf("Unrecognised SCR structure version %d\n",
scr_struct);
return;
}
scr->sda_vsn = mmc_get_bits(raw_scr, 64, 56, 4);
scr->bus_widths = mmc_get_bits(raw_scr, 64, 48, 4);
}
static void
mmc_app_decode_sd_status(uint32_t *raw_sd_status,
struct mmc_sd_status *sd_status)
{
memset(sd_status, 0, sizeof(*sd_status));
sd_status->bus_width = mmc_get_bits(raw_sd_status, 512, 510, 2);
sd_status->secured_mode = mmc_get_bits(raw_sd_status, 512, 509, 1);
sd_status->card_type = mmc_get_bits(raw_sd_status, 512, 480, 16);
sd_status->prot_area = mmc_get_bits(raw_sd_status, 512, 448, 12);
sd_status->speed_class = mmc_get_bits(raw_sd_status, 512, 440, 8);
sd_status->perf_move = mmc_get_bits(raw_sd_status, 512, 432, 8);
sd_status->au_size = mmc_get_bits(raw_sd_status, 512, 428, 4);
sd_status->erase_size = mmc_get_bits(raw_sd_status, 512, 408, 16);
sd_status->erase_timeout = mmc_get_bits(raw_sd_status, 512, 402, 6);
sd_status->erase_offset = mmc_get_bits(raw_sd_status, 512, 400, 2);
}
static int
mmc_all_send_cid(struct mmc_softc *sc, uint32_t *rawcid)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_ALL_SEND_CID;
cmd.arg = 0;
cmd.flags = MMC_RSP_R2 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
memcpy(rawcid, cmd.resp, 4 * sizeof(uint32_t));
return (err);
}
static int
mmc_send_csd(struct mmc_softc *sc, uint16_t rca, uint32_t *rawcsd)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SEND_CSD;
cmd.arg = rca << 16;
cmd.flags = MMC_RSP_R2 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
memcpy(rawcsd, cmd.resp, 4 * sizeof(uint32_t));
return (err);
}
static int
mmc_app_send_scr(struct mmc_softc *sc, uint16_t rca, uint32_t *rawscr)
{
int err;
struct mmc_command cmd;
struct mmc_data data;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(rawscr, 0, 8);
cmd.opcode = ACMD_SEND_SCR;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = 0;
cmd.data = &data;
data.data = rawscr;
data.len = 8;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, rca, &cmd, CMD_RETRIES);
rawscr[0] = be32toh(rawscr[0]);
rawscr[1] = be32toh(rawscr[1]);
return (err);
}
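/*
* The 64-bit SCR arrives most-significant byte first, so both words
* are converted with be32toh() above before mmc_app_decode_scr()
* picks its fields out of them with mmc_get_bits().
*/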
static int
mmc_app_sd_status(struct mmc_softc *sc, uint16_t rca, uint32_t *rawsdstatus)
{
struct mmc_command cmd;
struct mmc_data data;
int err, i;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(rawsdstatus, 0, 64);
cmd.opcode = ACMD_SD_STATUS;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = 0;
cmd.data = &data;
data.data = rawsdstatus;
data.len = 64;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, rca, &cmd, CMD_RETRIES);
for (i = 0; i < 16; i++)
rawsdstatus[i] = be32toh(rawsdstatus[i]);
return (err);
}
static int
mmc_set_relative_addr(struct mmc_softc *sc, uint16_t resp)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SET_RELATIVE_ADDR;
cmd.arg = resp << 16;
cmd.flags = MMC_RSP_R6 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static int
mmc_send_relative_addr(struct mmc_softc *sc, uint32_t *resp)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SD_SEND_RELATIVE_ADDR;
cmd.arg = 0;
cmd.flags = MMC_RSP_R6 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
*resp = cmd.resp[0];
return (err);
}
static int
mmc_set_blocklen(struct mmc_softc *sc, uint32_t len)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SET_BLOCKLEN;
cmd.arg = len;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static uint32_t
mmc_timing_to_dtr(struct mmc_ivars *ivar, enum mmc_bus_timing timing)
{
switch (timing) {
case bus_timing_normal:
return (ivar->tran_speed);
case bus_timing_hs:
return (ivar->hs_tran_speed);
case bus_timing_uhs_sdr12:
return (SD_SDR12_MAX);
case bus_timing_uhs_sdr25:
return (SD_SDR25_MAX);
case bus_timing_uhs_ddr50:
return (SD_DDR50_MAX);
case bus_timing_uhs_sdr50:
return (SD_SDR50_MAX);
case bus_timing_uhs_sdr104:
return (SD_SDR104_MAX);
case bus_timing_mmc_ddr52:
return (MMC_TYPE_DDR52_MAX);
case bus_timing_mmc_hs200:
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
return (MMC_TYPE_HS200_HS400ES_MAX);
}
return (0);
}
static const char *
mmc_timing_to_string(enum mmc_bus_timing timing)
{
switch (timing) {
case bus_timing_normal:
return ("normal speed");
case bus_timing_hs:
return ("high speed");
case bus_timing_uhs_sdr12:
case bus_timing_uhs_sdr25:
case bus_timing_uhs_sdr50:
case bus_timing_uhs_sdr104:
return ("single data rate");
case bus_timing_uhs_ddr50:
case bus_timing_mmc_ddr52:
return ("dual data rate");
case bus_timing_mmc_hs200:
return ("HS200");
case bus_timing_mmc_hs400:
return ("HS400");
case bus_timing_mmc_hs400es:
return ("HS400 with enhanced strobe");
}
return ("");
}
static bool
mmc_host_timing(device_t dev, enum mmc_bus_timing timing)
{
int host_caps;
host_caps = mmcbr_get_caps(dev);
#define HOST_TIMING_CAP(host_caps, cap) ({ \
bool retval; \
if (((host_caps) & (cap)) == (cap)) \
retval = true; \
else \
retval = false; \
retval; \
})
switch (timing) {
case bus_timing_normal:
return (true);
case bus_timing_hs:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_HSPEED));
case bus_timing_uhs_sdr12:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR12));
case bus_timing_uhs_sdr25:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR25));
case bus_timing_uhs_ddr50:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_DDR50));
case bus_timing_uhs_sdr50:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR50));
case bus_timing_uhs_sdr104:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR104));
case bus_timing_mmc_ddr52:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_DDR52));
case bus_timing_mmc_hs200:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS200));
case bus_timing_mmc_hs400:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS400));
case bus_timing_mmc_hs400es:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS400 |
MMC_CAP_MMC_ENH_STROBE));
}
#undef HOST_TIMING_CAP
return (false);
}
static void
mmc_log_card(device_t dev, struct mmc_ivars *ivar, int newcard)
{
- enum mmc_bus_timing max_timing, timing;
+ enum mmc_bus_timing timing;
device_printf(dev, "Card at relative address 0x%04x%s:\n",
ivar->rca, newcard ? " added" : "");
device_printf(dev, " card: %s\n", ivar->card_id_string);
- max_timing = bus_timing_normal;
for (timing = bus_timing_max; timing > bus_timing_normal; timing--) {
- if (isset(&ivar->timings, timing)) {
- max_timing = timing;
+ if (isset(&ivar->timings, timing))
break;
- }
}
device_printf(dev, " quirks: %b\n", ivar->quirks, MMC_QUIRKS_FMT);
device_printf(dev, " bus: %ubit, %uMHz (%s timing)\n",
(ivar->bus_width == bus_width_1 ? 1 :
(ivar->bus_width == bus_width_4 ? 4 : 8)),
mmc_timing_to_dtr(ivar, timing) / 1000000,
mmc_timing_to_string(timing));
device_printf(dev, " memory: %u blocks, erase sector %u blocks%s\n",
ivar->sec_count, ivar->erase_sector,
ivar->read_only ? ", read-only" : "");
}
static void
mmc_discover_cards(struct mmc_softc *sc)
{
u_char switch_res[64];
uint32_t raw_cid[4];
struct mmc_ivars *ivar = NULL;
const struct mmc_quirk *quirk;
device_t child;
int err, host_caps, i, newcard;
uint32_t resp, sec_count, status;
uint16_t rca = 2;
host_caps = mmcbr_get_caps(sc->dev);
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Probing cards\n");
while (1) {
child = NULL;
sc->squelched++; /* Errors are expected, squelch reporting. */
err = mmc_all_send_cid(sc, raw_cid);
sc->squelched--;
if (err == MMC_ERR_TIMEOUT)
break;
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error reading CID %d\n", err);
break;
}
newcard = 1;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (memcmp(ivar->raw_cid, raw_cid, sizeof(raw_cid)) ==
0) {
newcard = 0;
break;
}
}
if (bootverbose || mmc_debug) {
device_printf(sc->dev,
"%sard detected (CID %08x%08x%08x%08x)\n",
newcard ? "New c" : "C",
raw_cid[0], raw_cid[1], raw_cid[2], raw_cid[3]);
}
if (newcard) {
ivar = malloc(sizeof(struct mmc_ivars), M_DEVBUF,
M_WAITOK | M_ZERO);
memcpy(ivar->raw_cid, raw_cid, sizeof(raw_cid));
}
if (mmcbr_get_ro(sc->dev))
ivar->read_only = 1;
ivar->bus_width = bus_width_1;
setbit(&ivar->timings, bus_timing_normal);
ivar->mode = mmcbr_get_mode(sc->dev);
if (ivar->mode == mode_sd) {
mmc_decode_cid_sd(ivar->raw_cid, &ivar->cid);
err = mmc_send_relative_addr(sc, &resp);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error getting RCA %d\n", err);
goto free_ivar;
}
ivar->rca = resp >> 16;
/* Get card CSD. */
err = mmc_send_csd(sc, ivar->rca, ivar->raw_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error getting CSD %d\n", err);
goto free_ivar;
}
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"%sard detected (CSD %08x%08x%08x%08x)\n",
newcard ? "New c" : "C", ivar->raw_csd[0],
ivar->raw_csd[1], ivar->raw_csd[2],
ivar->raw_csd[3]);
err = mmc_decode_csd_sd(ivar->raw_csd, &ivar->csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error decoding CSD\n");
goto free_ivar;
}
ivar->sec_count = ivar->csd.capacity / MMC_SECTOR_SIZE;
if (ivar->csd.csd_structure > 0)
ivar->high_cap = 1;
ivar->tran_speed = ivar->csd.tran_speed;
ivar->erase_sector = ivar->csd.erase_sector *
ivar->csd.write_bl_len / MMC_SECTOR_SIZE;
err = mmc_send_status(sc->dev, sc->dev, ivar->rca,
&status);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading card status %d\n", err);
goto free_ivar;
}
if ((status & R1_CARD_IS_LOCKED) != 0) {
device_printf(sc->dev,
"Card is password protected, skipping\n");
goto free_ivar;
}
/* Get card SCR. Card must be selected to fetch it. */
err = mmc_select_card(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error selecting card %d\n", err);
goto free_ivar;
}
err = mmc_app_send_scr(sc, ivar->rca, ivar->raw_scr);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading SCR %d\n", err);
goto free_ivar;
}
mmc_app_decode_scr(ivar->raw_scr, &ivar->scr);
/* Get card switch capabilities (command class 10). */
if ((ivar->scr.sda_vsn >= 1) &&
(ivar->csd.ccc & (1 << 10))) {
err = mmc_sd_switch(sc, SD_SWITCH_MODE_CHECK,
SD_SWITCH_GROUP1, SD_SWITCH_NOCHANGE,
switch_res);
if (err == MMC_ERR_NONE &&
switch_res[13] & (1 << SD_SWITCH_HS_MODE)) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = SD_HS_MAX;
}
}
/*
* We deselect then reselect the card here. Some cards
* become unselected and timeout with the above two
* commands, although the state tables / diagrams in the
* standard suggest they go back to the transfer state.
* Other cards don't become deselected, and if we
* attempt to blindly re-select them, we get timeout
* errors from some controllers. So we deselect then
* reselect to handle all situations. The only thing we
* use from the sd_status is the erase sector size, but
* it is still nice to get that right.
*/
(void)mmc_select_card(sc, 0);
(void)mmc_select_card(sc, ivar->rca);
(void)mmc_app_sd_status(sc, ivar->rca,
ivar->raw_sd_status);
mmc_app_decode_sd_status(ivar->raw_sd_status,
&ivar->sd_status);
if (ivar->sd_status.au_size != 0) {
ivar->erase_sector =
16 << ivar->sd_status.au_size;
}
/* Find maximum supported bus width. */
if ((host_caps & MMC_CAP_4_BIT_DATA) &&
(ivar->scr.bus_widths & SD_SCR_BUS_WIDTH_4))
ivar->bus_width = bus_width_4;
goto child_common;
}
ivar->rca = rca++;
err = mmc_set_relative_addr(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error setting RCA %d\n", err);
goto free_ivar;
}
/* Get card CSD. */
err = mmc_send_csd(sc, ivar->rca, ivar->raw_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error getting CSD %d\n", err);
goto free_ivar;
}
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"%sard detected (CSD %08x%08x%08x%08x)\n",
newcard ? "New c" : "C", ivar->raw_csd[0],
ivar->raw_csd[1], ivar->raw_csd[2],
ivar->raw_csd[3]);
mmc_decode_csd_mmc(ivar->raw_csd, &ivar->csd);
ivar->sec_count = ivar->csd.capacity / MMC_SECTOR_SIZE;
ivar->tran_speed = ivar->csd.tran_speed;
ivar->erase_sector = ivar->csd.erase_sector *
ivar->csd.write_bl_len / MMC_SECTOR_SIZE;
err = mmc_send_status(sc->dev, sc->dev, ivar->rca, &status);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading card status %d\n", err);
goto free_ivar;
}
if ((status & R1_CARD_IS_LOCKED) != 0) {
device_printf(sc->dev,
"Card is password protected, skipping\n");
goto free_ivar;
}
err = mmc_select_card(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error selecting card %d\n",
err);
goto free_ivar;
}
/* Only MMC >= 4.x devices support EXT_CSD. */
if (ivar->csd.spec_vers >= 4) {
err = mmc_send_ext_csd(sc->dev, sc->dev,
ivar->raw_ext_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading EXT_CSD %d\n", err);
goto free_ivar;
}
/* Handle extended capacity from EXT_CSD */
sec_count = ivar->raw_ext_csd[EXT_CSD_SEC_CNT] +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 1] << 8) +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 2] << 16) +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 3] << 24);
if (sec_count != 0) {
ivar->sec_count = sec_count;
ivar->high_cap = 1;
}
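/*
* EXT_CSD_SEC_CNT is assembled from four little-endian bytes above;
* e.g. a device reporting 8,388,608 sectors exposes
* 8,388,608 * 512 bytes = 4 GiB of user capacity and is marked
* high-capacity here.
*/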
/* Find maximum supported bus width. */
ivar->bus_width = mmc_test_bus_width(sc);
/* Get device speeds beyond normal mode. */
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS_52) != 0) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = MMC_TYPE_HS_52_MAX;
} else if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS_26) != 0) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = MMC_TYPE_HS_26_MAX;
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_DDR_52_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&ivar->timings, bus_timing_mmc_ddr52);
setbit(&ivar->vccq_120, bus_timing_mmc_ddr52);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_DDR_52_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&ivar->timings, bus_timing_mmc_ddr52);
setbit(&ivar->vccq_180, bus_timing_mmc_ddr52);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS200_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&ivar->timings, bus_timing_mmc_hs200);
setbit(&ivar->vccq_120, bus_timing_mmc_hs200);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS200_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&ivar->timings, bus_timing_mmc_hs200);
setbit(&ivar->vccq_180, bus_timing_mmc_hs200);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400);
setbit(&ivar->vccq_120, bus_timing_mmc_hs400);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400);
setbit(&ivar->vccq_180, bus_timing_mmc_hs400);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_2V) != 0 &&
(ivar->raw_ext_csd[EXT_CSD_STROBE_SUPPORT] &
EXT_CSD_STROBE_SUPPORT_EN) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400es);
setbit(&ivar->vccq_120, bus_timing_mmc_hs400es);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_8V) != 0 &&
(ivar->raw_ext_csd[EXT_CSD_STROBE_SUPPORT] &
EXT_CSD_STROBE_SUPPORT_EN) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400es);
setbit(&ivar->vccq_180, bus_timing_mmc_hs400es);
}
/*
* Determine generic switch timeout (provided in
* units of 10 ms), defaulting to 500 ms.
*/
ivar->cmd6_time = 500 * 1000;
if (ivar->csd.spec_vers >= 6)
ivar->cmd6_time = 10 *
ivar->raw_ext_csd[EXT_CSD_GEN_CMD6_TIME];
/* Handle HC erase sector size. */
if (ivar->raw_ext_csd[EXT_CSD_ERASE_GRP_SIZE] != 0) {
ivar->erase_sector = 1024 *
ivar->raw_ext_csd[EXT_CSD_ERASE_GRP_SIZE];
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_ERASE_GRP_DEF,
EXT_CSD_ERASE_GRP_DEF_EN,
ivar->cmd6_time, true);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error setting erase group %d\n",
err);
goto free_ivar;
}
}
}
mmc_decode_cid_mmc(ivar->raw_cid, &ivar->cid,
ivar->raw_ext_csd[EXT_CSD_REV] >= 5);
child_common:
for (quirk = &mmc_quirks[0]; quirk->mid != 0x0; quirk++) {
if ((quirk->mid == MMC_QUIRK_MID_ANY ||
quirk->mid == ivar->cid.mid) &&
(quirk->oid == MMC_QUIRK_OID_ANY ||
quirk->oid == ivar->cid.oid) &&
strncmp(quirk->pnm, ivar->cid.pnm,
sizeof(ivar->cid.pnm)) == 0) {
ivar->quirks = quirk->quirks;
break;
}
}
/*
* Some cards that report maximum I/O block sizes greater
* than 512 require the block length to be set to 512, even
* though that is supposed to be the default. Example:
*
* Transcend 2GB SDSC card, CID:
* mid=0x1b oid=0x534d pnm="00000" prv=1.0 mdt=00.2000
*/
if (ivar->csd.read_bl_len != MMC_SECTOR_SIZE ||
ivar->csd.write_bl_len != MMC_SECTOR_SIZE)
mmc_set_blocklen(sc, MMC_SECTOR_SIZE);
mmc_format_card_id_string(ivar);
if (bootverbose || mmc_debug)
mmc_log_card(sc->dev, ivar, newcard);
if (newcard) {
/* Add device. */
child = device_add_child(sc->dev, NULL, -1);
if (child != NULL) {
device_set_ivars(child, ivar);
sc->child_list = realloc(sc->child_list,
sizeof(device_t) * (sc->child_count + 1),
M_DEVBUF, M_WAITOK);
sc->child_list[sc->child_count++] = child;
} else
device_printf(sc->dev, "Error adding child\n");
}
free_ivar:
if (newcard && child == NULL)
free(ivar, M_DEVBUF);
(void)mmc_select_card(sc, 0);
/*
* Not returning here when one MMC device could not be added
* would potentially mean looping forever when that device
* is broken (in which case it may also impact the remainder
* of the bus anyway, though).
*/
if ((newcard && child == NULL) ||
mmcbr_get_mode(sc->dev) == mode_sd)
return;
}
}
static void
mmc_update_child_list(struct mmc_softc *sc)
{
device_t child;
int i, j;
if (sc->child_count == 0) {
free(sc->child_list, M_DEVBUF);
return;
}
for (i = j = 0; i < sc->child_count; i++) {
for (;;) {
child = sc->child_list[j++];
if (child != NULL)
break;
}
if (i != j)
sc->child_list[i] = child;
}
sc->child_list = realloc(sc->child_list, sizeof(device_t) *
sc->child_count, M_DEVBUF, M_WAITOK);
}
static void
mmc_rescan_cards(struct mmc_softc *sc)
{
struct mmc_ivars *ivar;
int err, i, j;
for (i = j = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (mmc_select_card(sc, ivar->rca) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"Card at relative address %d lost\n",
ivar->rca);
err = device_delete_child(sc->dev, sc->child_list[i]);
if (err != 0) {
j++;
continue;
}
free(ivar, M_DEVBUF);
} else
j++;
}
if (sc->child_count == j)
goto out;
sc->child_count = j;
mmc_update_child_list(sc);
out:
(void)mmc_select_card(sc, 0);
}
static int
mmc_delete_cards(struct mmc_softc *sc, bool final)
{
struct mmc_ivars *ivar;
int err, i, j;
err = 0;
for (i = j = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"Card at relative address %d deleted\n",
ivar->rca);
err = device_delete_child(sc->dev, sc->child_list[i]);
if (err != 0) {
j++;
if (final == false)
continue;
else
break;
}
free(ivar, M_DEVBUF);
}
sc->child_count = j;
mmc_update_child_list(sc);
return (err);
}
static void
mmc_go_discovery(struct mmc_softc *sc)
{
uint32_t ocr;
device_t dev;
int err;
dev = sc->dev;
if (mmcbr_get_power_mode(dev) != power_on) {
/*
* First, try SD modes
*/
sc->squelched++; /* Errors are expected, squelch reporting. */
mmcbr_set_mode(dev, mode_sd);
mmc_power_up(sc);
mmcbr_set_bus_mode(dev, pushpull);
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Probing bus\n");
mmc_idle_cards(sc);
err = mmc_send_if_cond(sc, 1);
if ((bootverbose || mmc_debug) && err == 0)
device_printf(sc->dev,
"SD 2.0 interface conditions: OK\n");
if (mmc_send_app_op_cond(sc, 0, &ocr) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev, "SD probe: failed\n");
/*
* Failed, try MMC
*/
mmcbr_set_mode(dev, mode_mmc);
if (mmc_send_op_cond(sc, 0, &ocr) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"MMC probe: failed\n");
ocr = 0; /* Failed both, powerdown. */
} else if (bootverbose || mmc_debug)
device_printf(sc->dev,
"MMC probe: OK (OCR: 0x%08x)\n", ocr);
} else if (bootverbose || mmc_debug)
device_printf(sc->dev, "SD probe: OK (OCR: 0x%08x)\n",
ocr);
sc->squelched--;
mmcbr_set_ocr(dev, mmc_select_vdd(sc, ocr));
if (mmcbr_get_ocr(dev) != 0)
mmc_idle_cards(sc);
} else {
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_clock(dev, SD_MMC_CARD_ID_FREQUENCY);
mmcbr_update_ios(dev);
/* XXX recompute vdd based on new cards? */
}
/*
* Make sure that we have a mutually agreeable voltage to at least
* one card on the bus.
*/
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Current OCR: 0x%08x\n",
mmcbr_get_ocr(dev));
if (mmcbr_get_ocr(dev) == 0) {
device_printf(sc->dev, "No compatible cards found on bus\n");
(void)mmc_delete_cards(sc, false);
mmc_power_down(sc);
return;
}
/*
* Reselect the cards after we've idled them above.
*/
if (mmcbr_get_mode(dev) == mode_sd) {
err = mmc_send_if_cond(sc, 1);
mmc_send_app_op_cond(sc,
(err ? 0 : MMC_OCR_CCS) | mmcbr_get_ocr(dev), NULL);
} else
mmc_send_op_cond(sc, MMC_OCR_CCS | mmcbr_get_ocr(dev), NULL);
mmc_discover_cards(sc);
mmc_rescan_cards(sc);
mmcbr_set_bus_mode(dev, pushpull);
mmcbr_update_ios(dev);
mmc_calculate_clock(sc);
}
static int
mmc_calculate_clock(struct mmc_softc *sc)
{
device_t dev;
struct mmc_ivars *ivar;
int i;
uint32_t dtr, max_dtr;
uint16_t rca;
enum mmc_bus_timing max_timing, timing;
bool changed, hs400;
dev = sc->dev;
max_dtr = mmcbr_get_f_max(dev);
max_timing = bus_timing_max;
do {
changed = false;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (isclr(&ivar->timings, max_timing) ||
!mmc_host_timing(dev, max_timing)) {
for (timing = max_timing - 1; timing >=
bus_timing_normal; timing--) {
if (isset(&ivar->timings, timing) &&
mmc_host_timing(dev, timing)) {
max_timing = timing;
break;
}
}
changed = true;
}
dtr = mmc_timing_to_dtr(ivar, max_timing);
if (dtr < max_dtr) {
max_dtr = dtr;
changed = true;
}
}
} while (changed == true);
if (bootverbose || mmc_debug) {
device_printf(dev,
"setting transfer rate to %d.%03dMHz (%s timing)\n",
max_dtr / 1000000, (max_dtr / 1000) % 1000,
mmc_timing_to_string(max_timing));
}
/*
* HS400 must be tuned in HS200 mode, so in case of HS400 we begin
* with HS200, following the sequence described in "6.6.2.2 HS200
* timing mode selection" of the eMMC specification v5.1, and
* switch to max_timing later. HS400ES requires no tuning and, thus,
* can be switched to directly, but requires the same detour via
* high speed mode as HS400 does (see mmc_switch_to_hs400()).
*/
hs400 = max_timing == bus_timing_mmc_hs400;
timing = hs400 == true ? bus_timing_mmc_hs200 : max_timing;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if ((ivar->timings & ~(1 << bus_timing_normal)) == 0)
continue;
rca = ivar->rca;
if (mmc_select_card(sc, rca) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to select\n", rca);
continue;
}
if (timing == bus_timing_mmc_hs200 || /* includes HS400 */
timing == bus_timing_mmc_hs400es) {
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Failed to set VCCQ for "
"card at relative address %d\n", rca);
continue;
}
}
if (timing == bus_timing_mmc_hs200) { /* includes HS400 */
/* Set bus width (required for initial tuning). */
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set bus width\n", rca);
continue;
}
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
} else if (timing == bus_timing_mmc_hs400es) {
if (mmc_switch_to_hs400(sc, ivar, max_dtr, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set %s timing\n", rca,
mmc_timing_to_string(timing));
continue;
}
goto power_class;
}
if (mmc_set_timing(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set %s timing\n", rca,
mmc_timing_to_string(timing));
continue;
}
if (timing == bus_timing_mmc_ddr52) {
/*
* Set EXT_CSD_BUS_WIDTH_n_DDR in EXT_CSD_BUS_WIDTH
* (must be done after switching to EXT_CSD_HS_TIMING).
*/
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set bus width\n", rca);
continue;
}
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Failed to set VCCQ for "
"card at relative address %d\n", rca);
continue;
}
}
/* Set clock (must be done before initial tuning). */
mmcbr_set_clock(dev, max_dtr);
mmcbr_update_ios(dev);
if (mmcbr_tune(dev, hs400) != 0) {
device_printf(dev, "Card at relative address %d "
"failed to execute initial tuning\n", rca);
continue;
}
if (hs400 == true && mmc_switch_to_hs400(sc, ivar, max_dtr,
max_timing) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set %s timing\n", rca,
mmc_timing_to_string(max_timing));
continue;
}
power_class:
if (mmc_set_power_class(sc, ivar) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set power class\n", rca);
}
}
(void)mmc_select_card(sc, 0);
return (max_dtr);
}
/*
* Switch from HS200 to HS400 (either initially or for re-tuning) or directly
* to HS400ES. This follows the sequences described in "6.6.2.3 HS400 timing
* mode selection" of the eMMC specification v5.1.
*/
static int
mmc_switch_to_hs400(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock, enum mmc_bus_timing max_timing)
{
device_t dev;
int err;
uint16_t rca;
dev = sc->dev;
rca = ivar->rca;
/*
* Both clock and timing must be set as appropriate for high speed
* before eventually switching to HS400/HS400ES; mmc_set_timing()
* will issue mmcbr_update_ios().
*/
mmcbr_set_clock(dev, ivar->hs_tran_speed);
err = mmc_set_timing(sc, ivar, bus_timing_hs);
if (err != MMC_ERR_NONE)
return (err);
/*
* Set EXT_CSD_BUS_WIDTH_8_DDR in EXT_CSD_BUS_WIDTH (and additionally
* EXT_CSD_BUS_WIDTH_ES for HS400ES).
*/
err = mmc_set_card_bus_width(sc, ivar, max_timing);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
/* Finally, switch to HS400/HS400ES mode. */
err = mmc_set_timing(sc, ivar, max_timing);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_clock(dev, clock);
mmcbr_update_ios(dev);
return (MMC_ERR_NONE);
}
/*
* Switch from HS400 to HS200 (for re-tuning).
*/
static int
mmc_switch_to_hs200(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock)
{
device_t dev;
int err;
uint16_t rca;
dev = sc->dev;
rca = ivar->rca;
/*
* Both clock and timing must initially be set as appropriate for
* DDR52 before eventually switching to HS200; mmc_set_timing()
* will issue mmcbr_update_ios().
*/
mmcbr_set_clock(dev, ivar->hs_tran_speed);
err = mmc_set_timing(sc, ivar, bus_timing_mmc_ddr52);
if (err != MMC_ERR_NONE)
return (err);
/*
* Next, switch to high speed. Thus, clear EXT_CSD_BUS_WIDTH_n_DDR
* in EXT_CSD_BUS_WIDTH and update bus width and timing in ios.
*/
err = mmc_set_card_bus_width(sc, ivar, bus_timing_hs);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_set_timing(sc->dev, bus_timing_hs);
mmcbr_update_ios(dev);
/* Finally, switch to HS200 mode. */
err = mmc_set_timing(sc, ivar, bus_timing_mmc_hs200);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_clock(dev, clock);
mmcbr_update_ios(dev);
return (MMC_ERR_NONE);
}
static int
mmc_retune(device_t busdev, device_t dev, bool reset)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err;
uint32_t clock;
enum mmc_bus_timing timing;
if (device_get_parent(dev) != busdev)
return (MMC_ERR_INVALID);
sc = device_get_softc(busdev);
if (sc->retune_needed != 1 && sc->retune_paused != 0)
return (MMC_ERR_INVALID);
timing = mmcbr_get_timing(busdev);
if (timing == bus_timing_mmc_hs400) {
/*
* Controllers use the data strobe line to latch data from
* the devices in HS400 mode so periodic re-tuning isn't
* expected to be required, i. e. only if a CRC or tuning
* error is signaled to the bridge. In these latter cases
* we are asked to reset the tuning circuit and need to do
* the switch timing dance.
*/
if (reset == false)
return (0);
ivar = device_get_ivars(dev);
clock = mmcbr_get_clock(busdev);
if (mmc_switch_to_hs200(sc, ivar, clock) != MMC_ERR_NONE)
return (MMC_ERR_BADCRC);
}
err = mmcbr_retune(busdev, reset);
if (err != 0 && timing == bus_timing_mmc_hs400)
return (MMC_ERR_BADCRC);
switch (err) {
case 0:
break;
case EIO:
return (MMC_ERR_FAILED);
default:
return (MMC_ERR_INVALID);
}
if (timing == bus_timing_mmc_hs400) {
if (mmc_switch_to_hs400(sc, ivar, clock, timing) !=
MMC_ERR_NONE)
return (MMC_ERR_BADCRC);
}
return (MMC_ERR_NONE);
}
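/*
* For HS400 the sequence above therefore is: step back down to HS200
* via mmc_switch_to_hs200(), have the bridge re-tune there, and then
* take the high-speed detour back up to HS400 via
* mmc_switch_to_hs400() at the previously used clock.
*/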
static void
mmc_retune_pause(device_t busdev, device_t dev, bool retune)
{
struct mmc_softc *sc;
sc = device_get_softc(busdev);
KASSERT(device_get_parent(dev) == busdev,
("%s: %s is not a child of %s", __func__, device_get_nameunit(dev),
device_get_nameunit(busdev)));
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
if (retune == true && sc->retune_paused == 0)
sc->retune_needed = 1;
sc->retune_paused++;
}
static void
mmc_retune_unpause(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
sc = device_get_softc(busdev);
KASSERT(device_get_parent(dev) == busdev,
("%s: %s is not a child of %s", __func__, device_get_nameunit(dev),
device_get_nameunit(busdev)));
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
KASSERT(sc->retune_paused != 0,
("%s: Re-tune pause count already at 0", __func__));
sc->retune_paused--;
}
static void
mmc_scan(struct mmc_softc *sc)
{
device_t dev = sc->dev;
int err;
err = mmc_acquire_bus(dev, dev);
if (err != 0) {
device_printf(dev, "Failed to acquire bus for scanning\n");
return;
}
mmc_go_discovery(sc);
err = mmc_release_bus(dev, dev);
if (err != 0) {
device_printf(dev, "Failed to release bus after scanning\n");
return;
}
(void)bus_generic_attach(dev);
}
static int
mmc_read_ivar(device_t bus, device_t child, int which, uintptr_t *result)
{
struct mmc_ivars *ivar = device_get_ivars(child);
switch (which) {
default:
return (EINVAL);
case MMC_IVAR_SPEC_VERS:
*result = ivar->csd.spec_vers;
break;
case MMC_IVAR_DSR_IMP:
*result = ivar->csd.dsr_imp;
break;
case MMC_IVAR_MEDIA_SIZE:
*result = ivar->sec_count;
break;
case MMC_IVAR_RCA:
*result = ivar->rca;
break;
case MMC_IVAR_SECTOR_SIZE:
*result = MMC_SECTOR_SIZE;
break;
case MMC_IVAR_TRAN_SPEED:
*result = mmcbr_get_clock(bus);
break;
case MMC_IVAR_READ_ONLY:
*result = ivar->read_only;
break;
case MMC_IVAR_HIGH_CAP:
*result = ivar->high_cap;
break;
case MMC_IVAR_CARD_TYPE:
*result = ivar->mode;
break;
case MMC_IVAR_BUS_WIDTH:
*result = ivar->bus_width;
break;
case MMC_IVAR_ERASE_SECTOR:
*result = ivar->erase_sector;
break;
case MMC_IVAR_MAX_DATA:
*result = mmcbr_get_max_data(bus);
break;
case MMC_IVAR_CMD6_TIMEOUT:
*result = ivar->cmd6_time;
break;
case MMC_IVAR_QUIRKS:
*result = ivar->quirks;
break;
case MMC_IVAR_CARD_ID_STRING:
*(char **)result = ivar->card_id_string;
break;
case MMC_IVAR_CARD_SN_STRING:
*(char **)result = ivar->card_sn_string;
break;
}
return (0);
}
static int
mmc_write_ivar(device_t bus, device_t child, int which, uintptr_t value)
{
/*
* None are writable ATM
*/
return (EINVAL);
}
static void
mmc_delayed_attach(void *xsc)
{
struct mmc_softc *sc = xsc;
mmc_scan(sc);
config_intrhook_disestablish(&sc->config_intrhook);
}
static int
mmc_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen)
{
snprintf(buf, buflen, "rca=0x%04x", mmc_get_rca(child));
return (0);
}
static device_method_t mmc_methods[] = {
/* device_if */
DEVMETHOD(device_probe, mmc_probe),
DEVMETHOD(device_attach, mmc_attach),
DEVMETHOD(device_detach, mmc_detach),
DEVMETHOD(device_suspend, mmc_suspend),
DEVMETHOD(device_resume, mmc_resume),
/* Bus interface */
DEVMETHOD(bus_read_ivar, mmc_read_ivar),
DEVMETHOD(bus_write_ivar, mmc_write_ivar),
DEVMETHOD(bus_child_location_str, mmc_child_location_str),
/* MMC Bus interface */
DEVMETHOD(mmcbus_retune_pause, mmc_retune_pause),
DEVMETHOD(mmcbus_retune_unpause, mmc_retune_unpause),
DEVMETHOD(mmcbus_wait_for_request, mmc_wait_for_request),
DEVMETHOD(mmcbus_acquire_bus, mmc_acquire_bus),
DEVMETHOD(mmcbus_release_bus, mmc_release_bus),
DEVMETHOD_END
};
driver_t mmc_driver = {
"mmc",
mmc_methods,
sizeof(struct mmc_softc),
};
devclass_t mmc_devclass;
MODULE_VERSION(mmc, MMC_VERSION);
Index: head/sys/dev/mmc/mmcsd.c
===================================================================
--- head/sys/dev/mmc/mmcsd.c (revision 327172)
+++ head/sys/dev/mmc/mmcsd.c (revision 327173)
@@ -1,1472 +1,1470 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Bernd Walter. All rights reserved.
* Copyright (c) 2006 M. Warner Losh. All rights reserved.
* Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Portions of this software may have been developed with reference to
* the SD Simplified Specification. The following disclaimer may apply:
*
* The following conditions apply to the release of the simplified
* specification ("Simplified Specification") by the SD Card Association and
* the SD Group. The Simplified Specification is a subset of the complete SD
* Specification which is owned by the SD Card Association and the SD
* Group. This Simplified Specification is provided on a non-confidential
* basis subject to the disclaimers below. Any implementation of the
* Simplified Specification may require a license from the SD Card
* Association, SD Group, SD-3C LLC or other third parties.
*
* Disclaimers:
*
* The information contained in the Simplified Specification is presented only
* as a standard specification for SD Cards and SD Host/Ancillary products and
* is provided "AS-IS" without any representations or warranties of any
* kind. No responsibility is assumed by the SD Group, SD-3C LLC or the SD
* Card Association for any damages, any infringements of patents or other
* right of the SD Group, SD-3C LLC, the SD Card Association or any third
* parties, which may result from its use. No license is granted by
* implication, estoppel or otherwise under any patent or other rights of the
* SD Group, SD-3C LLC, the SD Card Association or any third party. Nothing
* herein shall be construed as an obligation by the SD Group, the SD-3C LLC
* or the SD Card Association to disclose or distribute any technical
* information, know-how or other confidential information to any third party.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/slicer.h>
#include <sys/time.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmc_ioctl.h>
#include <dev/mmc/mmc_subr.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcvar.h>
#include "mmcbus_if.h"
#if __FreeBSD_version < 800002
#define kproc_create kthread_create
#define kproc_exit kthread_exit
#endif
#define MMCSD_CMD_RETRIES 5
#define MMCSD_FMT_BOOT "mmcsd%dboot"
#define MMCSD_FMT_GP "mmcsd%dgp"
#define MMCSD_FMT_RPMB "mmcsd%drpmb"
#define MMCSD_LABEL_ENH "enh"
#define MMCSD_PART_NAMELEN (16 + 1)
struct mmcsd_softc;
struct mmcsd_part {
struct mtx disk_mtx;
struct mtx ioctl_mtx;
struct mmcsd_softc *sc;
struct disk *disk;
struct proc *p;
struct bio_queue_head bio_queue;
daddr_t eblock, eend; /* Range remaining after the last erase. */
u_int cnt;
u_int type;
int running;
int suspend;
int ioctl;
bool ro;
char name[MMCSD_PART_NAMELEN];
};
struct mmcsd_softc {
device_t dev;
device_t mmcbus;
struct mmcsd_part *part[MMC_PART_MAX];
enum mmc_card_mode mode;
u_int max_data; /* Maximum data size [blocks] */
u_int erase_sector; /* Device native erase sector size [blocks] */
uint8_t high_cap; /* High Capacity device (block addressed) */
uint8_t part_curr; /* Partition currently switched to */
uint8_t ext_csd[MMC_EXTCSD_SIZE];
uint16_t rca;
uint32_t flags;
#define MMCSD_INAND_CMD38 0x0001
#define MMCSD_USE_TRIM 0x0002
uint32_t cmd6_time; /* Generic switch timeout [us] */
uint32_t part_time; /* Partition switch timeout [us] */
off_t enh_base; /* Enhanced user data area slice base ... */
off_t enh_size; /* ... and size [bytes] */
int log_count;
struct timeval log_time;
struct cdev *rpmb_dev;
};
static const char *errmsg[] =
{
"None",
"Timeout",
"Bad CRC",
"Fifo",
"Failed",
"Invalid",
"NO MEMORY"
};
#define LOG_PPS 5 /* Log no more than 5 errors per second. */
/* bus entry points */
static int mmcsd_attach(device_t dev);
static int mmcsd_detach(device_t dev);
static int mmcsd_probe(device_t dev);
/* disk routines */
static int mmcsd_close(struct disk *dp);
static int mmcsd_dump(void *arg, void *virtual, vm_offset_t physical,
off_t offset, size_t length);
static int mmcsd_getattr(struct bio *);
static int mmcsd_ioctl_disk(struct disk *disk, u_long cmd, void *data,
int fflag, struct thread *td);
static int mmcsd_open(struct disk *dp);
static void mmcsd_strategy(struct bio *bp);
static void mmcsd_task(void *arg);
/* RPMB cdev interface */
static int mmcsd_ioctl_rpmb(struct cdev *dev, u_long cmd, caddr_t data,
int fflag, struct thread *td);
static void mmcsd_add_part(struct mmcsd_softc *sc, u_int type,
const char *name, u_int cnt, off_t media_size, bool ro);
static int mmcsd_bus_bit_width(device_t dev);
static daddr_t mmcsd_delete(struct mmcsd_part *part, struct bio *bp);
static const char *mmcsd_errmsg(int e);
static int mmcsd_ioctl(struct mmcsd_part *part, u_long cmd, void *data,
int fflag);
static int mmcsd_ioctl_cmd(struct mmcsd_part *part, struct mmc_ioc_cmd *mic,
int fflag);
static uintmax_t mmcsd_pretty_size(off_t size, char *unit);
static daddr_t mmcsd_rw(struct mmcsd_part *part, struct bio *bp);
static int mmcsd_set_blockcount(struct mmcsd_softc *sc, u_int count, bool rel);
static int mmcsd_slicer(device_t dev, const char *provider,
struct flash_slice *slices, int *nslices);
static int mmcsd_switch_part(device_t bus, device_t dev, uint16_t rca,
u_int part);
#define MMCSD_DISK_LOCK(_part) mtx_lock(&(_part)->disk_mtx)
#define MMCSD_DISK_UNLOCK(_part) mtx_unlock(&(_part)->disk_mtx)
#define MMCSD_DISK_LOCK_INIT(_part) \
mtx_init(&(_part)->disk_mtx, (_part)->name, "mmcsd disk", MTX_DEF)
#define MMCSD_DISK_LOCK_DESTROY(_part) mtx_destroy(&(_part)->disk_mtx);
#define MMCSD_DISK_ASSERT_LOCKED(_part) \
mtx_assert(&(_part)->disk_mtx, MA_OWNED);
#define MMCSD_DISK_ASSERT_UNLOCKED(_part) \
mtx_assert(&(_part)->disk_mtx, MA_NOTOWNED);
#define MMCSD_IOCTL_LOCK(_part) mtx_lock(&(_part)->ioctl_mtx)
#define MMCSD_IOCTL_UNLOCK(_part) mtx_unlock(&(_part)->ioctl_mtx)
#define MMCSD_IOCTL_LOCK_INIT(_part) \
mtx_init(&(_part)->ioctl_mtx, (_part)->name, "mmcsd IOCTL", MTX_DEF)
#define MMCSD_IOCTL_LOCK_DESTROY(_part) mtx_destroy(&(_part)->ioctl_mtx);
#define MMCSD_IOCTL_ASSERT_LOCKED(_part) \
mtx_assert(&(_part)->ioctl_mtx, MA_OWNED);
#define MMCSD_IOCTL_ASSERT_UNLOCKED(_part) \
mtx_assert(&(_part)->ioctl_mtx, MA_NOTOWNED);
static int
mmcsd_probe(device_t dev)
{
device_quiet(dev);
device_set_desc(dev, "MMC/SD Memory Card");
return (0);
}
static int
mmcsd_attach(device_t dev)
{
device_t mmcbus;
struct mmcsd_softc *sc;
const uint8_t *ext_csd;
off_t erase_size, sector_size, size, wp_size;
uintmax_t bytes;
int err, i;
uint32_t quirks;
uint8_t rev;
bool comp, ro;
char unit[2];
sc = device_get_softc(dev);
sc->dev = dev;
sc->mmcbus = mmcbus = device_get_parent(dev);
sc->mode = mmcbr_get_mode(mmcbus);
/*
* Note that in principle with an SDHCI-like re-tuning implementation,
* the maximum data size can change at runtime due to a device removal/
* insertion that results in switches to/from a transfer mode involving
* re-tuning, iff there are multiple devices on a given bus. Until now
* mmc(4) lacks support for rescanning already attached buses, however,
* and sdhci(4) to date has no support for shared buses in the first
* place either.
*/
sc->max_data = mmc_get_max_data(dev);
sc->high_cap = mmc_get_high_cap(dev);
sc->rca = mmc_get_rca(dev);
sc->cmd6_time = mmc_get_cmd6_timeout(dev);
quirks = mmc_get_quirks(dev);
/* Only MMC >= 4.x devices support EXT_CSD. */
if (mmc_get_spec_vers(dev) >= 4) {
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmc_send_ext_csd(mmcbus, dev, sc->ext_csd);
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (err != MMC_ERR_NONE) {
device_printf(dev, "Error reading EXT_CSD %s\n",
mmcsd_errmsg(err));
return (ENXIO);
}
}
ext_csd = sc->ext_csd;
if ((quirks & MMC_QUIRK_INAND_CMD38) != 0) {
if (mmc_get_spec_vers(dev) < 4) {
device_printf(dev,
"MMC_QUIRK_INAND_CMD38 set but no EXT_CSD\n");
return (EINVAL);
}
sc->flags |= MMCSD_INAND_CMD38;
}
/*
* EXT_CSD_SEC_FEATURE_SUPPORT_GB_CL_EN denotes support for both
* insecure and secure TRIM.
*/
if ((ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] &
EXT_CSD_SEC_FEATURE_SUPPORT_GB_CL_EN) != 0 &&
(quirks & MMC_QUIRK_BROKEN_TRIM) == 0) {
if (bootverbose)
device_printf(dev, "taking advantage of TRIM\n");
sc->flags |= MMCSD_USE_TRIM;
sc->erase_sector = 1;
} else
sc->erase_sector = mmc_get_erase_sector(dev);
/*
* Enhanced user data area and general purpose partitions are only
* supported in revision 1.4 (EXT_CSD_REV == 4) and later, the RPMB
* partition in revision 1.5 (MMC v4.41, EXT_CSD_REV == 5) and later.
*/
rev = ext_csd[EXT_CSD_REV];
/*
* Ignore user-creatable enhanced user data area and general purpose
* partitions as long as partitioning hasn't been finished.
*/
comp = (ext_csd[EXT_CSD_PART_SET] & EXT_CSD_PART_SET_COMPLETED) != 0;
/*
* Add enhanced user data area slice, unless it spans the entirety of
* the user data area.  The enhanced area is a multiple of high
* capacity write protect groups ((ERASE_GRP_SIZE * HC_WP_GRP_SIZE) *
* 512 KB) in size, and its offset is given in either sectors or
* bytes, depending on whether it's a high capacity device or not.
* NB: The slicer and its slices need to be registered before adding
* the disk for the corresponding user data area as re-tasting is
* racy.
*/
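/*
 * For example (purely illustrative values): with ERASE_GRP_SIZE = 1
 * (512 KiB erase groups), HC_WP_GRP_SIZE = 16 and ENH_SIZE_MULT = 4,
 * the computation below yields 4 * (1 * 1024 * 512) * 16 = 33554432
 * bytes, i.e. a 32 MiB enhanced user data area slice.
 */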
sector_size = mmc_get_sector_size(dev);
size = ext_csd[EXT_CSD_ENH_SIZE_MULT] +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 1] << 8) +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 2] << 16);
if (rev >= 4 && comp == TRUE && size > 0 &&
(ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_ENH_ATTR_EN) != 0 &&
(ext_csd[EXT_CSD_PART_ATTR] & (EXT_CSD_PART_ATTR_ENH_USR)) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
size *= erase_size * wp_size;
if (size != mmc_get_media_size(dev) * sector_size) {
sc->enh_size = size;
sc->enh_base = (ext_csd[EXT_CSD_ENH_START_ADDR] +
(ext_csd[EXT_CSD_ENH_START_ADDR + 1] << 8) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 2] << 16) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 3] << 24)) *
(sc->high_cap == 0 ? MMC_SECTOR_SIZE : 1);
} else if (bootverbose)
device_printf(dev,
"enhanced user data area spans entire device\n");
}
/*
* Add default partition. This may be the only one or the user
* data area in case partitions are supported.
*/
ro = mmc_get_read_only(dev);
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_DEFAULT, "mmcsd",
device_get_unit(dev), mmc_get_media_size(dev) * sector_size, ro);
if (mmc_get_spec_vers(dev) < 3)
return (0);
/* Belatedly announce enhanced user data slice. */
if (sc->enh_size != 0) {
bytes = mmcsd_pretty_size(size, unit);
printf(FLASH_SLICES_FMT ": %ju%sB enhanced user data area "
"slice offset 0x%jx at %s\n", device_get_nameunit(dev),
MMCSD_LABEL_ENH, bytes, unit, (uintmax_t)sc->enh_base,
device_get_nameunit(dev));
}
/*
* Determine partition switch timeout (provided in units of 10 ms)
* and ensure it's at least 300 ms as some eMMC chips lie.
*/
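/*
 * For example (illustrative value): an EXT_CSD_PART_SWITCH_TO of 10
 * means 10 * 10 ms = 100000 us, which the max() below bumps up to the
 * 300000 us floor; a value of 40 (400000 us) is kept as-is.
 */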
sc->part_time = max(ext_csd[EXT_CSD_PART_SWITCH_TO] * 10 * 1000,
300 * 1000);
/* Add boot partitions, which are of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_BOOT_SIZE_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (size > 0 && (mmcbr_get_caps(mmcbus) & MMC_CAP_BOOT_NOACC) == 0) {
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_BOOT0,
MMCSD_FMT_BOOT, 0, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT0_MASK) != 0));
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_BOOT1,
MMCSD_FMT_BOOT, 1, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT1_MASK) != 0));
}
/* Add RPMB partition, which also is of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_RPMB_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (rev >= 5 && size > 0)
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_RPMB,
MMCSD_FMT_RPMB, 0, size, ro);
if (rev <= 3 || comp == FALSE)
return (0);
/*
* Add general purpose partitions, which are of a multiple of high
* capacity write protect groups, too.
*/
if ((ext_csd[EXT_CSD_PART_SUPPORT] & EXT_CSD_PART_SUPPORT_EN) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
for (i = 0; i < MMC_PART_GP_MAX; i++) {
size = ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3] +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 1] << 8) +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 2] << 16);
if (size == 0)
continue;
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_GP0 + i,
MMCSD_FMT_GP, i, size * erase_size * wp_size, ro);
}
}
return (0);
}
static uintmax_t
mmcsd_pretty_size(off_t size, char *unit)
{
uintmax_t bytes;
int i;
/*
* Display in the most natural units.  There's no card < 1 MB, although
* RPMB partitions occasionally are smaller than that.  The SD standard
* goes to 2 GiB due to its reliance on FAT, but the data format
* supports up to 4 GiB and some card makers push it up to this limit.
* The SDHC standard only goes to 32 GiB due to FAT32, but the data
* format supports up to 2 TiB (2048 GB isn't too ugly, so we note it
* in passing here and don't add the code to print TB).  Since these
* cards are sold in terms of MB and GB, not MiB and GiB, report them
* like that.  We also round to the nearest unit, since many cards are
* a few percent short, even of the power of 10 size.
*/
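/*
 * Worked example (illustrative): a card reporting 15931539456 bytes
 * passes through the loop as 15931539456 -> 15931539 (k) ->
 * 15932 (M) -> 16 (G) and is thus announced as "16GB".
 */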
bytes = size;
unit[0] = unit[1] = '\0';
for (i = 0; i <= 2 && bytes >= 1000; i++) {
bytes = (bytes + 1000 / 2 - 1) / 1000;
switch (i) {
case 0:
unit[0] = 'k';
break;
case 1:
unit[0] = 'M';
break;
case 2:
unit[0] = 'G';
break;
default:
break;
}
}
return (bytes);
}
static struct cdevsw mmcsd_rpmb_cdevsw = {
.d_version = D_VERSION,
.d_name = "mmcsdrpmb",
.d_ioctl = mmcsd_ioctl_rpmb
};
static void
mmcsd_add_part(struct mmcsd_softc *sc, u_int type, const char *name, u_int cnt,
off_t media_size, bool ro)
{
struct make_dev_args args;
device_t dev, mmcbus;
const char *ext;
const uint8_t *ext_csd;
struct mmcsd_part *part;
struct disk *d;
uintmax_t bytes;
u_int gp;
uint32_t speed;
uint8_t extattr;
bool enh;
char unit[2];
dev = sc->dev;
mmcbus = sc->mmcbus;
part = sc->part[type] = malloc(sizeof(*part), M_DEVBUF,
M_WAITOK | M_ZERO);
part->sc = sc;
part->cnt = cnt;
part->type = type;
part->ro = ro;
snprintf(part->name, sizeof(part->name), name, device_get_unit(dev));
MMCSD_IOCTL_LOCK_INIT(part);
/*
* For the RPMB partition, allow IOCTL access only.
* NB: If ever attaching RPMB partitions to disk(9), the re-tuning
* implementation and especially its pausing need to be revisited,
* because then re-tuning requests may be issued by the IOCTL half
* of this driver while re-tuning is already paused by the disk(9)
* one and vice versa.
*/
if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
make_dev_args_init(&args);
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
args.mda_devsw = &mmcsd_rpmb_cdevsw;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv1 = part;
if (make_dev_s(&args, &sc->rpmb_dev, "%s", part->name) != 0) {
device_printf(dev, "Failed to make RPMB device\n");
free(part, M_DEVBUF);
return;
}
} else {
MMCSD_DISK_LOCK_INIT(part);
d = part->disk = disk_alloc();
d->d_open = mmcsd_open;
d->d_close = mmcsd_close;
d->d_strategy = mmcsd_strategy;
d->d_ioctl = mmcsd_ioctl_disk;
d->d_dump = mmcsd_dump;
d->d_getattr = mmcsd_getattr;
d->d_name = part->name;
d->d_drv1 = part;
d->d_sectorsize = mmc_get_sector_size(dev);
d->d_maxsize = sc->max_data * d->d_sectorsize;
d->d_mediasize = media_size;
d->d_stripesize = sc->erase_sector * d->d_sectorsize;
d->d_unit = cnt;
d->d_flags = DISKFLAG_CANDELETE;
d->d_delmaxsize = mmc_get_erase_sector(dev) * d->d_sectorsize;
strlcpy(d->d_ident, mmc_get_card_sn_string(dev),
sizeof(d->d_ident));
strlcpy(d->d_descr, mmc_get_card_id_string(dev),
sizeof(d->d_descr));
d->d_rotation_rate = DISK_RR_NON_ROTATING;
disk_create(d, DISK_VERSION);
bioq_init(&part->bio_queue);
part->running = 1;
kproc_create(&mmcsd_task, part, &part->p, 0, 0,
"%s%d: mmc/sd card", part->name, cnt);
}
bytes = mmcsd_pretty_size(media_size, unit);
if (type == EXT_CSD_PART_CONFIG_ACC_DEFAULT) {
speed = mmcbr_get_clock(mmcbus);
printf("%s%d: %ju%sB <%s>%s at %s %d.%01dMHz/%dbit/%d-block\n",
part->name, cnt, bytes, unit, mmc_get_card_id_string(dev),
ro ? " (read-only)" : "", device_get_nameunit(mmcbus),
speed / 1000000, (speed / 100000) % 10,
mmcsd_bus_bit_width(dev), sc->max_data);
} else if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
printf("%s: %ju%sB partion %d%s at %s\n", part->name, bytes,
unit, type, ro ? " (read-only)" : "",
device_get_nameunit(dev));
} else {
enh = false;
ext = NULL;
extattr = 0;
if (type >= EXT_CSD_PART_CONFIG_ACC_GP0 &&
type <= EXT_CSD_PART_CONFIG_ACC_GP3) {
ext_csd = sc->ext_csd;
gp = type - EXT_CSD_PART_CONFIG_ACC_GP0;
if ((ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_ENH_ATTR_EN) != 0 &&
(ext_csd[EXT_CSD_PART_ATTR] &
(EXT_CSD_PART_ATTR_ENH_GP0 << gp)) != 0)
enh = true;
else if ((ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_EXT_ATTR_EN) != 0) {
extattr = (ext_csd[EXT_CSD_EXT_PART_ATTR +
(gp / 2)] >> (4 * (gp % 2))) & 0xF;
switch (extattr) {
case EXT_CSD_EXT_PART_ATTR_DEFAULT:
break;
case EXT_CSD_EXT_PART_ATTR_SYSTEMCODE:
ext = "system code";
break;
case EXT_CSD_EXT_PART_ATTR_NPERSISTENT:
ext = "non-persistent";
break;
default:
ext = "reserved";
break;
}
}
}
if (ext == NULL)
printf("%s%d: %ju%sB partion %d%s%s at %s\n",
part->name, cnt, bytes, unit, type, enh ?
" enhanced" : "", ro ? " (read-only)" : "",
device_get_nameunit(dev));
else
printf("%s%d: %ju%sB partion %d extended 0x%x "
"(%s)%s at %s\n", part->name, cnt, bytes, unit,
type, extattr, ext, ro ? " (read-only)" : "",
device_get_nameunit(dev));
}
}
static int
mmcsd_slicer(device_t dev, const char *provider,
struct flash_slice *slices, int *nslices)
{
char name[MMCSD_PART_NAMELEN];
struct mmcsd_softc *sc;
struct mmcsd_part *part;
*nslices = 0;
if (slices == NULL)
return (ENOMEM);
sc = device_get_softc(dev);
if (sc->enh_size == 0)
return (ENXIO);
part = sc->part[EXT_CSD_PART_CONFIG_ACC_DEFAULT];
snprintf(name, sizeof(name), "%s%d", part->disk->d_name,
part->disk->d_unit);
if (strcmp(name, provider) != 0)
return (ENXIO);
*nslices = 1;
slices[0].base = sc->enh_base;
slices[0].size = sc->enh_size;
slices[0].label = MMCSD_LABEL_ENH;
return (0);
}
static int
mmcsd_detach(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 0;
if (part->running > 0) {
/* kill thread */
part->running = 0;
wakeup(part);
/* wait for thread to finish. */
while (part->running != -1)
msleep(part, &part->disk_mtx, 0,
"mmcsd disk detach", 0);
}
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
while (part->ioctl > 0)
msleep(part, &part->ioctl_mtx, 0,
"mmcsd IOCTL detach", 0);
part->ioctl = -1;
MMCSD_IOCTL_UNLOCK(part);
}
}
if (sc->rpmb_dev != NULL)
destroy_dev(sc->rpmb_dev);
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
/* Flush the request queue. */
bioq_flush(&part->bio_queue, NULL, ENXIO);
/* kill disk */
disk_destroy(part->disk);
MMCSD_DISK_LOCK_DESTROY(part);
}
MMCSD_IOCTL_LOCK_DESTROY(part);
free(part, M_DEVBUF);
}
}
return (0);
}
static int
mmcsd_suspend(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 1;
if (part->running > 0) {
/* kill thread */
part->running = 0;
wakeup(part);
/* wait for thread to finish. */
while (part->running != -1)
msleep(part, &part->disk_mtx, 0,
"mmcsd disk suspension", 0);
}
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
while (part->ioctl > 0)
msleep(part, &part->ioctl_mtx, 0,
"mmcsd IOCTL suspension", 0);
part->ioctl = -1;
MMCSD_IOCTL_UNLOCK(part);
}
}
return (0);
}
static int
mmcsd_resume(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 0;
if (part->running <= 0) {
part->running = 1;
MMCSD_DISK_UNLOCK(part);
kproc_create(&mmcsd_task, part,
&part->p, 0, 0, "%s%d: mmc/sd card",
part->name, part->cnt);
} else
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
part->ioctl = 0;
MMCSD_IOCTL_UNLOCK(part);
}
}
return (0);
}
static int
mmcsd_open(struct disk *dp __unused)
{
return (0);
}
static int
mmcsd_close(struct disk *dp __unused)
{
return (0);
}
static void
mmcsd_strategy(struct bio *bp)
{
- struct mmcsd_softc *sc;
struct mmcsd_part *part;
part = bp->bio_disk->d_drv1;
- sc = part->sc;
MMCSD_DISK_LOCK(part);
if (part->running > 0 || part->suspend > 0) {
bioq_disksort(&part->bio_queue, bp);
MMCSD_DISK_UNLOCK(part);
wakeup(part);
} else {
MMCSD_DISK_UNLOCK(part);
biofinish(bp, NULL, ENXIO);
}
}
static int
mmcsd_ioctl_rpmb(struct cdev *dev, u_long cmd, caddr_t data,
int fflag, struct thread *td __unused)
{
return (mmcsd_ioctl(dev->si_drv1, cmd, data, fflag));
}
static int
mmcsd_ioctl_disk(struct disk *disk, u_long cmd, void *data, int fflag,
struct thread *td __unused)
{
return (mmcsd_ioctl(disk->d_drv1, cmd, data, fflag));
}
static int
mmcsd_ioctl(struct mmcsd_part *part, u_long cmd, void *data, int fflag)
{
struct mmc_ioc_cmd *mic;
struct mmc_ioc_multi_cmd *mimc;
int i, err;
u_long cnt, size;
if ((fflag & FREAD) == 0)
return (EBADF);
err = 0;
switch (cmd) {
case MMC_IOC_CMD:
mic = data;
err = mmcsd_ioctl_cmd(part, mic, fflag);
break;
case MMC_IOC_MULTI_CMD:
mimc = data;
if (mimc->num_of_cmds == 0)
break;
if (mimc->num_of_cmds > MMC_IOC_MAX_CMDS)
return (EINVAL);
cnt = mimc->num_of_cmds;
size = sizeof(*mic) * cnt;
mic = malloc(size, M_TEMP, M_WAITOK);
err = copyin((const void *)mimc->cmds, mic, size);
if (err == 0) {
for (i = 0; i < cnt; i++) {
err = mmcsd_ioctl_cmd(part, &mic[i], fflag);
if (err != 0)
break;
}
}
free(mic, M_TEMP);
break;
default:
return (ENOIOCTL);
}
return (err);
}
static int
mmcsd_ioctl_cmd(struct mmcsd_part *part, struct mmc_ioc_cmd *mic, int fflag)
{
struct mmc_command cmd;
struct mmc_data data;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
void *dp;
u_long len;
int err, retries;
uint32_t status;
uint16_t rca;
if ((fflag & FWRITE) == 0 && mic->write_flag != 0)
return (EBADF);
if (part->ro == TRUE && mic->write_flag != 0)
return (EROFS);
/*
* We don't need to explicitly lock against the disk(9) half of this
* driver as MMCBUS_ACQUIRE_BUS() will serialize us. However, it's
* necessary to protect against races with detachment and suspension,
* especially since it's required to switch away from RPMB partitions
* again after an access (see mmcsd_switch_part()).
*/
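/*
 * part->ioctl is a small state indicator protected by ioctl_mtx:
 * 0 means idle, 1 means an IOCTL is in flight and -1 means the
 * partition has been detached or suspended; waiters sleep on part.
 */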
MMCSD_IOCTL_LOCK(part);
while (part->ioctl != 0) {
if (part->ioctl < 0) {
MMCSD_IOCTL_UNLOCK(part);
return (ENXIO);
}
msleep(part, &part->ioctl_mtx, 0, "mmcsd IOCTL", 0);
}
part->ioctl = 1;
MMCSD_IOCTL_UNLOCK(part);
err = 0;
dp = NULL;
len = mic->blksz * mic->blocks;
if (len > MMC_IOC_MAX_BYTES) {
err = EOVERFLOW;
goto out;
}
if (len != 0) {
dp = malloc(len, M_TEMP, M_WAITOK);
err = copyin((void *)(uintptr_t)mic->data_ptr, dp, len);
if (err != 0)
goto out;
}
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = mic->opcode;
cmd.arg = mic->arg;
cmd.flags = mic->flags;
if (len != 0) {
data.len = len;
data.data = dp;
data.flags = mic->write_flag != 0 ? MMC_DATA_WRITE :
MMC_DATA_READ;
cmd.data = &data;
}
sc = part->sc;
rca = sc->rca;
if (mic->is_acmd == 0) {
/* Enforce/patch/restrict RCA-based commands */
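/*
 * Commands that would alter the bus state as seen by the kernel
 * (CMD3/CMD7) are rejected with EPERM; for the listed addressed
 * commands the RCA field in bits [31:16] of the argument is
 * rewritten to the RCA the stack assigned to this device.
 */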
switch (cmd.opcode) {
case MMC_SET_RELATIVE_ADDR:
case MMC_SELECT_CARD:
err = EPERM;
goto out;
case MMC_STOP_TRANSMISSION:
if ((cmd.arg & 0x1) == 0)
break;
/* FALLTHROUGH */
case MMC_SLEEP_AWAKE:
case MMC_SEND_CSD:
case MMC_SEND_CID:
case MMC_SEND_STATUS:
case MMC_GO_INACTIVE_STATE:
case MMC_FAST_IO:
case MMC_APP_CMD:
cmd.arg = (cmd.arg & 0x0000FFFF) | (rca << 16);
break;
default:
break;
}
}
dev = sc->dev;
mmcbus = sc->mmcbus;
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmcsd_switch_part(mmcbus, dev, rca, part->type);
if (err != MMC_ERR_NONE)
goto release;
if (part->type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
err = mmcsd_set_blockcount(sc, mic->blocks,
mic->write_flag & (1 << 31));
if (err != MMC_ERR_NONE)
goto switch_back;
}
if (mic->is_acmd != 0)
(void)mmc_wait_for_app_cmd(mmcbus, dev, rca, &cmd, 0);
else
(void)mmc_wait_for_cmd(mmcbus, dev, &cmd, 0);
if (part->type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
/*
* If the request went to the RPMB partition, try to ensure
* that the command actually has completed ...
*/
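/*
 * Poll CMD13 (SEND_STATUS), with a 1 ms delay between attempts,
 * until the device reports no error bits and has left the
 * programming state, giving up after MMCSD_CMD_RETRIES retries.
 */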
retries = MMCSD_CMD_RETRIES;
do {
err = mmc_send_status(mmcbus, dev, rca, &status);
if (err != MMC_ERR_NONE)
break;
if (R1_STATUS(status) == 0 &&
R1_CURRENT_STATE(status) != R1_STATE_PRG)
break;
DELAY(1000);
} while (retries-- > 0);
switch_back:
/* ... and always switch back to the default partition. */
err = mmcsd_switch_part(mmcbus, dev, rca,
EXT_CSD_PART_CONFIG_ACC_DEFAULT);
if (err != MMC_ERR_NONE)
goto release;
}
/*
* If EXT_CSD was changed, our cached copy is outdated now, in
* particular the bits of EXT_CSD_PART_CONFIG consulted by
* mmcsd_switch_part(), so retrieve EXT_CSD again.
*/
if (cmd.opcode == MMC_SWITCH_FUNC) {
err = mmc_send_ext_csd(mmcbus, dev, sc->ext_csd);
if (err != MMC_ERR_NONE)
goto release;
}
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (cmd.error != MMC_ERR_NONE) {
switch (cmd.error) {
case MMC_ERR_TIMEOUT:
err = ETIMEDOUT;
break;
case MMC_ERR_BADCRC:
err = EILSEQ;
break;
case MMC_ERR_INVALID:
err = EINVAL;
break;
case MMC_ERR_NO_MEMORY:
err = ENOMEM;
break;
default:
err = EIO;
break;
}
goto out;
}
memcpy(mic->response, cmd.resp, 4 * sizeof(uint32_t));
if (mic->write_flag == 0 && len != 0) {
err = copyout(dp, (void *)(uintptr_t)mic->data_ptr, len);
if (err != 0)
goto out;
}
goto out;
release:
MMCBUS_RELEASE_BUS(mmcbus, dev);
err = EIO;
out:
MMCSD_IOCTL_LOCK(part);
part->ioctl = 0;
MMCSD_IOCTL_UNLOCK(part);
wakeup(part);
if (dp != NULL)
free(dp, M_TEMP);
return (err);
}
static int
mmcsd_getattr(struct bio *bp)
{
struct mmcsd_part *part;
device_t dev;
if (strcmp(bp->bio_attribute, "MMC::device") == 0) {
if (bp->bio_length != sizeof(dev))
return (EFAULT);
part = bp->bio_disk->d_drv1;
dev = part->sc->dev;
bcopy(&dev, bp->bio_data, sizeof(dev));
bp->bio_completed = bp->bio_length;
return (0);
}
return (-1);
}
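/*
 * Issue CMD23 (SET_BLOCK_COUNT): the block count goes into bits
 * [15:0] of the argument and bit 31 requests a reliable write,
 * which the RPMB IOCTL path above passes through from userland
 * (bit 31 of mic->write_flag).
 */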
static int
mmcsd_set_blockcount(struct mmcsd_softc *sc, u_int count, bool reliable)
{
struct mmc_command cmd;
struct mmc_request req;
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
cmd.mrq = &req;
req.cmd = &cmd;
cmd.opcode = MMC_SET_BLOCK_COUNT;
cmd.arg = count & 0x0000FFFF;
if (reliable)
cmd.arg |= 1 << 31;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(sc->mmcbus, sc->dev, &req);
return (cmd.error);
}
static int
mmcsd_switch_part(device_t bus, device_t dev, uint16_t rca, u_int part)
{
struct mmcsd_softc *sc;
int err;
uint8_t value;
sc = device_get_softc(dev);
if (sc->mode == mode_sd)
return (MMC_ERR_NONE);
/*
* According to section "6.2.2 Command restrictions" of the eMMC
* specification v5.1, CMD19/CMD21 aren't allowed to be used with
* RPMB partitions. So we pause re-tuning along with triggering
* it up-front to decrease the likelihood of re-tuning becoming
* necessary while accessing an RPMB partition. Consequently, an
* RPMB partition should immediately be switched away from again
* after an access in order to allow for re-tuning to take place
* anew.
*/
if (part == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_PAUSE(sc->mmcbus, sc->dev, true);
if (sc->part_curr == part)
return (MMC_ERR_NONE);
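/*
 * Only the partition access bits of PART_CONFIG are replaced below;
 * all other bits of the register are carried over unchanged from our
 * cached EXT_CSD copy.
 */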
value = (sc->ext_csd[EXT_CSD_PART_CONFIG] &
~EXT_CSD_PART_CONFIG_ACC_MASK) | part;
/* Jump! */
err = mmc_switch(bus, dev, rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_PART_CONFIG, value, sc->part_time, true);
if (err != MMC_ERR_NONE) {
if (part == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_UNPAUSE(sc->mmcbus, sc->dev);
return (err);
}
sc->ext_csd[EXT_CSD_PART_CONFIG] = value;
if (sc->part_curr == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_UNPAUSE(sc->mmcbus, sc->dev);
sc->part_curr = part;
return (MMC_ERR_NONE);
}
static const char *
mmcsd_errmsg(int e)
{
if (e < 0 || e > MMC_ERR_MAX)
return "Bad error code";
return (errmsg[e]);
}
static daddr_t
mmcsd_rw(struct mmcsd_part *part, struct bio *bp)
{
daddr_t block, end;
struct mmc_command cmd;
struct mmc_command stop;
struct mmc_request req;
struct mmc_data data;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
u_int numblocks, sz;
char *vaddr;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
block = bp->bio_pblkno;
sz = part->disk->d_sectorsize;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
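/*
 * Carve the request up into chunks of at most sc->max_data sectors.
 * Multi-block chunks are issued as READ/WRITE_MULTIPLE_BLOCK with a
 * STOP_TRANSMISSION attached to the request; for byte-addressed
 * (non-high-capacity) devices the block number is converted to a
 * byte address by shifting left by 9.
 */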
while (block < end) {
vaddr = bp->bio_data + (block - bp->bio_pblkno) * sz;
numblocks = min(end - block, sc->max_data);
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
memset(&stop, 0, sizeof(stop));
memset(&data, 0, sizeof(data));
cmd.mrq = &req;
req.cmd = &cmd;
cmd.data = &data;
if (bp->bio_cmd == BIO_READ) {
if (numblocks > 1)
cmd.opcode = MMC_READ_MULTIPLE_BLOCK;
else
cmd.opcode = MMC_READ_SINGLE_BLOCK;
} else {
if (numblocks > 1)
cmd.opcode = MMC_WRITE_MULTIPLE_BLOCK;
else
cmd.opcode = MMC_WRITE_BLOCK;
}
cmd.arg = block;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
data.data = vaddr;
data.mrq = &req;
if (bp->bio_cmd == BIO_READ)
data.flags = MMC_DATA_READ;
else
data.flags = MMC_DATA_WRITE;
data.len = numblocks * sz;
if (numblocks > 1) {
data.flags |= MMC_DATA_MULTI;
stop.opcode = MMC_STOP_TRANSMISSION;
stop.arg = 0;
stop.flags = MMC_RSP_R1B | MMC_CMD_AC;
stop.mrq = &req;
req.stop = &stop;
}
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count,
LOG_PPS))
device_printf(dev, "Error indicated: %d %s\n",
req.cmd->error,
mmcsd_errmsg(req.cmd->error));
break;
}
block += numblocks;
}
return (block);
}
static daddr_t
mmcsd_delete(struct mmcsd_part *part, struct bio *bp)
{
daddr_t block, end, start, stop;
struct mmc_command cmd;
struct mmc_request req;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
u_int erase_sector, sz;
int err;
bool use_trim;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
block = bp->bio_pblkno;
sz = part->disk->d_sectorsize;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
use_trim = sc->flags & MMCSD_USE_TRIM;
if (use_trim == true) {
start = block;
stop = end;
} else {
/* Coalesce with the remainder of the previous request. */
if (block > part->eblock && block <= part->eend)
block = part->eblock;
if (end >= part->eblock && end < part->eend)
end = part->eend;
/* Safely round to the erase sector boundaries. */
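/*
 * For example (illustrative numbers, no leftover from a previous
 * delete): with erase_sector = 1024 and a request covering blocks
 * 1000-4999 (end = 5000), start rounds up to 1024 and stop rounds
 * down to 4096, so blocks 1024-4095 get erased now while the tail
 * 4096-4999 is remembered in eblock/eend for coalescing with a
 * future adjacent delete; the head 1000-1023 is left alone.
 */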
erase_sector = sc->erase_sector;
start = block + erase_sector - 1; /* Round up. */
start -= start % erase_sector;
stop = end; /* Round down. */
stop -= end % erase_sector;
/*
* We can't erase an area smaller than an erase sector, so
* store it for later.
*/
if (start >= stop) {
part->eblock = block;
part->eend = end;
return (end);
}
}
if ((sc->flags & MMCSD_INAND_CMD38) != 0) {
err = mmc_switch(mmcbus, dev, sc->rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_INAND_CMD38, use_trim == true ?
EXT_CSD_INAND_CMD38_TRIM : EXT_CSD_INAND_CMD38_ERASE,
sc->cmd6_time, true);
if (err != MMC_ERR_NONE) {
device_printf(dev,
"Setting iNAND erase command failed %s\n",
mmcsd_errmsg(err));
return (block);
}
}
/*
* Pause re-tuning so it won't interfere with the sequence of erase
* commands.  Note that the latter don't use the data lines, so
* re-tuning shouldn't actually become necessary during the erase.
*/
MMCBUS_RETUNE_PAUSE(mmcbus, dev, false);
/* Set erase start position. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
cmd.mrq = &req;
req.cmd = &cmd;
if (mmc_get_card_type(dev) == mode_sd)
cmd.opcode = SD_ERASE_WR_BLK_START;
else
cmd.opcode = MMC_ERASE_GROUP_START;
cmd.arg = start;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Setting erase start position failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
/* Set erase stop position. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
req.cmd = &cmd;
if (mmc_get_card_type(dev) == mode_sd)
cmd.opcode = SD_ERASE_WR_BLK_END;
else
cmd.opcode = MMC_ERASE_GROUP_END;
cmd.arg = stop;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.arg--;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Setting erase stop position failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
/* Erase range. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
req.cmd = &cmd;
cmd.opcode = MMC_ERASE;
cmd.arg = use_trim == true ? MMC_ERASE_TRIM : MMC_ERASE_ERASE;
cmd.flags = MMC_RSP_R1B | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Issuing erase command failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
if (use_trim == false) {
/* Store one of the remaining parts for the next call. */
if (bp->bio_pblkno >= part->eblock || block == start) {
part->eblock = stop; /* Predict next forward. */
part->eend = end;
} else {
part->eblock = block; /* Predict next backward. */
part->eend = start;
}
}
block = end;
unpause:
MMCBUS_RETUNE_UNPAUSE(mmcbus, dev);
return (block);
}
static int
mmcsd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
size_t length)
{
struct bio bp;
daddr_t block, end;
struct disk *disk;
struct mmcsd_softc *sc;
struct mmcsd_part *part;
device_t dev, mmcbus;
int err;
/* length zero is special and really means flush buffers to media */
if (!length)
return (0);
disk = arg;
part = disk->d_drv1;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
g_reset_bio(&bp);
bp.bio_disk = disk;
bp.bio_pblkno = offset / disk->d_sectorsize;
bp.bio_bcount = length;
bp.bio_data = virtual;
bp.bio_cmd = BIO_WRITE;
end = bp.bio_pblkno + bp.bio_bcount / disk->d_sectorsize;
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmcsd_switch_part(mmcbus, dev, sc->rca, part->type);
if (err != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count, LOG_PPS))
device_printf(dev, "Partition switch error\n");
MMCBUS_RELEASE_BUS(mmcbus, dev);
return (EIO);
}
block = mmcsd_rw(part, &bp);
MMCBUS_RELEASE_BUS(mmcbus, dev);
return ((end < block) ? EIO : 0);
}
static void
mmcsd_task(void *arg)
{
daddr_t block, end;
struct mmcsd_part *part;
struct mmcsd_softc *sc;
struct bio *bp;
device_t dev, mmcbus;
int err, sz;
part = arg;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
while (1) {
MMCSD_DISK_LOCK(part);
do {
if (part->running == 0)
goto out;
bp = bioq_takefirst(&part->bio_queue);
if (bp == NULL)
msleep(part, &part->disk_mtx, PRIBIO,
"mmcsd disk jobqueue", 0);
} while (bp == NULL);
MMCSD_DISK_UNLOCK(part);
if (bp->bio_cmd != BIO_READ && part->ro) {
bp->bio_error = EROFS;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
biodone(bp);
continue;
}
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
sz = part->disk->d_sectorsize;
block = bp->bio_pblkno;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
err = mmcsd_switch_part(mmcbus, dev, sc->rca, part->type);
if (err != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count,
LOG_PPS))
device_printf(dev, "Partition switch error\n");
goto release;
}
if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
/* Access to the remaining erase block obsoletes it. */
if (block < part->eend && end > part->eblock)
part->eblock = part->eend = 0;
block = mmcsd_rw(part, bp);
} else if (bp->bio_cmd == BIO_DELETE) {
block = mmcsd_delete(part, bp);
}
release:
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (block < end) {
bp->bio_error = EIO;
bp->bio_resid = (end - block) * sz;
bp->bio_flags |= BIO_ERROR;
} else {
bp->bio_resid = 0;
}
biodone(bp);
}
out:
/* tell parent we're done */
part->running = -1;
MMCSD_DISK_UNLOCK(part);
wakeup(part);
kproc_exit(0);
}
static int
mmcsd_bus_bit_width(device_t dev)
{
if (mmc_get_bus_width(dev) == bus_width_1)
return (1);
if (mmc_get_bus_width(dev) == bus_width_4)
return (4);
return (8);
}
static device_method_t mmcsd_methods[] = {
DEVMETHOD(device_probe, mmcsd_probe),
DEVMETHOD(device_attach, mmcsd_attach),
DEVMETHOD(device_detach, mmcsd_detach),
DEVMETHOD(device_suspend, mmcsd_suspend),
DEVMETHOD(device_resume, mmcsd_resume),
DEVMETHOD_END
};
static driver_t mmcsd_driver = {
"mmcsd",
mmcsd_methods,
sizeof(struct mmcsd_softc),
};
static devclass_t mmcsd_devclass;
static int
mmcsd_handler(module_t mod __unused, int what, void *arg __unused)
{
switch (what) {
case MOD_LOAD:
flash_register_slicer(mmcsd_slicer, FLASH_SLICES_TYPE_MMC,
TRUE);
return (0);
case MOD_UNLOAD:
flash_register_slicer(NULL, FLASH_SLICES_TYPE_MMC, TRUE);
return (0);
}
return (0);
}
DRIVER_MODULE(mmcsd, mmc, mmcsd_driver, mmcsd_devclass, mmcsd_handler, NULL);
MODULE_DEPEND(mmcsd, g_flashmap, 0, 0, 0);
MMC_DEPEND(mmcsd);
Index: head/sys/dev/ofw/ofw_bus_subr.c
===================================================================
--- head/sys/dev/ofw/ofw_bus_subr.c (revision 327172)
+++ head/sys/dev/ofw/ofw_bus_subr.c (revision 327173)
@@ -1,964 +1,962 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2001 - 2003 by Thomas Moestl <tmm@FreeBSD.org>.
* Copyright (c) 2005 Marius Strobl <marius@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/errno.h>
#include <sys/libkern.h>
#include <machine/resource.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/openfirm.h>
#include "ofw_bus_if.h"
#define OFW_COMPAT_LEN 255
#define OFW_STATUS_LEN 16
int
ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *obd, phandle_t node)
{
if (obd == NULL)
return (ENOMEM);
/* The 'name' property is considered mandatory. */
if ((OF_getprop_alloc(node, "name", 1, (void **)&obd->obd_name)) == -1)
return (EINVAL);
OF_getprop_alloc(node, "compatible", 1, (void **)&obd->obd_compat);
OF_getprop_alloc(node, "device_type", 1, (void **)&obd->obd_type);
OF_getprop_alloc(node, "model", 1, (void **)&obd->obd_model);
OF_getprop_alloc(node, "status", 1, (void **)&obd->obd_status);
obd->obd_node = node;
return (0);
}
void
ofw_bus_gen_destroy_devinfo(struct ofw_bus_devinfo *obd)
{
if (obd == NULL)
return;
if (obd->obd_compat != NULL)
free(obd->obd_compat, M_OFWPROP);
if (obd->obd_model != NULL)
free(obd->obd_model, M_OFWPROP);
if (obd->obd_name != NULL)
free(obd->obd_name, M_OFWPROP);
if (obd->obd_type != NULL)
free(obd->obd_type, M_OFWPROP);
if (obd->obd_status != NULL)
free(obd->obd_status, M_OFWPROP);
}
int
ofw_bus_gen_child_pnpinfo_str(device_t cbdev, device_t child, char *buf,
size_t buflen)
{
if (ofw_bus_get_name(child) != NULL) {
strlcat(buf, "name=", buflen);
strlcat(buf, ofw_bus_get_name(child), buflen);
}
if (ofw_bus_get_compat(child) != NULL) {
strlcat(buf, " compat=", buflen);
strlcat(buf, ofw_bus_get_compat(child), buflen);
}
return (0);
};
const char *
ofw_bus_gen_get_compat(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_compat);
}
const char *
ofw_bus_gen_get_model(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_model);
}
const char *
ofw_bus_gen_get_name(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_name);
}
phandle_t
ofw_bus_gen_get_node(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (0);
return (obd->obd_node);
}
const char *
ofw_bus_gen_get_type(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_type);
}
const char *
ofw_bus_get_status(device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(device_get_parent(dev), dev);
if (obd == NULL)
return (NULL);
return (obd->obd_status);
}
int
ofw_bus_status_okay(device_t dev)
{
const char *status;
status = ofw_bus_get_status(dev);
if (status == NULL || strcmp(status, "okay") == 0 ||
strcmp(status, "ok") == 0)
return (1);
return (0);
}
int
ofw_bus_node_status_okay(phandle_t node)
{
char status[OFW_STATUS_LEN];
int len;
len = OF_getproplen(node, "status");
if (len <= 0)
return (1);
OF_getprop(node, "status", status, OFW_STATUS_LEN);
if ((len == 5 && (bcmp(status, "okay", len) == 0)) ||
(len == 3 && (bcmp(status, "ok", len) == 0)))
return (1);
return (0);
}
static int
ofw_bus_node_is_compatible_int(const char *compat, int len,
const char *onecompat)
{
int onelen, l, ret;
onelen = strlen(onecompat);
ret = 0;
while (len > 0) {
if (strlen(compat) == onelen &&
strncasecmp(compat, onecompat, onelen) == 0) {
/* Found it. */
ret = 1;
break;
}
/* Slide to the next sub-string. */
l = strlen(compat) + 1;
compat += l;
len -= l;
}
return (ret);
}
int
ofw_bus_node_is_compatible(phandle_t node, const char *compatstr)
{
char compat[OFW_COMPAT_LEN];
int len, rv;
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
bzero(compat, OFW_COMPAT_LEN);
if (OF_getprop(node, "compatible", compat, OFW_COMPAT_LEN) < 0)
return (0);
rv = ofw_bus_node_is_compatible_int(compat, len, compatstr);
return (rv);
}
int
ofw_bus_is_compatible(device_t dev, const char *onecompat)
{
phandle_t node;
const char *compat;
int len;
if ((compat = ofw_bus_get_compat(dev)) == NULL)
return (0);
if ((node = ofw_bus_get_node(dev)) == -1)
return (0);
/* Get total 'compatible' prop len */
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
return (ofw_bus_node_is_compatible_int(compat, len, onecompat));
}
int
ofw_bus_is_compatible_strict(device_t dev, const char *compatible)
{
const char *compat;
size_t len;
if ((compat = ofw_bus_get_compat(dev)) == NULL)
return (0);
len = strlen(compatible);
if (strlen(compat) == len &&
strncasecmp(compat, compatible, len) == 0)
return (1);
return (0);
}
const struct ofw_compat_data *
ofw_bus_search_compatible(device_t dev, const struct ofw_compat_data *compat)
{
if (compat == NULL)
return NULL;
for (; compat->ocd_str != NULL; ++compat) {
if (ofw_bus_is_compatible(dev, compat->ocd_str))
break;
}
return (compat);
}
int
ofw_bus_has_prop(device_t dev, const char *propname)
{
phandle_t node;
if ((node = ofw_bus_get_node(dev)) == -1)
return (0);
return (OF_hasprop(node, propname));
}
void
ofw_bus_setup_iinfo(phandle_t node, struct ofw_bus_iinfo *ii, int intrsz)
{
pcell_t addrc;
int msksz;
if (OF_getencprop(node, "#address-cells", &addrc, sizeof(addrc)) == -1)
addrc = 2;
ii->opi_addrc = addrc * sizeof(pcell_t);
ii->opi_imapsz = OF_getencprop_alloc(node, "interrupt-map", 1,
(void **)&ii->opi_imap);
if (ii->opi_imapsz > 0) {
msksz = OF_getencprop_alloc(node, "interrupt-map-mask", 1,
(void **)&ii->opi_imapmsk);
/*
* Failure to get the mask is ignored; a full mask is used
* then. We barf on bad mask sizes, however.
*/
if (msksz != -1 && msksz != ii->opi_addrc + intrsz)
panic("ofw_bus_setup_iinfo: bad interrupt-map-mask "
"property!");
}
}
int
ofw_bus_lookup_imap(phandle_t node, struct ofw_bus_iinfo *ii, void *reg,
int regsz, void *pintr, int pintrsz, void *mintr, int mintrsz,
phandle_t *iparent)
{
uint8_t maskbuf[regsz + pintrsz];
int rv;
if (ii->opi_imapsz <= 0)
return (0);
KASSERT(regsz >= ii->opi_addrc,
("ofw_bus_lookup_imap: register size too small: %d < %d",
regsz, ii->opi_addrc));
if (node != -1) {
rv = OF_getencprop(node, "reg", reg, regsz);
if (rv < regsz)
panic("ofw_bus_lookup_imap: cannot get reg property");
}
return (ofw_bus_search_intrmap(pintr, pintrsz, reg, ii->opi_addrc,
ii->opi_imap, ii->opi_imapsz, ii->opi_imapmsk, maskbuf, mintr,
mintrsz, iparent));
}
/*
* Map an interrupt using the firmware reg, interrupt-map and
* interrupt-map-mask properties.
* The interrupt property to be mapped must be of size intrsz, and pointed to
* by intr. The regs property of the node for which the mapping is done must
* be passed as regs. This property is an array of register specifications;
* the size of the address part of such a specification must be passed as
* physsz. Only the first element of the property is used.
* imap and imapsz hold the interrupt map and its size.
* imapmsk is a pointer to the interrupt-map-mask property, which must have
* a size of physsz + intrsz; it may be NULL, in which case a full mask is
* assumed.
* maskbuf must point to a buffer of length physsz + intrsz.
* The interrupt is returned in result, which must point to a buffer of length
* rintrsz (which gives the expected size of the mapped interrupt).
* Returns number of cells in the interrupt if a mapping was found, 0 otherwise.
*/
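/*
 * Each interrupt-map entry processed below thus consists of: the
 * child unit address (physsz bytes), the child interrupt specifier
 * (intrsz bytes), the parent phandle, optionally the parent's
 * #address-cells worth of address data (paddrsz bytes) and finally
 * the parent interrupt specifier (pintrsz bytes); tsz is the sum of
 * these.
 */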
int
ofw_bus_search_intrmap(void *intr, int intrsz, void *regs, int physsz,
void *imap, int imapsz, void *imapmsk, void *maskbuf, void *result,
int rintrsz, phandle_t *iparent)
{
phandle_t parent;
uint8_t *ref = maskbuf;
uint8_t *uiintr = intr;
uint8_t *uiregs = regs;
uint8_t *uiimapmsk = imapmsk;
uint8_t *mptr;
pcell_t paddrsz;
pcell_t pintrsz;
- int i, rsz, tsz;
+ int i, tsz;
- rsz = -1;
if (imapmsk != NULL) {
for (i = 0; i < physsz; i++)
ref[i] = uiregs[i] & uiimapmsk[i];
for (i = 0; i < intrsz; i++)
ref[physsz + i] = uiintr[i] & uiimapmsk[physsz + i];
} else {
bcopy(regs, ref, physsz);
bcopy(intr, ref + physsz, intrsz);
}
mptr = imap;
i = imapsz;
paddrsz = 0;
while (i > 0) {
bcopy(mptr + physsz + intrsz, &parent, sizeof(parent));
#ifndef OFW_IMAP_NO_IPARENT_ADDR_CELLS
/*
* Find if we need to read the parent address data.
* CHRP-derived OF bindings, including ePAPR-compliant FDTs,
* use this as an optional part of the specifier.
*/
if (OF_getencprop(OF_node_from_xref(parent),
"#address-cells", &paddrsz, sizeof(paddrsz)) == -1)
paddrsz = 0; /* default */
paddrsz *= sizeof(pcell_t);
#endif
if (OF_searchencprop(OF_node_from_xref(parent),
"#interrupt-cells", &pintrsz, sizeof(pintrsz)) == -1)
pintrsz = 1; /* default */
pintrsz *= sizeof(pcell_t);
/* Compute the map stride size. */
tsz = physsz + intrsz + sizeof(phandle_t) + paddrsz + pintrsz;
KASSERT(i >= tsz, ("ofw_bus_search_intrmap: truncated map"));
if (bcmp(ref, mptr, physsz + intrsz) == 0) {
bcopy(mptr + physsz + intrsz + sizeof(parent) + paddrsz,
result, MIN(rintrsz, pintrsz));
if (iparent != NULL)
*iparent = parent;
return (pintrsz/sizeof(pcell_t));
}
mptr += tsz;
i -= tsz;
}
return (0);
}
int
ofw_bus_msimap(phandle_t node, uint16_t pci_rid, phandle_t *msi_parent,
uint32_t *msi_rid)
{
pcell_t *map, mask, msi_base, rid_base, rid_length;
ssize_t len;
- uint32_t masked_rid, rid;
+ uint32_t masked_rid;
int err, i;
/* TODO: This should be OF_searchprop_alloc if we had it */
len = OF_getencprop_alloc(node, "msi-map", sizeof(*map), (void **)&map);
if (len < 0) {
if (msi_parent != NULL) {
*msi_parent = 0;
OF_getencprop(node, "msi-parent", msi_parent,
sizeof(*msi_parent));
}
if (msi_rid != NULL)
*msi_rid = pci_rid;
return (0);
}
err = ENOENT;
- rid = 0;
mask = 0xffffffff;
OF_getencprop(node, "msi-map-mask", &mask, sizeof(mask));
masked_rid = pci_rid & mask;
for (i = 0; i < len; i += 4) {
rid_base = map[i + 0];
rid_length = map[i + 3];
if (masked_rid < rid_base ||
masked_rid >= (rid_base + rid_length))
continue;
msi_base = map[i + 2];
if (msi_parent != NULL)
*msi_parent = map[i + 1];
if (msi_rid != NULL)
*msi_rid = masked_rid - rid_base + msi_base;
err = 0;
break;
}
free(map, M_OFWPROP);
return (err);
}
int
ofw_bus_reg_to_rl(device_t dev, phandle_t node, pcell_t acells, pcell_t scells,
struct resource_list *rl)
{
uint64_t phys, size;
ssize_t i, j, rid, nreg, ret;
uint32_t *reg;
char *name;
/*
* This may be redundant when ofw_bus_devinfo is available, but it
* makes this routine independent of it.
*/
ret = OF_getprop_alloc(node, "name", sizeof(*name), (void **)&name);
if (ret == -1)
name = NULL;
ret = OF_getencprop_alloc(node, "reg", sizeof(*reg), (void **)&reg);
nreg = (ret == -1) ? 0 : ret;
if (nreg % (acells + scells) != 0) {
if (bootverbose)
device_printf(dev, "Malformed reg property on <%s>\n",
(name == NULL) ? "unknown" : name);
nreg = 0;
}
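/*
 * Decode each (address, size) tuple into a 64-bit base and length.
 * For example (illustrative): with acells = 2 and scells = 1, a reg
 * value of <0x0 0x10000000 0x1000> becomes the memory resource
 * 0x10000000-0x10000fff.
 */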
for (i = 0, rid = 0; i < nreg; i += acells + scells, rid++) {
phys = size = 0;
for (j = 0; j < acells; j++) {
phys <<= 32;
phys |= reg[i + j];
}
for (j = 0; j < scells; j++) {
size <<= 32;
size |= reg[i + acells + j];
}
/* Skip the dummy reg property of glue devices like ssm(4). */
if (size != 0)
resource_list_add(rl, SYS_RES_MEMORY, rid,
phys, phys + size - 1, size);
}
free(name, M_OFWPROP);
free(reg, M_OFWPROP);
return (0);
}
/*
* Get interrupt parent for given node.
* Returns 0 if interrupt parent doesn't exist.
*/
phandle_t
ofw_bus_find_iparent(phandle_t node)
{
phandle_t iparent;
if (OF_searchencprop(node, "interrupt-parent", &iparent,
sizeof(iparent)) == -1) {
for (iparent = node; iparent != 0;
iparent = OF_parent(iparent)) {
if (OF_hasprop(iparent, "interrupt-controller"))
break;
}
iparent = OF_xref_from_node(iparent);
}
return (iparent);
}
int
ofw_bus_intr_to_rl(device_t dev, phandle_t node,
struct resource_list *rl, int *rlen)
{
phandle_t iparent;
uint32_t icells, *intr;
int err, i, irqnum, nintr, rid;
boolean_t extended;
nintr = OF_getencprop_alloc(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
iparent = ofw_bus_find_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
}
if (icells < 1 || icells > nintr) {
device_printf(dev, "Invalid #interrupt-cells property "
"value <%d>, assuming <1>\n", icells);
icells = 1;
}
extended = false;
} else {
nintr = OF_getencprop_alloc(node, "interrupts-extended",
sizeof(*intr), (void **)&intr);
if (nintr <= 0)
return (0);
extended = true;
}
err = 0;
rid = 0;
for (i = 0; i < nintr; i += icells) {
if (extended) {
iparent = intr[i++];
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
err = ENOENT;
break;
}
if (icells < 1 || (i + icells) > nintr) {
device_printf(dev, "Invalid #interrupt-cells "
"property value <%d>\n", icells);
err = ERANGE;
break;
}
}
irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]);
resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1);
}
if (rlen != NULL)
*rlen = rid;
free(intr, M_OFWPROP);
return (err);
}
int
ofw_bus_intr_by_rid(device_t dev, phandle_t node, int wanted_rid,
phandle_t *producer, int *ncells, pcell_t **cells)
{
phandle_t iparent;
uint32_t icells, *intr;
int err, i, nintr, rid;
boolean_t extended;
nintr = OF_getencprop_alloc(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
iparent = ofw_bus_find_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
}
if (icells < 1 || icells > nintr) {
device_printf(dev, "Invalid #interrupt-cells property "
"value <%d>, assuming <1>\n", icells);
icells = 1;
}
extended = false;
} else {
nintr = OF_getencprop_alloc(node, "interrupts-extended",
sizeof(*intr), (void **)&intr);
if (nintr <= 0)
return (ESRCH);
extended = true;
}
err = ESRCH;
rid = 0;
for (i = 0; i < nintr; i += icells, rid++) {
if (extended) {
iparent = intr[i++];
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
err = ENOENT;
break;
}
if (icells < 1 || (i + icells) > nintr) {
device_printf(dev, "Invalid #interrupt-cells "
"property value <%d>\n", icells);
err = ERANGE;
break;
}
}
if (rid == wanted_rid) {
*cells = malloc(icells * sizeof(**cells), M_OFWPROP,
M_WAITOK);
*producer = iparent;
*ncells= icells;
memcpy(*cells, intr + i, icells * sizeof(**cells));
err = 0;
break;
}
}
free(intr, M_OFWPROP);
return (err);
}

phandle_t
ofw_bus_find_child(phandle_t start, const char *child_name)
{
char *name;
int ret;
phandle_t child;
for (child = OF_child(start); child != 0; child = OF_peer(child)) {
ret = OF_getprop_alloc(child, "name", sizeof(*name), (void **)&name);
if (ret == -1)
continue;
if (strcmp(name, child_name) == 0) {
free(name, M_OFWPROP);
return (child);
}
free(name, M_OFWPROP);
}
return (0);
}
phandle_t
ofw_bus_find_compatible(phandle_t node, const char *onecompat)
{
phandle_t child, ret;
/*
* Traverse the subtree below 'node' and find the first node with a
* matching 'compatible' property.
*/
for (child = OF_child(node); child != 0; child = OF_peer(child)) {
if (ofw_bus_node_is_compatible(child, onecompat) != 0)
return (child);
ret = ofw_bus_find_compatible(child, onecompat);
if (ret != 0)
return (ret);
}
return (0);
}
/**
* @brief Return child of bus whose phandle is node
*
* A direct child of @p bus will be returned if its phandle in the
* OFW tree is @p node. Otherwise, NULL is returned.
*
* @param bus The bus to examine
* @param node The phandle_t to look for.
*/
device_t
ofw_bus_find_child_device_by_phandle(device_t bus, phandle_t node)
{
device_t *children, retval, child;
int nkid, i;
/*
* Nothing can match the flag value for no node.
*/
if (node == -1)
return (NULL);
/*
* Search the children for a match. We microoptimize
* a bit by not using ofw_bus_get since we already know
* the parent. We do not recurse.
*/
if (device_get_children(bus, &children, &nkid) != 0)
return (NULL);
retval = NULL;
for (i = 0; i < nkid; i++) {
child = children[i];
if (OFW_BUS_GET_NODE(bus, child) == node) {
retval = child;
break;
}
}
free(children, M_TEMP);
return (retval);
}
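/*
 * Usage sketch (illustrative only; "icbus" and "xref" are hypothetical): a
 * provider that received an xref can map it back to the device_t of the
 * matching child on its own bus.
 *
 *	phandle_t node = OF_node_from_xref(xref);
 *	device_t child = ofw_bus_find_child_device_by_phandle(icbus, node);
 *
 *	if (child != NULL)
 *		... dispatch to the child's driver ...
 */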
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties).
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* idx - the index of the requested list entry, or, if -1, an indication
* to return the number of entries in the parsed list.
* Output arguments:
* producer - handle of producer
* ncells - number of cells in result or the number of items in the list when
* idx == -1.
* cells - array of decoded cells
*/
static int
ofw_bus_parse_xref_list_internal(phandle_t node, const char *list_name,
const char *cells_name, int idx, phandle_t *producer, int *ncells,
pcell_t **cells)
{
phandle_t pnode;
phandle_t *elems;
uint32_t pcells;
int rv, i, j, nelems, cnt;
elems = NULL;
nelems = OF_getencprop_alloc(node, list_name, sizeof(*elems),
(void **)&elems);
if (nelems <= 0)
return (ENOENT);
rv = (idx == -1) ? 0 : ENOENT;
for (i = 0, cnt = 0; i < nelems; i += pcells, cnt++) {
pnode = elems[i++];
if (OF_getencprop(OF_node_from_xref(pnode),
cells_name, &pcells, sizeof(pcells)) == -1) {
printf("Missing %s property\n", cells_name);
rv = ENOENT;
break;
}
if ((i + pcells) > nelems) {
printf("Invalid %s property value <%d>\n", cells_name,
pcells);
rv = ERANGE;
break;
}
if (cnt == idx) {
*cells = malloc(pcells * sizeof(**cells), M_OFWPROP,
M_WAITOK);
*producer = pnode;
*ncells = pcells;
for (j = 0; j < pcells; j++)
(*cells)[j] = elems[i + j];
rv = 0;
break;
}
}
if (elems != NULL)
free(elems, M_OFWPROP);
if (idx == -1 && rv == 0)
*ncells = cnt;
return (rv);
}
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties).
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* idx - the index of the requested list entry (>= 0)
* Output arguments:
* producer - handle of producer
* ncells - number of cells in result
* cells - array of decoded cells
*/
int
ofw_bus_parse_xref_list_alloc(phandle_t node, const char *list_name,
const char *cells_name, int idx, phandle_t *producer, int *ncells,
pcell_t **cells)
{
KASSERT(idx >= 0,
("ofw_bus_parse_xref_list_alloc: negative index supplied"));
return (ofw_bus_parse_xref_list_internal(node, list_name, cells_name,
idx, producer, ncells, cells));
}
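/*
 * Usage sketch (illustrative only; error handling abbreviated and variable
 * names hypothetical): a consumer driver can resolve the first specifier of
 * its "resets" property like this.
 *
 *	phandle_t producer;
 *	int ncells;
 *	pcell_t *cells;
 *
 *	if (ofw_bus_parse_xref_list_alloc(node, "resets", "#reset-cells",
 *	    0, &producer, &ncells, &cells) == 0) {
 *		... hand producer/cells to the reset framework ...
 *		free(cells, M_OFWPROP);
 *	}
 */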
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties)
* and determine the number of items in the list.
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* Output arguments:
* count - number of items in list
*/
int
ofw_bus_parse_xref_list_get_length(phandle_t node, const char *list_name,
const char *cells_name, int *count)
{
return (ofw_bus_parse_xref_list_internal(node, list_name, cells_name,
-1, NULL, count, NULL));
}
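/*
 * Usage sketch (illustrative only; "nclocks" is a hypothetical name): the
 * length variant is handy for sizing per-entry bookkeeping before walking
 * the list entry by entry with ofw_bus_parse_xref_list_alloc().
 *
 *	int nclocks;
 *
 *	if (ofw_bus_parse_xref_list_get_length(node, "clocks",
 *	    "#clock-cells", &nclocks) == 0)
 *		device_printf(dev, "%d clock(s) referenced\n", nclocks);
 */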
/*
* Find the index of a string in a string list property (case sensitive).
*/
int
ofw_bus_find_string_index(phandle_t node, const char *list_name,
const char *name, int *idx)
{
char *elems;
int rv, i, cnt, nelems;
elems = NULL;
nelems = OF_getprop_alloc(node, list_name, 1, (void **)&elems);
if (nelems <= 0)
return (ENOENT);
rv = ENOENT;
for (i = 0, cnt = 0; i < nelems; cnt++) {
if (strcmp(elems + i, name) == 0) {
*idx = cnt;
rv = 0;
break;
}
i += strlen(elems + i) + 1;
}
if (elems != NULL)
free(elems, M_OFWPROP);
return (rv);
}
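/*
 * Usage sketch (illustrative only): the usual pairing is a "*-names"
 * property selecting an index into the matching specifier list, e.g.
 * finding which "clocks" entry is named "bus".
 *
 *	int idx;
 *
 *	if (ofw_bus_find_string_index(node, "clock-names", "bus", &idx) == 0)
 *		... parse entry idx of the "clocks" property ...
 */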
/*
* Create a zero-terminated array of strings from a string list property.
*/
int
ofw_bus_string_list_to_array(phandle_t node, const char *list_name,
const char ***out_array)
{
char *elems, *tptr;
const char **array;
int i, cnt, nelems, len;
elems = NULL;
nelems = OF_getprop_alloc(node, list_name, 1, (void **)&elems);
if (nelems <= 0)
return (nelems);
/* Count number of strings. */
for (i = 0, cnt = 0; i < nelems; cnt++)
i += strlen(elems + i) + 1;
/* Allocate space for arrays and all strings. */
array = malloc((cnt + 1) * sizeof(char *) + nelems, M_OFWPROP,
M_WAITOK);
/* Get address of first string. */
tptr = (char *)(array + cnt + 1);
/* Copy strings. */
memcpy(tptr, elems, nelems);
free(elems, M_OFWPROP);
/* Fill string pointers. */
for (i = 0, cnt = 0; i < nelems; cnt++) {
len = strlen(tptr) + 1;
array[cnt] = tptr;
i += len;
tptr += len;
}
array[cnt] = NULL;
*out_array = array;
return (cnt);
}
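/*
 * Usage sketch (illustrative only; "names" is a hypothetical variable): the
 * returned pointer array shares one allocation with the copied strings, so
 * a single free() of the array releases everything.
 *
 *	const char **names;
 *	int i, n;
 *
 *	n = ofw_bus_string_list_to_array(node, "clock-output-names", &names);
 *	for (i = 0; i < n; i++)
 *		printf("output %d: %s\n", i, names[i]);
 *	if (n > 0)
 *		free(names, M_OFWPROP);
 */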
Index: head/sys/dev/ofw/ofwpci.c
===================================================================
--- head/sys/dev/ofw/ofwpci.c (revision 327172)
+++ head/sys/dev/ofw/ofwpci.c (revision 327173)
@@ -1,678 +1,675 @@
/*-
* Copyright (c) 2011 Nathan Whitehorn
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_pci.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/ofwpci.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcib_private.h>
#include <machine/bus.h>
#include <machine/md_var.h>
#include <machine/resource.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include "pcib_if.h"
/*
* If it is necessary to set another value of this for
* some platforms, it should be set in the fdt.h file.
*/
#ifndef PCI_MAP_INTR
#define PCI_MAP_INTR 4
#endif
#define PCI_INTR_PINS 4
/*
* bus interface.
*/
static struct resource * ofw_pci_alloc_resource(device_t, device_t,
int, int *, rman_res_t, rman_res_t, rman_res_t, u_int);
static int ofw_pci_release_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_activate_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_deactivate_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_adjust_resource(device_t, device_t, int,
struct resource *, rman_res_t, rman_res_t);
#ifdef __powerpc__
static bus_space_tag_t ofw_pci_bus_get_bus_tag(device_t, device_t);
#endif
/*
* pcib interface
*/
static int ofw_pci_maxslots(device_t);
/*
* ofw_bus interface
*/
static phandle_t ofw_pci_get_node(device_t, device_t);
/*
* local methods
*/
static int ofw_pci_fill_ranges(phandle_t, struct ofw_pci_range *);
static struct rman *ofw_pci_get_rman(struct ofw_pci_softc *, int, u_int);
/*
* Driver methods.
*/
static device_method_t ofw_pci_methods[] = {
/* Device interface */
DEVMETHOD(device_attach, ofw_pci_attach),
/* Bus interface */
DEVMETHOD(bus_print_child, bus_generic_print_child),
DEVMETHOD(bus_read_ivar, ofw_pci_read_ivar),
DEVMETHOD(bus_write_ivar, ofw_pci_write_ivar),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
DEVMETHOD(bus_alloc_resource, ofw_pci_alloc_resource),
DEVMETHOD(bus_release_resource, ofw_pci_release_resource),
DEVMETHOD(bus_activate_resource, ofw_pci_activate_resource),
DEVMETHOD(bus_deactivate_resource, ofw_pci_deactivate_resource),
DEVMETHOD(bus_adjust_resource, ofw_pci_adjust_resource),
#ifdef __powerpc__
DEVMETHOD(bus_get_bus_tag, ofw_pci_bus_get_bus_tag),
#endif
/* pcib interface */
DEVMETHOD(pcib_maxslots, ofw_pci_maxslots),
DEVMETHOD(pcib_route_interrupt, ofw_pci_route_interrupt),
DEVMETHOD(pcib_request_feature, pcib_request_feature_allow),
/* ofw_bus interface */
DEVMETHOD(ofw_bus_get_node, ofw_pci_get_node),
DEVMETHOD_END
};
DEFINE_CLASS_0(ofw_pci, ofw_pci_driver, ofw_pci_methods, 0);
int
ofw_pci_init(device_t dev)
{
struct ofw_pci_softc *sc;
phandle_t node;
u_int32_t busrange[2];
struct ofw_pci_range *rp;
int i, error;
struct ofw_pci_cell_info *cell_info;
node = ofw_bus_get_node(dev);
sc = device_get_softc(dev);
sc->sc_initialized = 1;
sc->sc_range = NULL;
sc->sc_pci_domain = device_get_unit(dev);
cell_info = (struct ofw_pci_cell_info *)malloc(sizeof(*cell_info),
M_DEVBUF, M_WAITOK | M_ZERO);
sc->sc_cell_info = cell_info;
if (OF_getencprop(node, "bus-range", busrange, sizeof(busrange)) != 8)
busrange[0] = 0;
sc->sc_dev = dev;
sc->sc_node = node;
sc->sc_bus = busrange[0];
if (sc->sc_quirks & OFW_PCI_QUIRK_RANGES_ON_CHILDREN) {
phandle_t c;
int n, i;
sc->sc_nrange = 0;
for (c = OF_child(node); c != 0; c = OF_peer(c)) {
n = ofw_pci_nranges(c, cell_info);
if (n > 0)
sc->sc_nrange += n;
}
if (sc->sc_nrange == 0) {
error = ENXIO;
goto out;
}
sc->sc_range = malloc(sc->sc_nrange * sizeof(sc->sc_range[0]),
M_DEVBUF, M_WAITOK);
i = 0;
for (c = OF_child(node); c != 0; c = OF_peer(c)) {
n = ofw_pci_fill_ranges(c, &sc->sc_range[i]);
if (n > 0)
i += n;
}
KASSERT(i == sc->sc_nrange, ("range count mismatch"));
} else {
sc->sc_nrange = ofw_pci_nranges(node, cell_info);
if (sc->sc_nrange <= 0) {
device_printf(dev, "could not getranges\n");
error = ENXIO;
goto out;
}
sc->sc_range = malloc(sc->sc_nrange * sizeof(sc->sc_range[0]),
M_DEVBUF, M_WAITOK);
ofw_pci_fill_ranges(node, sc->sc_range);
}
sc->sc_io_rman.rm_type = RMAN_ARRAY;
sc->sc_io_rman.rm_descr = "PCI I/O Ports";
error = rman_init(&sc->sc_io_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
sc->sc_mem_rman.rm_type = RMAN_ARRAY;
sc->sc_mem_rman.rm_descr = "PCI Non Prefetchable Memory";
error = rman_init(&sc->sc_mem_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
sc->sc_pmem_rman.rm_type = RMAN_ARRAY;
sc->sc_pmem_rman.rm_descr = "PCI Prefetchable Memory";
error = rman_init(&sc->sc_pmem_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
for (i = 0; i < sc->sc_nrange; i++) {
error = 0;
rp = sc->sc_range + i;
if (sc->sc_range_mask & ((uint64_t)1 << i))
continue;
switch (rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) {
case OFW_PCI_PHYS_HI_SPACE_CONFIG:
break;
case OFW_PCI_PHYS_HI_SPACE_IO:
error = rman_manage_region(&sc->sc_io_rman, rp->pci,
rp->pci + rp->size - 1);
break;
case OFW_PCI_PHYS_HI_SPACE_MEM32:
case OFW_PCI_PHYS_HI_SPACE_MEM64:
if (rp->pci_hi & OFW_PCI_PHYS_HI_PREFETCHABLE) {
sc->sc_have_pmem = 1;
error = rman_manage_region(&sc->sc_pmem_rman,
rp->pci, rp->pci + rp->size - 1);
} else {
error = rman_manage_region(&sc->sc_mem_rman,
rp->pci, rp->pci + rp->size - 1);
}
break;
}
if (error != 0) {
device_printf(dev,
"rman_manage_region(%x, %#jx, %#jx) failed. "
"error = %d\n", rp->pci_hi &
OFW_PCI_PHYS_HI_SPACEMASK, rp->pci,
rp->pci + rp->size - 1, error);
goto out;
}
}
ofw_bus_setup_iinfo(node, &sc->sc_pci_iinfo, sizeof(cell_t));
return (0);
out:
free(cell_info, M_DEVBUF);
free(sc->sc_range, M_DEVBUF);
rman_fini(&sc->sc_io_rman);
rman_fini(&sc->sc_mem_rman);
rman_fini(&sc->sc_pmem_rman);
return (error);
}
int
ofw_pci_attach(device_t dev)
{
struct ofw_pci_softc *sc;
int error;
sc = device_get_softc(dev);
if (!sc->sc_initialized) {
error = ofw_pci_init(dev);
if (error != 0)
return (error);
}
device_add_child(dev, "pci", -1);
return (bus_generic_attach(dev));
}
static int
ofw_pci_maxslots(device_t dev)
{
return (PCI_SLOTMAX);
}
int
ofw_pci_route_interrupt(device_t bus, device_t dev, int pin)
{
struct ofw_pci_softc *sc;
struct ofw_pci_register reg;
uint32_t pintr, mintr[PCI_MAP_INTR];
int intrcells;
phandle_t iparent;
sc = device_get_softc(bus);
pintr = pin;
/* Fabricate imap information in case this isn't an OFW device */
bzero(&reg, sizeof(reg));
reg.phys_hi = (pci_get_bus(dev) << OFW_PCI_PHYS_HI_BUSSHIFT) |
(pci_get_slot(dev) << OFW_PCI_PHYS_HI_DEVICESHIFT) |
(pci_get_function(dev) << OFW_PCI_PHYS_HI_FUNCTIONSHIFT);
intrcells = ofw_bus_lookup_imap(ofw_bus_get_node(dev),
&sc->sc_pci_iinfo, &reg, sizeof(reg), &pintr, sizeof(pintr),
mintr, sizeof(mintr), &iparent);
if (intrcells != 0) {
pintr = ofw_bus_map_intr(dev, iparent, intrcells, mintr);
return (pintr);
}
/*
* Maybe it's a real interrupt, not an intpin
*/
if (pin > PCI_INTR_PINS)
return (pin);
device_printf(bus, "could not route pin %d for device %d.%d\n",
pin, pci_get_slot(dev), pci_get_function(dev));
return (PCI_INVALID_IRQ);
}
int
ofw_pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_DOMAIN:
*result = sc->sc_pci_domain;
return (0);
case PCIB_IVAR_BUS:
*result = sc->sc_bus;
return (0);
default:
break;
}
return (ENOENT);
}
int
ofw_pci_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_BUS:
sc->sc_bus = value;
return (0);
default:
break;
}
return (ENOENT);
}
int
ofw_pci_nranges(phandle_t node, struct ofw_pci_cell_info *info)
{
ssize_t nbase_ranges;
if (info == NULL)
return (-1);
info->host_address_cells = 1;
info->size_cells = 2;
info->pci_address_cell = 3;
OF_getencprop(OF_parent(node), "#address-cells",
&(info->host_address_cells), sizeof(info->host_address_cells));
OF_getencprop(node, "#address-cells",
&(info->pci_address_cell), sizeof(info->pci_address_cell));
OF_getencprop(node, "#size-cells", &(info->size_cells),
sizeof(info->size_cells));
nbase_ranges = OF_getproplen(node, "ranges");
if (nbase_ranges <= 0)
return (-1);
return (nbase_ranges / sizeof(cell_t) /
(info->pci_address_cell + info->host_address_cells +
info->size_cells));
}
static struct resource *
ofw_pci_alloc_resource(device_t bus, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct ofw_pci_softc *sc;
struct resource *rv;
struct rman *rm;
int needactivate;
needactivate = flags & RF_ACTIVE;
flags &= ~RF_ACTIVE;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS) {
return (pci_domain_alloc_bus(sc->sc_pci_domain, child, rid,
start, end, count, flags | needactivate));
}
#endif
rm = ofw_pci_get_rman(sc, type, flags);
if (rm == NULL) {
return (bus_generic_alloc_resource(bus, child, type, rid,
start, end, count, flags | needactivate));
}
rv = rman_reserve_resource(rm, start, end, count, flags, child);
if (rv == NULL) {
device_printf(bus, "failed to reserve resource for %s\n",
device_get_nameunit(child));
return (NULL);
}
rman_set_rid(rv, *rid);
if (needactivate) {
if (bus_activate_resource(child, type, *rid, rv) != 0) {
device_printf(bus,
"failed to activate resource for %s\n",
device_get_nameunit(child));
rman_release_resource(rv);
return (NULL);
}
}
return (rv);
}
static int
ofw_pci_release_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
struct ofw_pci_softc *sc;
struct rman *rm;
int error;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS)
return (pci_domain_release_bus(sc->sc_pci_domain, child, rid,
res));
#endif
rm = ofw_pci_get_rman(sc, type, rman_get_flags(res));
if (rm == NULL) {
return (bus_generic_release_resource(bus, child, type, rid,
res));
}
KASSERT(rman_is_region_manager(res, rm), ("rman mismatch"));
if (rman_get_flags(res) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type, rid, res);
if (error != 0)
return (error);
}
return (rman_release_resource(res));
}
static int
ofw_pci_activate_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
struct ofw_pci_softc *sc;
bus_space_handle_t handle;
bus_space_tag_t tag;
struct ofw_pci_range *rp;
vm_paddr_t start;
int space;
int rv;
sc = device_get_softc(bus);
if (type != SYS_RES_IOPORT && type != SYS_RES_MEMORY) {
return (bus_generic_activate_resource(bus, child, type, rid,
res));
}
start = (vm_paddr_t)rman_get_start(res);
/*
* Map this through the ranges list
*/
for (rp = sc->sc_range; rp < sc->sc_range + sc->sc_nrange &&
rp->pci_hi != 0; rp++) {
if (start < rp->pci || start >= rp->pci + rp->size)
continue;
switch (rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) {
case OFW_PCI_PHYS_HI_SPACE_IO:
space = SYS_RES_IOPORT;
break;
case OFW_PCI_PHYS_HI_SPACE_MEM32:
case OFW_PCI_PHYS_HI_SPACE_MEM64:
space = SYS_RES_MEMORY;
break;
default:
space = -1;
}
if (type == space) {
start += (rp->host - rp->pci);
break;
}
}
if (bootverbose)
printf("ofw_pci mapdev: start %jx, len %jd\n",
(rman_res_t)start, rman_get_size(res));
tag = BUS_GET_BUS_TAG(child, child);
if (tag == NULL)
return (ENOMEM);
rman_set_bustag(res, tag);
rv = bus_space_map(tag, start,
rman_get_size(res), 0, &handle);
if (rv != 0)
return (ENOMEM);
rman_set_bushandle(res, handle);
rman_set_virtual(res, (void *)handle); /* XXX for powerpc only ? */
return (rman_activate_resource(res));
}
#ifdef __powerpc__
static bus_space_tag_t
ofw_pci_bus_get_bus_tag(device_t bus, device_t child)
{
return (&bs_le_tag);
}
#endif
static int
ofw_pci_deactivate_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
- struct ofw_pci_softc *sc;
vm_size_t psize;
-
- sc = device_get_softc(bus);
if (type != SYS_RES_IOPORT && type != SYS_RES_MEMORY) {
return (bus_generic_deactivate_resource(bus, child, type, rid,
res));
}
psize = rman_get_size(res);
pmap_unmapdev((vm_offset_t)rman_get_virtual(res), psize);
return (rman_deactivate_resource(res));
}
static int
ofw_pci_adjust_resource(device_t bus, device_t child, int type,
struct resource *res, rman_res_t start, rman_res_t end)
{
struct rman *rm;
struct ofw_pci_softc *sc;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS)
return (pci_domain_adjust_bus(sc->sc_pci_domain, child, res,
start, end));
#endif
rm = ofw_pci_get_rman(sc, type, rman_get_flags(res));
if (rm == NULL) {
return (bus_generic_adjust_resource(bus, child, type, res,
start, end));
}
KASSERT(rman_is_region_manager(res, rm), ("rman mismatch"));
KASSERT(!(rman_get_flags(res) & RF_ACTIVE),
("active resources cannot be adjusted"));
return (rman_adjust_resource(res, start, end));
}
static phandle_t
ofw_pci_get_node(device_t bus, device_t dev)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(bus);
/* We only have one child, the PCI bus, which needs our own node. */
return (sc->sc_node);
}
static int
ofw_pci_fill_ranges(phandle_t node, struct ofw_pci_range *ranges)
{
int host_address_cells = 1, pci_address_cells = 3, size_cells = 2;
cell_t *base_ranges;
ssize_t nbase_ranges;
int nranges;
int i, j, k;
OF_getencprop(OF_parent(node), "#address-cells", &host_address_cells,
sizeof(host_address_cells));
OF_getencprop(node, "#address-cells", &pci_address_cells,
sizeof(pci_address_cells));
OF_getencprop(node, "#size-cells", &size_cells, sizeof(size_cells));
nbase_ranges = OF_getproplen(node, "ranges");
if (nbase_ranges <= 0)
return (-1);
nranges = nbase_ranges / sizeof(cell_t) /
(pci_address_cells + host_address_cells + size_cells);
base_ranges = malloc(nbase_ranges, M_DEVBUF, M_WAITOK);
OF_getencprop(node, "ranges", base_ranges, nbase_ranges);
for (i = 0, j = 0; i < nranges; i++) {
ranges[i].pci_hi = base_ranges[j++];
ranges[i].pci = 0;
for (k = 0; k < pci_address_cells - 1; k++) {
ranges[i].pci <<= 32;
ranges[i].pci |= base_ranges[j++];
}
ranges[i].host = 0;
for (k = 0; k < host_address_cells; k++) {
ranges[i].host <<= 32;
ranges[i].host |= base_ranges[j++];
}
ranges[i].size = 0;
for (k = 0; k < size_cells; k++) {
ranges[i].size <<= 32;
ranges[i].size |= base_ranges[j++];
}
}
free(base_ranges, M_DEVBUF);
return (nranges);
}
static struct rman *
ofw_pci_get_rman(struct ofw_pci_softc *sc, int type, u_int flags)
{
switch (type) {
case SYS_RES_IOPORT:
return (&sc->sc_io_rman);
case SYS_RES_MEMORY:
if (sc->sc_have_pmem && (flags & RF_PREFETCHABLE))
return (&sc->sc_pmem_rman);
else
return (&sc->sc_mem_rman);
default:
break;
}
return (NULL);
}
Index: head/sys/dev/pci/pci.c
===================================================================
--- head/sys/dev/pci/pci.c (revision 327172)
+++ head/sys/dev/pci/pci.c (revision 327173)
@@ -1,6147 +1,6143 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, Stefan Esser <se@freebsd.org>
* Copyright (c) 2000, Michael Smith <msmith@freebsd.org>
* Copyright (c) 2000, BSDi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bus.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/endian.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/stdarg.h>
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
#include <machine/intr_machdep.h>
#endif
#include <sys/pciio.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h>
#ifdef PCI_IOV
#include <sys/nv.h>
#include <dev/pci/pci_iov_private.h>
#endif
#include <dev/usb/controller/xhcireg.h>
#include <dev/usb/controller/ehcireg.h>
#include <dev/usb/controller/ohcireg.h>
#include <dev/usb/controller/uhcireg.h>
#include "pcib_if.h"
#include "pci_if.h"
#define PCIR_IS_BIOS(cfg, reg) \
(((cfg)->hdrtype == PCIM_HDRTYPE_NORMAL && reg == PCIR_BIOS) || \
((cfg)->hdrtype == PCIM_HDRTYPE_BRIDGE && reg == PCIR_BIOS_1))
static int pci_has_quirk(uint32_t devid, int quirk);
static pci_addr_t pci_mapbase(uint64_t mapreg);
static const char *pci_maptype(uint64_t mapreg);
static int pci_maprange(uint64_t mapreg);
static pci_addr_t pci_rombase(uint64_t mapreg);
static int pci_romsize(uint64_t testval);
static void pci_fixancient(pcicfgregs *cfg);
static int pci_printf(pcicfgregs *cfg, const char *fmt, ...);
static int pci_porten(device_t dev);
static int pci_memen(device_t dev);
static void pci_assign_interrupt(device_t bus, device_t dev,
int force_route);
static int pci_add_map(device_t bus, device_t dev, int reg,
struct resource_list *rl, int force, int prefetch);
static int pci_probe(device_t dev);
static int pci_attach(device_t dev);
static int pci_detach(device_t dev);
static void pci_load_vendor_data(void);
static int pci_describe_parse_line(char **ptr, int *vendor,
int *device, char **desc);
static char *pci_describe_device(device_t dev);
static int pci_modevent(module_t mod, int what, void *arg);
static void pci_hdrtypedata(device_t pcib, int b, int s, int f,
pcicfgregs *cfg);
static void pci_read_cap(device_t pcib, pcicfgregs *cfg);
static int pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg,
int reg, uint32_t *data);
#if 0
static int pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg,
int reg, uint32_t data);
#endif
static void pci_read_vpd(device_t pcib, pcicfgregs *cfg);
static void pci_mask_msix(device_t dev, u_int index);
static void pci_unmask_msix(device_t dev, u_int index);
static int pci_msi_blacklisted(void);
static int pci_msix_blacklisted(void);
static void pci_resume_msi(device_t dev);
static void pci_resume_msix(device_t dev);
static int pci_remap_intr_method(device_t bus, device_t dev,
u_int irq);
static void pci_hint_device_unit(device_t acdev, device_t child,
const char *name, int *unitp);
static int pci_get_id_method(device_t dev, device_t child,
enum pci_id_type type, uintptr_t *rid);
static struct pci_devinfo * pci_fill_devinfo(device_t pcib, device_t bus, int d,
int b, int s, int f, uint16_t vid, uint16_t did);
static device_method_t pci_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, pci_probe),
DEVMETHOD(device_attach, pci_attach),
DEVMETHOD(device_detach, pci_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, pci_resume),
/* Bus interface */
DEVMETHOD(bus_print_child, pci_print_child),
DEVMETHOD(bus_probe_nomatch, pci_probe_nomatch),
DEVMETHOD(bus_read_ivar, pci_read_ivar),
DEVMETHOD(bus_write_ivar, pci_write_ivar),
DEVMETHOD(bus_driver_added, pci_driver_added),
DEVMETHOD(bus_setup_intr, pci_setup_intr),
DEVMETHOD(bus_teardown_intr, pci_teardown_intr),
DEVMETHOD(bus_get_dma_tag, pci_get_dma_tag),
DEVMETHOD(bus_get_resource_list,pci_get_resource_list),
DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource),
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_delete_resource, pci_delete_resource),
DEVMETHOD(bus_alloc_resource, pci_alloc_resource),
DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource),
DEVMETHOD(bus_release_resource, pci_release_resource),
DEVMETHOD(bus_activate_resource, pci_activate_resource),
DEVMETHOD(bus_deactivate_resource, pci_deactivate_resource),
DEVMETHOD(bus_child_deleted, pci_child_deleted),
DEVMETHOD(bus_child_detached, pci_child_detached),
DEVMETHOD(bus_child_pnpinfo_str, pci_child_pnpinfo_str_method),
DEVMETHOD(bus_child_location_str, pci_child_location_str_method),
DEVMETHOD(bus_hint_device_unit, pci_hint_device_unit),
DEVMETHOD(bus_remap_intr, pci_remap_intr_method),
DEVMETHOD(bus_suspend_child, pci_suspend_child),
DEVMETHOD(bus_resume_child, pci_resume_child),
DEVMETHOD(bus_rescan, pci_rescan_method),
/* PCI interface */
DEVMETHOD(pci_read_config, pci_read_config_method),
DEVMETHOD(pci_write_config, pci_write_config_method),
DEVMETHOD(pci_enable_busmaster, pci_enable_busmaster_method),
DEVMETHOD(pci_disable_busmaster, pci_disable_busmaster_method),
DEVMETHOD(pci_enable_io, pci_enable_io_method),
DEVMETHOD(pci_disable_io, pci_disable_io_method),
DEVMETHOD(pci_get_vpd_ident, pci_get_vpd_ident_method),
DEVMETHOD(pci_get_vpd_readonly, pci_get_vpd_readonly_method),
DEVMETHOD(pci_get_powerstate, pci_get_powerstate_method),
DEVMETHOD(pci_set_powerstate, pci_set_powerstate_method),
DEVMETHOD(pci_assign_interrupt, pci_assign_interrupt_method),
DEVMETHOD(pci_find_cap, pci_find_cap_method),
DEVMETHOD(pci_find_extcap, pci_find_extcap_method),
DEVMETHOD(pci_find_htcap, pci_find_htcap_method),
DEVMETHOD(pci_alloc_msi, pci_alloc_msi_method),
DEVMETHOD(pci_alloc_msix, pci_alloc_msix_method),
DEVMETHOD(pci_enable_msi, pci_enable_msi_method),
DEVMETHOD(pci_enable_msix, pci_enable_msix_method),
DEVMETHOD(pci_disable_msi, pci_disable_msi_method),
DEVMETHOD(pci_remap_msix, pci_remap_msix_method),
DEVMETHOD(pci_release_msi, pci_release_msi_method),
DEVMETHOD(pci_msi_count, pci_msi_count_method),
DEVMETHOD(pci_msix_count, pci_msix_count_method),
DEVMETHOD(pci_msix_pba_bar, pci_msix_pba_bar_method),
DEVMETHOD(pci_msix_table_bar, pci_msix_table_bar_method),
DEVMETHOD(pci_get_id, pci_get_id_method),
DEVMETHOD(pci_alloc_devinfo, pci_alloc_devinfo_method),
DEVMETHOD(pci_child_added, pci_child_added_method),
#ifdef PCI_IOV
DEVMETHOD(pci_iov_attach, pci_iov_attach_method),
DEVMETHOD(pci_iov_detach, pci_iov_detach_method),
DEVMETHOD(pci_create_iov_child, pci_create_iov_child_method),
#endif
DEVMETHOD_END
};
DEFINE_CLASS_0(pci, pci_driver, pci_methods, sizeof(struct pci_softc));
static devclass_t pci_devclass;
DRIVER_MODULE(pci, pcib, pci_driver, pci_devclass, pci_modevent, NULL);
MODULE_VERSION(pci, 1);
static char *pci_vendordata;
static size_t pci_vendordata_size;
struct pci_quirk {
uint32_t devid; /* Vendor/device of the card */
int type;
#define PCI_QUIRK_MAP_REG 1 /* PCI map register in weird place */
#define PCI_QUIRK_DISABLE_MSI 2 /* Neither MSI nor MSI-X work */
#define PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI works */
#define PCI_QUIRK_UNMAP_REG 4 /* Ignore PCI map register */
#define PCI_QUIRK_DISABLE_MSIX 5 /* MSI-X doesn't work */
#define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */
int arg1;
int arg2;
};
static const struct pci_quirk pci_quirks[] = {
/* The Intel 82371AB and 82443MX have a map register at offset 0x90. */
{ 0x71138086, PCI_QUIRK_MAP_REG, 0x90, 0 },
{ 0x719b8086, PCI_QUIRK_MAP_REG, 0x90, 0 },
/* As does the Serverworks OSB4 (the SMBus mapping register) */
{ 0x02001166, PCI_QUIRK_MAP_REG, 0x90, 0 },
/*
* MSI doesn't work with the ServerWorks CNB20-HE Host Bridge
* or the CMIC-SL (AKA ServerWorks GC_LE).
*/
{ 0x00141166, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x00171166, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI doesn't work on earlier Intel chipsets including
* E7500, E7501, E7505, 845, 865, 875/E7210, and 855.
*/
{ 0x25408086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x254c8086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25508086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25608086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25708086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25788086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x35808086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI doesn't work with devices behind the AMD 8131 HT-PCIX
* bridge.
*/
{ 0x74501022, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI-X allocation doesn't work properly for devices passed through
* by VMware up to at least ESXi 5.1.
*/
{ 0x079015ad, PCI_QUIRK_DISABLE_MSIX, 0, 0 }, /* PCI/PCI-X */
{ 0x07a015ad, PCI_QUIRK_DISABLE_MSIX, 0, 0 }, /* PCIe */
/*
* Some virtualization environments emulate an older chipset
* but support MSI just fine. QEMU uses the Intel 82440.
*/
{ 0x12378086, PCI_QUIRK_ENABLE_MSI_VM, 0, 0 },
/*
* HPET MMIO base address may appear in Bar1 for AMD SB600 SMBus
* controller depending on SoftPciRst register (PM_IO 0x55 [7]).
* It prevents us from attaching hpet(4) when the bit is unset.
* Note this quirk only affects SB600 revision A13 and earlier.
* For SB600 A21 and later, firmware must set the bit to hide it.
* For SB700 and later, it is unused and hardcoded to zero.
*/
{ 0x43851002, PCI_QUIRK_UNMAP_REG, 0x14, 0 },
/*
* Atheros AR8161/AR8162/E2200/E2400/E2500 Ethernet controllers have
* a bug that MSI interrupt does not assert if PCIM_CMD_INTxDIS bit
* of the command register is set.
*/
{ 0x10911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0A11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0B11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0x10901969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
/*
* Broadcom BCM5714(S)/BCM5715(S)/BCM5780(S) Ethernet MACs don't
* issue MSI interrupts with PCIM_CMD_INTxDIS set either.
*/
{ 0x166814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714 */
{ 0x166914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714S */
{ 0x166a14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780 */
{ 0x166b14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780S */
{ 0x167814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715 */
{ 0x167914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715S */
{ 0 }
};
/* map register information */
#define PCI_MAPMEM 0x01 /* memory map */
#define PCI_MAPMEMP 0x02 /* prefetchable memory map */
#define PCI_MAPPORT 0x04 /* port map */
struct devlist pci_devq;
uint32_t pci_generation;
uint32_t pci_numdevs = 0;
static int pcie_chipset, pcix_chipset;
/* sysctl vars */
SYSCTL_NODE(_hw, OID_AUTO, pci, CTLFLAG_RD, 0, "PCI bus tuning parameters");
static int pci_enable_io_modes = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_io_modes, CTLFLAG_RWTUN,
&pci_enable_io_modes, 1,
"Enable I/O and memory bits in the config register. Some BIOSes do not"
" enable these bits correctly. We'd like to do this all the time, but"
" there are some peripherals that this causes problems with.");
static int pci_do_realloc_bars = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, realloc_bars, CTLFLAG_RWTUN,
&pci_do_realloc_bars, 0,
"Attempt to allocate a new range for any BARs whose original "
"firmware-assigned ranges fail to allocate during the initial device scan.");
static int pci_do_power_nodriver = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_nodriver, CTLFLAG_RWTUN,
&pci_do_power_nodriver, 0,
"Place a function into D3 state when no driver attaches to it. 0 means"
" disable. 1 means conservatively place devices into D3 state. 2 means"
" aggressively place devices into D3 state. 3 means put absolutely"
" everything in D3 state.");
int pci_do_power_resume = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RWTUN,
&pci_do_power_resume, 1,
"Transition from D3 -> D0 on resume.");
int pci_do_power_suspend = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RWTUN,
&pci_do_power_suspend, 1,
"Transition from D0 -> D3 on suspend.");
static int pci_do_msi = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RWTUN, &pci_do_msi, 1,
"Enable support for MSI interrupts");
static int pci_do_msix = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_msix, CTLFLAG_RWTUN, &pci_do_msix, 1,
"Enable support for MSI-X interrupts");
static int pci_msix_rewrite_table = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, msix_rewrite_table, CTLFLAG_RWTUN,
&pci_msix_rewrite_table, 0,
"Rewrite entire MSI-X table when updating MSI-X entries");
static int pci_honor_msi_blacklist = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, honor_msi_blacklist, CTLFLAG_RDTUN,
&pci_honor_msi_blacklist, 1, "Honor chipset blacklist for MSI/MSI-X");
#if defined(__i386__) || defined(__amd64__)
static int pci_usb_takeover = 1;
#else
static int pci_usb_takeover = 0;
#endif
SYSCTL_INT(_hw_pci, OID_AUTO, usb_early_takeover, CTLFLAG_RDTUN,
&pci_usb_takeover, 1,
"Enable early takeover of USB controllers. Disable this if you depend on"
" BIOS emulation of USB devices, that is you use USB devices (like"
" keyboard or mouse) but do not load USB drivers");
static int pci_clear_bars;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_bars, CTLFLAG_RDTUN, &pci_clear_bars, 0,
"Ignore firmware-assigned resources for BARs.");
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
static int pci_clear_buses;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_buses, CTLFLAG_RDTUN, &pci_clear_buses, 0,
"Ignore firmware-assigned bus numbers.");
#endif
static int pci_enable_ari = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_ari, CTLFLAG_RDTUN, &pci_enable_ari,
0, "Enable support for PCIe Alternative RID Interpretation");
static int
pci_has_quirk(uint32_t devid, int quirk)
{
const struct pci_quirk *q;
for (q = &pci_quirks[0]; q->devid; q++) {
if (q->devid == devid && q->type == quirk)
return (1);
}
return (0);
}
/* Find a device_t by bus/slot/function in domain 0 */
device_t
pci_find_bsf(uint8_t bus, uint8_t slot, uint8_t func)
{
return (pci_find_dbsf(0, bus, slot, func));
}
/* Find a device_t by domain/bus/slot/function */
device_t
pci_find_dbsf(uint32_t domain, uint8_t bus, uint8_t slot, uint8_t func)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if ((dinfo->cfg.domain == domain) &&
(dinfo->cfg.bus == bus) &&
(dinfo->cfg.slot == slot) &&
(dinfo->cfg.func == func)) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
/* Find a device_t by vendor/device ID */
device_t
pci_find_device(uint16_t vendor, uint16_t device)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if ((dinfo->cfg.vendor == vendor) &&
(dinfo->cfg.device == device)) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
device_t
pci_find_class(uint8_t class, uint8_t subclass)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if (dinfo->cfg.baseclass == class &&
dinfo->cfg.subclass == subclass) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
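/*
 * Usage sketch (illustrative only): these lookup helpers are convenient in
 * machine-dependent or diagnostic code, e.g. locating the first attached
 * SATA-class device; PCIC_STORAGE and PCIS_STORAGE_SATA come from pcireg.h.
 *
 *	device_t dev;
 *
 *	dev = pci_find_class(PCIC_STORAGE, PCIS_STORAGE_SATA);
 *	if (dev != NULL)
 *		device_printf(dev, "found SATA controller\n");
 */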
static int
pci_printf(pcicfgregs *cfg, const char *fmt, ...)
{
va_list ap;
int retval;
retval = printf("pci%d:%d:%d:%d: ", cfg->domain, cfg->bus, cfg->slot,
cfg->func);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
/* return base address of memory or port map */
static pci_addr_t
pci_mapbase(uint64_t mapreg)
{
if (PCI_BAR_MEM(mapreg))
return (mapreg & PCIM_BAR_MEM_BASE);
else
return (mapreg & PCIM_BAR_IO_BASE);
}
/* return map type of memory or port map */
static const char *
pci_maptype(uint64_t mapreg)
{
if (PCI_BAR_IO(mapreg))
return ("I/O Port");
if (mapreg & PCIM_BAR_MEM_PREFETCH)
return ("Prefetchable Memory");
return ("Memory");
}
/* return log2 of map size decoded for memory or port map */
int
pci_mapsize(uint64_t testval)
{
int ln2size;
testval = pci_mapbase(testval);
ln2size = 0;
if (testval != 0) {
while ((testval & 1) == 0)
{
ln2size++;
testval >>= 1;
}
}
return (ln2size);
}
/* return base address of device ROM */
static pci_addr_t
pci_rombase(uint64_t mapreg)
{
return (mapreg & PCIM_BIOS_ADDR_MASK);
}
/* return log2 of map size decoded for device ROM */
static int
pci_romsize(uint64_t testval)
{
int ln2size;
testval = pci_rombase(testval);
ln2size = 0;
if (testval != 0) {
while ((testval & 1) == 0)
{
ln2size++;
testval >>= 1;
}
}
return (ln2size);
}
/* return log2 of address range supported by map register */
static int
pci_maprange(uint64_t mapreg)
{
int ln2range = 0;
if (PCI_BAR_IO(mapreg))
ln2range = 32;
else
switch (mapreg & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_32:
ln2range = 32;
break;
case PCIM_BAR_MEM_1MB:
ln2range = 20;
break;
case PCIM_BAR_MEM_64:
ln2range = 64;
break;
}
return (ln2range);
}
/* adjust some values from PCI 1.0 devices to match 2.0 standards ... */
static void
pci_fixancient(pcicfgregs *cfg)
{
if ((cfg->hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
return;
/* PCI to PCI bridges use header type 1 */
if (cfg->baseclass == PCIC_BRIDGE && cfg->subclass == PCIS_BRIDGE_PCI)
cfg->hdrtype = PCIM_HDRTYPE_BRIDGE;
}
/* extract header type specific config data */
static void
pci_hdrtypedata(device_t pcib, int b, int s, int f, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w)
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
cfg->subvendor = REG(PCIR_SUBVEND_0, 2);
cfg->subdevice = REG(PCIR_SUBDEV_0, 2);
cfg->mingnt = REG(PCIR_MINGNT, 1);
cfg->maxlat = REG(PCIR_MAXLAT, 1);
cfg->nummaps = PCI_MAXMAPS_0;
break;
case PCIM_HDRTYPE_BRIDGE:
cfg->bridge.br_seclat = REG(PCIR_SECLAT_1, 1);
cfg->bridge.br_subbus = REG(PCIR_SUBBUS_1, 1);
cfg->bridge.br_secbus = REG(PCIR_SECBUS_1, 1);
cfg->bridge.br_pribus = REG(PCIR_PRIBUS_1, 1);
cfg->bridge.br_control = REG(PCIR_BRIDGECTL_1, 2);
cfg->nummaps = PCI_MAXMAPS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
cfg->bridge.br_seclat = REG(PCIR_SECLAT_2, 1);
cfg->bridge.br_subbus = REG(PCIR_SUBBUS_2, 1);
cfg->bridge.br_secbus = REG(PCIR_SECBUS_2, 1);
cfg->bridge.br_pribus = REG(PCIR_PRIBUS_2, 1);
cfg->bridge.br_control = REG(PCIR_BRIDGECTL_2, 2);
cfg->subvendor = REG(PCIR_SUBVEND_2, 2);
cfg->subdevice = REG(PCIR_SUBDEV_2, 2);
cfg->nummaps = PCI_MAXMAPS_2;
break;
}
#undef REG
}
/* read configuration header into pcicfgregs structure */
struct pci_devinfo *
pci_read_device(device_t pcib, device_t bus, int d, int b, int s, int f)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w)
uint16_t vid, did;
vid = REG(PCIR_VENDOR, 2);
did = REG(PCIR_DEVICE, 2);
if (vid != 0xffff)
return (pci_fill_devinfo(pcib, bus, d, b, s, f, vid, did));
return (NULL);
}
struct pci_devinfo *
pci_alloc_devinfo_method(device_t dev)
{
return (malloc(sizeof(struct pci_devinfo), M_DEVBUF,
M_WAITOK | M_ZERO));
}
static struct pci_devinfo *
pci_fill_devinfo(device_t pcib, device_t bus, int d, int b, int s, int f,
uint16_t vid, uint16_t did)
{
struct pci_devinfo *devlist_entry;
pcicfgregs *cfg;
devlist_entry = PCI_ALLOC_DEVINFO(bus);
cfg = &devlist_entry->cfg;
cfg->domain = d;
cfg->bus = b;
cfg->slot = s;
cfg->func = f;
cfg->vendor = vid;
cfg->device = did;
cfg->cmdreg = REG(PCIR_COMMAND, 2);
cfg->statreg = REG(PCIR_STATUS, 2);
cfg->baseclass = REG(PCIR_CLASS, 1);
cfg->subclass = REG(PCIR_SUBCLASS, 1);
cfg->progif = REG(PCIR_PROGIF, 1);
cfg->revid = REG(PCIR_REVID, 1);
cfg->hdrtype = REG(PCIR_HDRTYPE, 1);
cfg->cachelnsz = REG(PCIR_CACHELNSZ, 1);
cfg->lattimer = REG(PCIR_LATTIMER, 1);
cfg->intpin = REG(PCIR_INTPIN, 1);
cfg->intline = REG(PCIR_INTLINE, 1);
cfg->mfdev = (cfg->hdrtype & PCIM_MFDEV) != 0;
cfg->hdrtype &= ~PCIM_MFDEV;
STAILQ_INIT(&cfg->maps);
cfg->iov = NULL;
pci_fixancient(cfg);
pci_hdrtypedata(pcib, b, s, f, cfg);
if (REG(PCIR_STATUS, 2) & PCIM_STATUS_CAPPRESENT)
pci_read_cap(pcib, cfg);
STAILQ_INSERT_TAIL(&pci_devq, devlist_entry, pci_links);
devlist_entry->conf.pc_sel.pc_domain = cfg->domain;
devlist_entry->conf.pc_sel.pc_bus = cfg->bus;
devlist_entry->conf.pc_sel.pc_dev = cfg->slot;
devlist_entry->conf.pc_sel.pc_func = cfg->func;
devlist_entry->conf.pc_hdr = cfg->hdrtype;
devlist_entry->conf.pc_subvendor = cfg->subvendor;
devlist_entry->conf.pc_subdevice = cfg->subdevice;
devlist_entry->conf.pc_vendor = cfg->vendor;
devlist_entry->conf.pc_device = cfg->device;
devlist_entry->conf.pc_class = cfg->baseclass;
devlist_entry->conf.pc_subclass = cfg->subclass;
devlist_entry->conf.pc_progif = cfg->progif;
devlist_entry->conf.pc_revid = cfg->revid;
pci_numdevs++;
pci_generation++;
return (devlist_entry);
}
#undef REG
static void
pci_ea_fill_info(device_t pcib, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, \
cfg->ea.ea_location + (n), w)
int num_ent;
int ptr;
int a, b;
uint32_t val;
int ent_size;
uint32_t dw[4];
uint64_t base, max_offset;
struct pci_ea_entry *eae;
if (cfg->ea.ea_location == 0)
return;
STAILQ_INIT(&cfg->ea.ea_entries);
/* Determine the number of entries */
num_ent = REG(PCIR_EA_NUM_ENT, 2);
num_ent &= PCIM_EA_NUM_ENT_MASK;
/* Find the first entry to take care of */
ptr = PCIR_EA_FIRST_ENT;
/* Skip DWORD 2 for type 1 functions */
if ((cfg->hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_BRIDGE)
ptr += 4;
for (a = 0; a < num_ent; a++) {
eae = malloc(sizeof(*eae), M_DEVBUF, M_WAITOK | M_ZERO);
eae->eae_cfg_offset = cfg->ea.ea_location + ptr;
/* Read a number of dwords in the entry */
val = REG(ptr, 4);
ptr += 4;
ent_size = (val & PCIM_EA_ES);
for (b = 0; b < ent_size; b++) {
dw[b] = REG(ptr, 4);
ptr += 4;
}
eae->eae_flags = val;
eae->eae_bei = (PCIM_EA_BEI & val) >> PCIM_EA_BEI_OFFSET;
base = dw[0] & PCIM_EA_FIELD_MASK;
max_offset = dw[1] | ~PCIM_EA_FIELD_MASK;
b = 2;
if (((dw[0] & PCIM_EA_IS_64) != 0) && (b < ent_size)) {
base |= (uint64_t)dw[b] << 32UL;
b++;
}
if (((dw[1] & PCIM_EA_IS_64) != 0)
&& (b < ent_size)) {
max_offset |= (uint64_t)dw[b] << 32UL;
b++;
}
eae->eae_base = base;
eae->eae_max_offset = max_offset;
STAILQ_INSERT_TAIL(&cfg->ea.ea_entries, eae, eae_link);
if (bootverbose) {
printf("PCI(EA) dev %04x:%04x, bei %d, flags #%x, base #%jx, max_offset #%jx\n",
cfg->vendor, cfg->device, eae->eae_bei, eae->eae_flags,
(uintmax_t)eae->eae_base, (uintmax_t)eae->eae_max_offset);
}
}
}
#undef REG
static void
pci_read_cap(device_t pcib, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, w)
#define WREG(n, v, w) PCIB_WRITE_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, v, w)
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
uint64_t addr;
#endif
uint32_t val;
int ptr, nextptr, ptrptr;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
case PCIM_HDRTYPE_BRIDGE:
ptrptr = PCIR_CAP_PTR;
break;
case PCIM_HDRTYPE_CARDBUS:
ptrptr = PCIR_CAP_PTR_2; /* cardbus capabilities ptr */
break;
default:
return; /* no extended capabilities support */
}
nextptr = REG(ptrptr, 1); /* sanity check? */
/*
* Read capability entries.
*/
while (nextptr != 0) {
/* Sanity check */
if (nextptr > 255) {
printf("illegal PCI extended capability offset %d\n",
nextptr);
return;
}
/* Find the next entry */
ptr = nextptr;
nextptr = REG(ptr + PCICAP_NEXTPTR, 1);
/* Process this entry */
switch (REG(ptr + PCICAP_ID, 1)) {
case PCIY_PMG: /* PCI power management */
if (cfg->pp.pp_cap == 0) {
cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2);
cfg->pp.pp_status = ptr + PCIR_POWER_STATUS;
cfg->pp.pp_bse = ptr + PCIR_POWER_BSE;
if ((nextptr - ptr) > PCIR_POWER_DATA)
cfg->pp.pp_data = ptr + PCIR_POWER_DATA;
}
break;
case PCIY_HT: /* HyperTransport */
/* Determine HT-specific capability type. */
val = REG(ptr + PCIR_HT_COMMAND, 2);
if ((val & 0xe000) == PCIM_HTCAP_SLAVE)
cfg->ht.ht_slave = ptr;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
switch (val & PCIM_HTCMD_CAP_MASK) {
case PCIM_HTCAP_MSI_MAPPING:
if (!(val & PCIM_HTCMD_MSI_FIXED)) {
/* Sanity check the mapping window. */
addr = REG(ptr + PCIR_HTMSI_ADDRESS_HI,
4);
addr <<= 32;
addr |= REG(ptr + PCIR_HTMSI_ADDRESS_LO,
4);
if (addr != MSI_INTEL_ADDR_BASE)
device_printf(pcib,
"HT device at pci%d:%d:%d:%d has non-default MSI window 0x%llx\n",
cfg->domain, cfg->bus,
cfg->slot, cfg->func,
(long long)addr);
} else
addr = MSI_INTEL_ADDR_BASE;
cfg->ht.ht_msimap = ptr;
cfg->ht.ht_msictrl = val;
cfg->ht.ht_msiaddr = addr;
break;
}
#endif
break;
case PCIY_MSI: /* PCI MSI */
cfg->msi.msi_location = ptr;
cfg->msi.msi_ctrl = REG(ptr + PCIR_MSI_CTRL, 2);
cfg->msi.msi_msgnum = 1 << ((cfg->msi.msi_ctrl &
PCIM_MSICTRL_MMC_MASK)>>1);
break;
case PCIY_MSIX: /* PCI MSI-X */
cfg->msix.msix_location = ptr;
cfg->msix.msix_ctrl = REG(ptr + PCIR_MSIX_CTRL, 2);
cfg->msix.msix_msgnum = (cfg->msix.msix_ctrl &
PCIM_MSIXCTRL_TABLE_SIZE) + 1;
val = REG(ptr + PCIR_MSIX_TABLE, 4);
cfg->msix.msix_table_bar = PCIR_BAR(val &
PCIM_MSIX_BIR_MASK);
cfg->msix.msix_table_offset = val & ~PCIM_MSIX_BIR_MASK;
val = REG(ptr + PCIR_MSIX_PBA, 4);
cfg->msix.msix_pba_bar = PCIR_BAR(val &
PCIM_MSIX_BIR_MASK);
cfg->msix.msix_pba_offset = val & ~PCIM_MSIX_BIR_MASK;
break;
case PCIY_VPD: /* PCI Vital Product Data */
cfg->vpd.vpd_reg = ptr;
break;
case PCIY_SUBVENDOR:
/* Should always be true. */
if ((cfg->hdrtype & PCIM_HDRTYPE) ==
PCIM_HDRTYPE_BRIDGE) {
val = REG(ptr + PCIR_SUBVENDCAP_ID, 4);
cfg->subvendor = val & 0xffff;
cfg->subdevice = val >> 16;
}
break;
case PCIY_PCIX: /* PCI-X */
/*
* Assume we have a PCI-X chipset if we have
* at least one PCI-PCI bridge with a PCI-X
* capability. Note that some systems with
* PCI-express or HT chipsets might match on
* this check as well.
*/
if ((cfg->hdrtype & PCIM_HDRTYPE) ==
PCIM_HDRTYPE_BRIDGE)
pcix_chipset = 1;
cfg->pcix.pcix_location = ptr;
break;
case PCIY_EXPRESS: /* PCI-express */
/*
* Assume we have a PCI-express chipset if we have
* at least one PCI-express device.
*/
pcie_chipset = 1;
cfg->pcie.pcie_location = ptr;
val = REG(ptr + PCIER_FLAGS, 2);
cfg->pcie.pcie_type = val & PCIEM_FLAGS_TYPE;
break;
case PCIY_EA: /* Enhanced Allocation */
cfg->ea.ea_location = ptr;
pci_ea_fill_info(pcib, cfg);
break;
default:
break;
}
}
#if defined(__powerpc__)
/*
* Enable the MSI mapping window for all HyperTransport
* slaves. PCI-PCI bridges have their windows enabled via
* PCIB_MAP_MSI().
*/
if (cfg->ht.ht_slave != 0 && cfg->ht.ht_msimap != 0 &&
!(cfg->ht.ht_msictrl & PCIM_HTCMD_MSI_ENABLE)) {
device_printf(pcib,
"Enabling MSI window for HyperTransport slave at pci%d:%d:%d:%d\n",
cfg->domain, cfg->bus, cfg->slot, cfg->func);
cfg->ht.ht_msictrl |= PCIM_HTCMD_MSI_ENABLE;
WREG(cfg->ht.ht_msimap + PCIR_HT_COMMAND, cfg->ht.ht_msictrl,
2);
}
#endif
/* REG and WREG remain defined for use by the following functions */
}
/*
* PCI Vital Product Data
*/
#define PCI_VPD_TIMEOUT 1000000
static int
pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t *data)
{
int count = PCI_VPD_TIMEOUT;
KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned"));
WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg, 2);
while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) != 0x8000) {
if (--count < 0)
return (ENXIO);
DELAY(1); /* limit looping */
}
*data = (REG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, 4));
return (0);
}
#if 0
static int
pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t data)
{
int count = PCI_VPD_TIMEOUT;
KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned"));
WREG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, data, 4);
WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg | 0x8000, 2);
while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) == 0x8000) {
if (--count < 0)
return (ENXIO);
DELAY(1); /* limit looping */
}
return (0);
}
#endif
#undef PCI_VPD_TIMEOUT
struct vpd_readstate {
device_t pcib;
pcicfgregs *cfg;
uint32_t val;
int bytesinval;
int off;
uint8_t cksum;
};
static int
vpd_nextbyte(struct vpd_readstate *vrs, uint8_t *data)
{
uint32_t reg;
uint8_t byte;
if (vrs->bytesinval == 0) {
if (pci_read_vpd_reg(vrs->pcib, vrs->cfg, vrs->off, &reg))
return (ENXIO);
vrs->val = le32toh(reg);
vrs->off += 4;
byte = vrs->val & 0xff;
vrs->bytesinval = 3;
} else {
vrs->val = vrs->val >> 8;
byte = vrs->val & 0xff;
vrs->bytesinval--;
}
vrs->cksum += byte;
*data = byte;
return (0);
}
static void
pci_read_vpd(device_t pcib, pcicfgregs *cfg)
{
struct vpd_readstate vrs;
int state;
int name;
int remain;
int i;
int alloc, off; /* alloc/off for RO/W arrays */
int cksumvalid;
int dflen;
uint8_t byte;
uint8_t byte2;
/* init vpd reader */
vrs.bytesinval = 0;
vrs.off = 0;
vrs.pcib = pcib;
vrs.cfg = cfg;
vrs.cksum = 0;
state = 0;
name = remain = i = 0; /* shut up stupid gcc */
alloc = off = 0; /* shut up stupid gcc */
dflen = 0; /* shut up stupid gcc */
cksumvalid = -1;
while (state >= 0) {
if (vpd_nextbyte(&vrs, &byte)) {
state = -2;
break;
}
#if 0
printf("vpd: val: %#x, off: %d, bytesinval: %d, byte: %#hhx, " \
"state: %d, remain: %d, name: %#x, i: %d\n", vrs.val,
vrs.off, vrs.bytesinval, byte, state, remain, name, i);
#endif
switch (state) {
case 0: /* item name */
if (byte & 0x80) {
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
remain = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
remain |= byte2 << 8;
if (remain > (0x7f*4 - vrs.off)) {
state = -1;
pci_printf(cfg,
"invalid VPD data, remain %#x\n",
remain);
}
name = byte & 0x7f;
} else {
remain = byte & 0x7;
name = (byte >> 3) & 0xf;
}
switch (name) {
case 0x2: /* String */
cfg->vpd.vpd_ident = malloc(remain + 1,
M_DEVBUF, M_WAITOK);
i = 0;
state = 1;
break;
case 0xf: /* End */
state = -1;
break;
case 0x10: /* VPD-R */
alloc = 8;
off = 0;
cfg->vpd.vpd_ros = malloc(alloc *
sizeof(*cfg->vpd.vpd_ros), M_DEVBUF,
M_WAITOK | M_ZERO);
state = 2;
break;
case 0x11: /* VPD-W */
alloc = 8;
off = 0;
cfg->vpd.vpd_w = malloc(alloc *
sizeof(*cfg->vpd.vpd_w), M_DEVBUF,
M_WAITOK | M_ZERO);
state = 5;
break;
default: /* Invalid data, abort */
state = -1;
break;
}
break;
case 1: /* Identifier String */
cfg->vpd.vpd_ident[i++] = byte;
remain--;
if (remain == 0) {
cfg->vpd.vpd_ident[i] = '\0';
state = 0;
}
break;
case 2: /* VPD-R Keyword Header */
if (off == alloc) {
cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros,
(alloc *= 2) * sizeof(*cfg->vpd.vpd_ros),
M_DEVBUF, M_WAITOK | M_ZERO);
}
cfg->vpd.vpd_ros[off].keyword[0] = byte;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_ros[off].keyword[1] = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_ros[off].len = dflen = byte2;
if (dflen == 0 &&
strncmp(cfg->vpd.vpd_ros[off].keyword, "RV",
2) == 0) {
/*
* if this happens, we can't trust the rest
* of the VPD.
*/
pci_printf(cfg, "bad keyword length: %d\n",
dflen);
cksumvalid = 0;
state = -1;
break;
} else if (dflen == 0) {
cfg->vpd.vpd_ros[off].value = malloc(1 *
sizeof(*cfg->vpd.vpd_ros[off].value),
M_DEVBUF, M_WAITOK);
cfg->vpd.vpd_ros[off].value[0] = '\x00';
} else
cfg->vpd.vpd_ros[off].value = malloc(
(dflen + 1) *
sizeof(*cfg->vpd.vpd_ros[off].value),
M_DEVBUF, M_WAITOK);
remain -= 3;
i = 0;
/* keep in sync w/ state 3's transitions */
if (dflen == 0 && remain == 0)
state = 0;
else if (dflen == 0)
state = 2;
else
state = 3;
break;
case 3: /* VPD-R Keyword Value */
cfg->vpd.vpd_ros[off].value[i++] = byte;
if (strncmp(cfg->vpd.vpd_ros[off].keyword,
"RV", 2) == 0 && cksumvalid == -1) {
if (vrs.cksum == 0)
cksumvalid = 1;
else {
if (bootverbose)
pci_printf(cfg,
"bad VPD cksum, remain %hhu\n",
vrs.cksum);
cksumvalid = 0;
state = -1;
break;
}
}
dflen--;
remain--;
/* keep in sync w/ state 2's transitions */
if (dflen == 0)
cfg->vpd.vpd_ros[off++].value[i++] = '\0';
if (dflen == 0 && remain == 0) {
cfg->vpd.vpd_rocnt = off;
cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros,
off * sizeof(*cfg->vpd.vpd_ros),
M_DEVBUF, M_WAITOK | M_ZERO);
state = 0;
} else if (dflen == 0)
state = 2;
break;
case 4:
remain--;
if (remain == 0)
state = 0;
break;
case 5: /* VPD-W Keyword Header */
if (off == alloc) {
cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w,
(alloc *= 2) * sizeof(*cfg->vpd.vpd_w),
M_DEVBUF, M_WAITOK | M_ZERO);
}
cfg->vpd.vpd_w[off].keyword[0] = byte;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_w[off].keyword[1] = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_w[off].len = dflen = byte2;
cfg->vpd.vpd_w[off].start = vrs.off - vrs.bytesinval;
cfg->vpd.vpd_w[off].value = malloc((dflen + 1) *
sizeof(*cfg->vpd.vpd_w[off].value),
M_DEVBUF, M_WAITOK);
remain -= 3;
i = 0;
/* keep in sync w/ state 6's transitions */
if (dflen == 0 && remain == 0)
state = 0;
else if (dflen == 0)
state = 5;
else
state = 6;
break;
case 6: /* VPD-W Keyword Value */
cfg->vpd.vpd_w[off].value[i++] = byte;
dflen--;
remain--;
/* keep in sync w/ state 5's transitions */
if (dflen == 0)
cfg->vpd.vpd_w[off++].value[i++] = '\0';
if (dflen == 0 && remain == 0) {
cfg->vpd.vpd_wcnt = off;
cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w,
off * sizeof(*cfg->vpd.vpd_w),
M_DEVBUF, M_WAITOK | M_ZERO);
state = 0;
} else if (dflen == 0)
state = 5;
break;
default:
pci_printf(cfg, "invalid state: %d\n", state);
state = -1;
break;
}
}
if (cksumvalid == 0 || state < -1) {
/* read-only data bad, clean up */
if (cfg->vpd.vpd_ros != NULL) {
for (off = 0; cfg->vpd.vpd_ros[off].value; off++)
free(cfg->vpd.vpd_ros[off].value, M_DEVBUF);
free(cfg->vpd.vpd_ros, M_DEVBUF);
cfg->vpd.vpd_ros = NULL;
}
}
if (state < -1) {
/* I/O error, clean up */
pci_printf(cfg, "failed to read VPD data.\n");
if (cfg->vpd.vpd_ident != NULL) {
free(cfg->vpd.vpd_ident, M_DEVBUF);
cfg->vpd.vpd_ident = NULL;
}
if (cfg->vpd.vpd_w != NULL) {
for (off = 0; cfg->vpd.vpd_w[off].value; off++)
free(cfg->vpd.vpd_w[off].value, M_DEVBUF);
free(cfg->vpd.vpd_w, M_DEVBUF);
cfg->vpd.vpd_w = NULL;
}
}
cfg->vpd.vpd_cached = 1;
#undef REG
#undef WREG
}
int
pci_get_vpd_ident_method(device_t dev, device_t child, const char **identptr)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(dev), cfg);
*identptr = cfg->vpd.vpd_ident;
if (*identptr == NULL)
return (ENXIO);
return (0);
}
int
pci_get_vpd_readonly_method(device_t dev, device_t child, const char *kw,
const char **vptr)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
int i;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(dev), cfg);
for (i = 0; i < cfg->vpd.vpd_rocnt; i++)
if (memcmp(kw, cfg->vpd.vpd_ros[i].keyword,
sizeof(cfg->vpd.vpd_ros[i].keyword)) == 0) {
*vptr = cfg->vpd.vpd_ros[i].value;
return (0);
}
*vptr = NULL;
return (ENXIO);
}
struct pcicfg_vpd *
pci_fetch_vpd_list(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(device_get_parent(dev)), cfg);
return (&cfg->vpd);
}
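/*
 * Illustrative sketch (not part of this file): how a device driver
 * might consume the VPD accessors above through the pci(9) wrappers
 * pci_get_vpd_ident() and pci_get_vpd_readonly().  The function name
 * is hypothetical; "SN" is just an example of a standard VPD
 * read-only keyword.
 */
#if 0
static void
mydrv_print_vpd(device_t dev)
{
	const char *ident, *serial;

	if (pci_get_vpd_ident(dev, &ident) == 0)
		device_printf(dev, "VPD ident: %s\n", ident);
	if (pci_get_vpd_readonly(dev, "SN", &serial) == 0)
		device_printf(dev, "VPD serial number: %s\n", serial);
}
#endif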
/*
* Find the requested HyperTransport capability and return the offset
* in configuration space via the pointer provided. The function
* returns 0 on success and an error code otherwise.
*/
int
pci_find_htcap_method(device_t dev, device_t child, int capability, int *capreg)
{
int ptr, error;
uint16_t val;
error = pci_find_cap(child, PCIY_HT, &ptr);
if (error)
return (error);
/*
* Traverse the capabilities list checking each HT capability
* to see if it matches the requested HT capability.
*/
while (ptr != 0) {
val = pci_read_config(child, ptr + PCIR_HT_COMMAND, 2);
if (capability == PCIM_HTCAP_SLAVE ||
capability == PCIM_HTCAP_HOST)
val &= 0xe000;
else
val &= PCIM_HTCMD_CAP_MASK;
if (val == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
/* Skip to the next HT capability. */
while (ptr != 0) {
ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1);
if (pci_read_config(child, ptr + PCICAP_ID, 1) ==
PCIY_HT)
break;
}
}
return (ENOENT);
}
/*
* Find the requested capability and return the offset in
* configuration space via the pointer provided. The function returns
* 0 on success and an error code otherwise.
*/
int
pci_find_cap_method(device_t dev, device_t child, int capability,
int *capreg)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
u_int32_t status;
u_int8_t ptr;
/*
* Check the CAP_LIST bit of the PCI status register first.
*/
status = pci_read_config(child, PCIR_STATUS, 2);
if (!(status & PCIM_STATUS_CAPPRESENT))
return (ENXIO);
/*
* Determine the start pointer of the capabilities list.
*/
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
case PCIM_HDRTYPE_BRIDGE:
ptr = PCIR_CAP_PTR;
break;
case PCIM_HDRTYPE_CARDBUS:
ptr = PCIR_CAP_PTR_2;
break;
default:
/* XXX: panic? */
return (ENXIO); /* no extended capabilities support */
}
ptr = pci_read_config(child, ptr, 1);
/*
* Traverse the capabilities list.
*/
while (ptr != 0) {
if (pci_read_config(child, ptr + PCICAP_ID, 1) == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1);
}
return (ENOENT);
}
/*
* Find the requested extended capability and return the offset in
* configuration space via the pointer provided. The function returns
* 0 on success and an error code otherwise.
*/
int
pci_find_extcap_method(device_t dev, device_t child, int capability,
int *capreg)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint32_t ecap;
uint16_t ptr;
/* Only supported for PCI-express devices. */
if (cfg->pcie.pcie_location == 0)
return (ENXIO);
ptr = PCIR_EXTCAP;
ecap = pci_read_config(child, ptr, 4);
if (ecap == 0xffffffff || ecap == 0)
return (ENOENT);
for (;;) {
if (PCI_EXTCAP_ID(ecap) == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
ptr = PCI_EXTCAP_NEXTPTR(ecap);
if (ptr == 0)
break;
ecap = pci_read_config(child, ptr, 4);
}
return (ENOENT);
}
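/*
 * Illustrative sketch (not part of this file): locating a standard
 * capability and an extended capability from a driver via the pci(9)
 * wrappers for the methods above.  PCIY_PMG and PCIZ_AER are example
 * capability IDs from <dev/pci/pcireg.h>; the function name is
 * hypothetical.
 */
#if 0
static void
mydrv_probe_caps(device_t dev)
{
	int pmc, aer;

	if (pci_find_cap(dev, PCIY_PMG, &pmc) == 0)
		device_printf(dev, "power management cap at 0x%x\n", pmc);
	if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0)
		device_printf(dev, "AER extended cap at 0x%x\n", aer);
}
#endif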
/*
* Support for MSI-X message interrupts.
*/
static void
pci_write_msix_entry(device_t dev, u_int index, uint64_t address, uint32_t data)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16;
bus_write_4(msix->msix_table_res, offset, address & 0xffffffff);
bus_write_4(msix->msix_table_res, offset + 4, address >> 32);
bus_write_4(msix->msix_table_res, offset + 8, data);
}
void
pci_enable_msix_method(device_t dev, device_t child, u_int index,
uint64_t address, uint32_t data)
{
if (pci_msix_rewrite_table) {
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
/*
* Some VM hosts require MSI-X to be disabled in the
* control register before any of the MSI-X table
* entries may be updated.  It is not enough to
* disable MSI-X only while updating a single entry;
* MSI-X must remain disabled while updating all
* entries in the table.
*/
pci_write_config(child,
msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl & ~PCIM_MSIXCTRL_MSIX_ENABLE, 2);
pci_resume_msix(child);
} else
pci_write_msix_entry(child, index, address, data);
/* Enable MSI -> HT mapping. */
pci_ht_map_msi(child, address);
}
void
pci_mask_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, val;
KASSERT(msix->msix_msgnum > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16 + 12;
val = bus_read_4(msix->msix_table_res, offset);
if (!(val & PCIM_MSIX_VCTRL_MASK)) {
val |= PCIM_MSIX_VCTRL_MASK;
bus_write_4(msix->msix_table_res, offset, val);
}
}
void
pci_unmask_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, val;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16 + 12;
val = bus_read_4(msix->msix_table_res, offset);
if (val & PCIM_MSIX_VCTRL_MASK) {
val &= ~PCIM_MSIX_VCTRL_MASK;
bus_write_4(msix->msix_table_res, offset, val);
}
}
int
pci_pending_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, bit;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_pba_offset + (index / 32) * 4;
bit = 1 << index % 32;
return (bus_read_4(msix->msix_pba_res, offset) & bit);
}
/*
* Restore MSI-X registers and table during resume. If MSI-X is
* enabled then walk the virtual table to restore the actual MSI-X
* table.
*/
static void
pci_resume_msix(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct msix_table_entry *mte;
struct msix_vector *mv;
int i;
if (msix->msix_alloc > 0) {
/* First, mask all vectors. */
for (i = 0; i < msix->msix_msgnum; i++)
pci_mask_msix(dev, i);
/* Second, program any messages with at least one handler. */
for (i = 0; i < msix->msix_table_len; i++) {
mte = &msix->msix_table[i];
if (mte->mte_vector == 0 || mte->mte_handlers == 0)
continue;
mv = &msix->msix_vectors[mte->mte_vector - 1];
pci_write_msix_entry(dev, i, mv->mv_address,
mv->mv_data);
pci_unmask_msix(dev, i);
}
}
pci_write_config(dev, msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl, 2);
}
/*
* Attempt to allocate *count MSI-X messages. The actual number allocated is
* returned in *count. After this function returns, each message will be
* available to the driver as SYS_RES_IRQ resources starting at rid 1.
*/
int
pci_alloc_msix_method(device_t dev, device_t child, int *count)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
int actual, error, i, irq, max;
/* Don't let count == 0 get us into trouble. */
if (*count == 0)
return (EINVAL);
/* If rid 0 is allocated, then fail. */
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0);
if (rle != NULL && rle->res != NULL)
return (ENXIO);
/* Already have allocated messages? */
if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0)
return (ENXIO);
/* If MSI-X is blacklisted for this system, fail. */
if (pci_msix_blacklisted())
return (ENXIO);
/* MSI-X capability present? */
if (cfg->msix.msix_location == 0 || !pci_do_msix)
return (ENODEV);
/* Make sure the appropriate BARs are mapped. */
rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY,
cfg->msix.msix_table_bar);
if (rle == NULL || rle->res == NULL ||
!(rman_get_flags(rle->res) & RF_ACTIVE))
return (ENXIO);
cfg->msix.msix_table_res = rle->res;
if (cfg->msix.msix_pba_bar != cfg->msix.msix_table_bar) {
rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY,
cfg->msix.msix_pba_bar);
if (rle == NULL || rle->res == NULL ||
!(rman_get_flags(rle->res) & RF_ACTIVE))
return (ENXIO);
}
cfg->msix.msix_pba_res = rle->res;
if (bootverbose)
device_printf(child,
"attempting to allocate %d MSI-X vectors (%d supported)\n",
*count, cfg->msix.msix_msgnum);
max = min(*count, cfg->msix.msix_msgnum);
for (i = 0; i < max; i++) {
/* Allocate a message. */
error = PCIB_ALLOC_MSIX(device_get_parent(dev), child, &irq);
if (error) {
if (i == 0)
return (error);
break;
}
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
actual = i;
if (bootverbose) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 1);
if (actual == 1)
device_printf(child, "using IRQ %ju for MSI-X\n",
rle->start);
else {
int run;
/*
* Be fancy and try to print contiguous runs of
* IRQ values as ranges. 'irq' is the previous IRQ.
* 'run' is true if we are in a range.
*/
device_printf(child, "using IRQs %ju", rle->start);
irq = rle->start;
run = 0;
for (i = 1; i < actual; i++) {
rle = resource_list_find(&dinfo->resources,
SYS_RES_IRQ, i + 1);
/* Still in a run? */
if (rle->start == irq + 1) {
run = 1;
irq++;
continue;
}
/* Finish previous range. */
if (run) {
printf("-%d", irq);
run = 0;
}
/* Start new range. */
printf(",%ju", rle->start);
irq = rle->start;
}
/* Unfinished range? */
if (run)
printf("-%d", irq);
printf(" for MSI-X\n");
}
}
/* Mask all vectors. */
for (i = 0; i < cfg->msix.msix_msgnum; i++)
pci_mask_msix(child, i);
/* Allocate and initialize vector data and virtual table. */
cfg->msix.msix_vectors = malloc(sizeof(struct msix_vector) * actual,
M_DEVBUF, M_WAITOK | M_ZERO);
cfg->msix.msix_table = malloc(sizeof(struct msix_table_entry) * actual,
M_DEVBUF, M_WAITOK | M_ZERO);
for (i = 0; i < actual; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
cfg->msix.msix_vectors[i].mv_irq = rle->start;
cfg->msix.msix_table[i].mte_vector = i + 1;
}
/* Update control register to enable MSI-X. */
cfg->msix.msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
pci_write_config(child, cfg->msix.msix_location + PCIR_MSIX_CTRL,
cfg->msix.msix_ctrl, 2);
/* Update counts of alloc'd messages. */
cfg->msix.msix_alloc = actual;
cfg->msix.msix_table_len = actual;
*count = actual;
return (0);
}
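/*
 * Illustrative sketch (not part of this file, error handling elided):
 * typical driver-side MSI-X allocation against the method above.  The
 * BAR holding the MSI-X table must already be allocated and active,
 * as checked above.  The softc layout and function name are
 * hypothetical.
 */
#if 0
static int
mydrv_setup_msix(device_t dev, struct mydrv_softc *sc)
{
	int count, error, rid;

	rid = PCIR_BAR(0);
	sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
	    RF_ACTIVE);
	if (sc->sc_mem == NULL)
		return (ENXIO);

	count = pci_msix_count(dev);
	if (count == 0)
		return (ENODEV);
	error = pci_alloc_msix(dev, &count);
	if (error != 0)
		return (error);
	/* 'count' now holds the number of messages actually granted. */
	sc->sc_nirq = count;

	rid = 1;	/* MSI-X vectors start at SYS_RES_IRQ rid 1. */
	sc->sc_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
	    RF_ACTIVE);
	return (sc->sc_irq != NULL ? 0 : ENXIO);
}
#endif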
/*
* By default, pci_alloc_msix() will assign the allocated IRQ
* resources consecutively to the first N messages in the MSI-X table.
* However, device drivers may want to use different layouts if they
* either receive fewer messages than they asked for, or they wish to
* populate the MSI-X table sparsely. This method allows the driver
* to specify what layout it wants. It must be called after a
* successful pci_alloc_msix() but before any of the associated
* SYS_RES_IRQ resources are allocated via bus_alloc_resource().
*
* The 'vectors' array contains 'count' message vectors. The array
* maps directly to the MSI-X table in that index 0 in the array
* specifies the vector for the first message in the MSI-X table, etc.
* The vector value in each array index can either be 0 to indicate
* that no vector should be assigned to a message slot, or it can be a
* number from 1 to N (where N is the count returned from a
* successful call to pci_alloc_msix()) to indicate which message
* vector (IRQ) should be used for the corresponding message.
*
* On successful return, each message with a non-zero vector will have
* an associated SYS_RES_IRQ whose rid is equal to the array index +
* 1. Additionally, if any of the IRQs allocated via the previous
* call to pci_alloc_msix() are not used in the mapping, those IRQs
* will be freed back to the system automatically.
*
* For example, suppose a driver has an MSI-X table with 6 messages and
* asks for 6 messages, but pci_alloc_msix() only returns a count of
* 3. Call the three vectors allocated by pci_alloc_msix() A, B, and
* C. After the call to pci_alloc_msix(), the device will be set up to
* have an MSI-X table of ABC--- (where - means no vector assigned).
* If the driver then passes a vector array of { 1, 0, 1, 2, 0, 2 },
* then the MSI-X table will look like A-AB-B, and the 'C' vector will
* be freed back to the system. This device will also have valid
* SYS_RES_IRQ rids of 1, 3, 4, and 6.
*
* In any case, the SYS_RES_IRQ rid X will always map to the message
* at MSI-X table index X - 1 and will only be valid if a vector is
* assigned to that table entry.
*/
int
pci_remap_msix_method(device_t dev, device_t child, int count,
const u_int *vectors)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct resource_list_entry *rle;
int i, irq, j, *used;
/*
* Have to have at least one message in the table but the
* table can't be bigger than the actual MSI-X table in the
* device.
*/
if (count == 0 || count > msix->msix_msgnum)
return (EINVAL);
/* Sanity check the vectors. */
for (i = 0; i < count; i++)
if (vectors[i] > msix->msix_alloc)
return (EINVAL);
/*
* Make sure there aren't any holes in the vectors to be used.
* It's a big pain to support it, and it doesn't really make
* sense anyway. Also, at least one vector must be used.
*/
used = malloc(sizeof(int) * msix->msix_alloc, M_DEVBUF, M_WAITOK |
M_ZERO);
for (i = 0; i < count; i++)
if (vectors[i] != 0)
used[vectors[i] - 1] = 1;
for (i = 0; i < msix->msix_alloc - 1; i++)
if (used[i] == 0 && used[i + 1] == 1) {
free(used, M_DEVBUF);
return (EINVAL);
}
if (used[0] != 1) {
free(used, M_DEVBUF);
return (EINVAL);
}
/* Make sure none of the resources are allocated. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
if (msix->msix_table[i].mte_handlers > 0) {
free(used, M_DEVBUF);
return (EBUSY);
}
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing resource"));
if (rle->res != NULL) {
free(used, M_DEVBUF);
return (EBUSY);
}
}
/* Free the existing resource list entries. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
}
/*
* Build the new virtual table keeping track of which vectors are
* used.
*/
free(msix->msix_table, M_DEVBUF);
msix->msix_table = malloc(sizeof(struct msix_table_entry) * count,
M_DEVBUF, M_WAITOK | M_ZERO);
for (i = 0; i < count; i++)
msix->msix_table[i].mte_vector = vectors[i];
msix->msix_table_len = count;
/* Free any unused IRQs and resize the vectors array if necessary. */
j = msix->msix_alloc - 1;
if (used[j] == 0) {
struct msix_vector *vec;
while (used[j] == 0) {
PCIB_RELEASE_MSIX(device_get_parent(dev), child,
msix->msix_vectors[j].mv_irq);
j--;
}
vec = malloc(sizeof(struct msix_vector) * (j + 1), M_DEVBUF,
M_WAITOK);
bcopy(msix->msix_vectors, vec, sizeof(struct msix_vector) *
(j + 1));
free(msix->msix_vectors, M_DEVBUF);
msix->msix_vectors = vec;
msix->msix_alloc = j + 1;
}
free(used, M_DEVBUF);
/* Map the IRQs onto the rids. */
for (i = 0; i < count; i++) {
if (vectors[i] == 0)
continue;
irq = msix->msix_vectors[vectors[i] - 1].mv_irq;
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
if (bootverbose) {
device_printf(child, "Remapped MSI-X IRQs as: ");
for (i = 0; i < count; i++) {
if (i != 0)
printf(", ");
if (vectors[i] == 0)
printf("---");
else
printf("%d",
msix->msix_vectors[vectors[i] - 1].mv_irq);
}
printf("\n");
}
return (0);
}
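/*
 * Illustrative sketch (not part of this file): the sparse layout from
 * the comment above expressed as a call.  With 3 messages allocated
 * and a 6-entry MSI-X table, this produces the A-AB-B layout and
 * releases the third IRQ back to the system.  The function name is
 * hypothetical.
 */
#if 0
static int
mydrv_remap_msix(device_t dev)
{
	static const u_int vectors[] = { 1, 0, 1, 2, 0, 2 };

	return (pci_remap_msix(dev, nitems(vectors), vectors));
}
#endif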
static int
pci_release_msix(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct resource_list_entry *rle;
int i;
/* Do we have any messages to release? */
if (msix->msix_alloc == 0)
return (ENODEV);
/* Make sure none of the resources are allocated. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
if (msix->msix_table[i].mte_handlers > 0)
return (EBUSY);
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing resource"));
if (rle->res != NULL)
return (EBUSY);
}
/* Update control register to disable MSI-X. */
msix->msix_ctrl &= ~PCIM_MSIXCTRL_MSIX_ENABLE;
pci_write_config(child, msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl, 2);
/* Free the resource list entries. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
}
free(msix->msix_table, M_DEVBUF);
msix->msix_table_len = 0;
/* Release the IRQs. */
for (i = 0; i < msix->msix_alloc; i++)
PCIB_RELEASE_MSIX(device_get_parent(dev), child,
msix->msix_vectors[i].mv_irq);
free(msix->msix_vectors, M_DEVBUF);
msix->msix_alloc = 0;
return (0);
}
/*
* Return the max supported MSI-X messages this device supports.
* Basically, assuming the MD code can alloc messages, this function
* should return the maximum value that pci_alloc_msix() can return.
* Thus, it is subject to the tunables, etc.
*/
int
pci_msix_count_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_msgnum);
return (0);
}
int
pci_msix_pba_bar_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_pba_bar);
return (-1);
}
int
pci_msix_table_bar_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_table_bar);
return (-1);
}
/*
* HyperTransport MSI mapping control
*/
void
pci_ht_map_msi(device_t dev, uint64_t addr)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_ht *ht = &dinfo->cfg.ht;
if (!ht->ht_msimap)
return;
if (addr && !(ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) &&
ht->ht_msiaddr >> 20 == addr >> 20) {
/* Enable MSI -> HT mapping. */
ht->ht_msictrl |= PCIM_HTCMD_MSI_ENABLE;
pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND,
ht->ht_msictrl, 2);
}
if (!addr && ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) {
/* Disable MSI -> HT mapping. */
ht->ht_msictrl &= ~PCIM_HTCMD_MSI_ENABLE;
pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND,
ht->ht_msictrl, 2);
}
}
int
pci_get_max_payload(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= PCIEM_CTL_MAX_PAYLOAD;
val >>= 5;
return (1 << (val + 7));
}
int
pci_get_max_read_req(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= PCIEM_CTL_MAX_READ_REQUEST;
val >>= 12;
return (1 << (val + 7));
}
int
pci_set_max_read_req(device_t dev, int size)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
if (size < 128)
size = 128;
if (size > 4096)
size = 4096;
size = (1 << (fls(size) - 1));
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= ~PCIEM_CTL_MAX_READ_REQUEST;
val |= (fls(size) - 8) << 12;
pci_write_config(dev, cap + PCIER_DEVICE_CTL, val, 2);
return (size);
}
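/*
 * Illustrative sketch (not part of this file): a driver tuning its
 * DMA read requests using the accessors above.  The 4096-byte target
 * is an arbitrary example; pci_set_max_read_req() rounds to a valid
 * power of two and returns the value actually programmed.  The
 * function name is hypothetical.
 */
#if 0
static void
mydrv_tune_pcie(device_t dev)
{
	int mps, mrrs;

	mps = pci_get_max_payload(dev);
	mrrs = pci_set_max_read_req(dev, 4096);
	device_printf(dev, "max payload %d bytes, max read request %d bytes\n",
	    mps, mrrs);
}
#endif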
uint32_t
pcie_read_config(device_t dev, int reg, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0) {
if (width == 2)
return (0xffff);
return (0xffffffff);
}
return (pci_read_config(dev, cap + reg, width));
}
void
pcie_write_config(device_t dev, int reg, uint32_t value, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return;
pci_write_config(dev, cap + reg, value, width);
}
/*
* Adjusts a PCI-e capability register by clearing the bits in mask
* and setting the bits in (value & mask). Bits not set in mask are
* not adjusted.
*
* Returns the old value on success or all ones on failure.
*/
uint32_t
pcie_adjust_config(device_t dev, int reg, uint32_t mask, uint32_t value,
int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint32_t old, new;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0) {
if (width == 2)
return (0xffff);
return (0xffffffff);
}
old = pci_read_config(dev, cap + reg, width);
new = old & ~mask;
new |= (value & mask);
pci_write_config(dev, cap + reg, new, width);
return (old);
}
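/*
 * Illustrative sketch (not part of this file): a read-modify-write of
 * the PCIe device control register through pcie_adjust_config().
 * Here the relaxed-ordering enable bit (PCIEM_CTL_RELAXED_ORD_ENABLE
 * from <dev/pci/pcireg.h>) is set while all other bits are left
 * untouched; the previous register value is returned.  The function
 * name is hypothetical.
 */
#if 0
static uint32_t
mydrv_enable_relaxed_ordering(device_t dev)
{
	return (pcie_adjust_config(dev, PCIER_DEVICE_CTL,
	    PCIEM_CTL_RELAXED_ORD_ENABLE, PCIEM_CTL_RELAXED_ORD_ENABLE, 2));
}
#endif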
/*
* Support for MSI message signalled interrupts.
*/
void
pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
uint16_t data)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
/* Write data and address values. */
pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR,
address & 0xffffffff, 4);
if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) {
pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR_HIGH,
address >> 32, 4);
pci_write_config(child, msi->msi_location + PCIR_MSI_DATA_64BIT,
data, 2);
} else
pci_write_config(child, msi->msi_location + PCIR_MSI_DATA, data,
2);
/* Enable MSI in the control register. */
msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
/* Enable MSI -> HT mapping. */
pci_ht_map_msi(child, address);
}
void
pci_disable_msi_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
/* Disable MSI -> HT mapping. */
pci_ht_map_msi(child, 0);
/* Disable MSI in the control register. */
msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
}
/*
* Restore MSI registers during resume. If MSI is enabled then
* restore the data and address registers in addition to the control
* register.
*/
static void
pci_resume_msi(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
uint64_t address;
uint16_t data;
if (msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE) {
address = msi->msi_addr;
data = msi->msi_data;
pci_write_config(dev, msi->msi_location + PCIR_MSI_ADDR,
address & 0xffffffff, 4);
if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) {
pci_write_config(dev, msi->msi_location +
PCIR_MSI_ADDR_HIGH, address >> 32, 4);
pci_write_config(dev, msi->msi_location +
PCIR_MSI_DATA_64BIT, data, 2);
} else
pci_write_config(dev, msi->msi_location + PCIR_MSI_DATA,
data, 2);
}
pci_write_config(dev, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl,
2);
}
static int
pci_remap_intr_method(device_t bus, device_t dev, u_int irq)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
struct msix_table_entry *mte;
struct msix_vector *mv;
uint64_t addr;
uint32_t data;
int error, i, j;
/*
* Handle MSI first. We try to find this IRQ among our list
* of MSI IRQs. If we find it, we request updated address and
* data registers and apply the results.
*/
if (cfg->msi.msi_alloc > 0) {
/* If we don't have any active handlers, nothing to do. */
if (cfg->msi.msi_handlers == 0)
return (0);
for (i = 0; i < cfg->msi.msi_alloc; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ,
i + 1);
if (rle->start == irq) {
error = PCIB_MAP_MSI(device_get_parent(bus),
dev, irq, &addr, &data);
if (error)
return (error);
pci_disable_msi(dev);
dinfo->cfg.msi.msi_addr = addr;
dinfo->cfg.msi.msi_data = data;
pci_enable_msi(dev, addr, data);
return (0);
}
}
return (ENOENT);
}
/*
* For MSI-X, we check to see if we have this IRQ. If we do,
* we request the updated mapping info. If that works, we go
* through all the slots that use this IRQ and update them.
*/
if (cfg->msix.msix_alloc > 0) {
for (i = 0; i < cfg->msix.msix_alloc; i++) {
mv = &cfg->msix.msix_vectors[i];
if (mv->mv_irq == irq) {
error = PCIB_MAP_MSI(device_get_parent(bus),
dev, irq, &addr, &data);
if (error)
return (error);
mv->mv_address = addr;
mv->mv_data = data;
for (j = 0; j < cfg->msix.msix_table_len; j++) {
mte = &cfg->msix.msix_table[j];
if (mte->mte_vector != i + 1)
continue;
if (mte->mte_handlers == 0)
continue;
pci_mask_msix(dev, j);
pci_enable_msix(dev, j, addr, data);
pci_unmask_msix(dev, j);
}
}
}
return (ENOENT);
}
return (ENOENT);
}
/*
* Returns true if the specified device is blacklisted because MSI
* doesn't work.
*/
int
pci_msi_device_blacklisted(device_t dev)
{
if (!pci_honor_msi_blacklist)
return (0);
return (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSI));
}
/*
* Determine if MSI is blacklisted globally on this system. Currently,
* we just check for blacklisted chipsets as represented by the
* host-PCI bridge at device 0:0:0. In the future, it may become
* necessary to check other system attributes, such as the kenv values
* that give the motherboard manufacturer and model number.
*/
static int
pci_msi_blacklisted(void)
{
device_t dev;
if (!pci_honor_msi_blacklist)
return (0);
/* Blacklist all non-PCI-express and non-PCI-X chipsets. */
if (!(pcie_chipset || pcix_chipset)) {
if (vm_guest != VM_GUEST_NO) {
/*
* Whitelist older chipsets in virtual
* machines known to support MSI.
*/
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL)
return (!pci_has_quirk(pci_get_devid(dev),
PCI_QUIRK_ENABLE_MSI_VM));
}
return (1);
}
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL)
return (pci_msi_device_blacklisted(dev));
return (0);
}
/*
* Returns true if the specified device is blacklisted because MSI-X
* doesn't work. Note that this assumes that if MSI doesn't work,
* MSI-X doesn't either.
*/
int
pci_msix_device_blacklisted(device_t dev)
{
if (!pci_honor_msi_blacklist)
return (0);
if (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSIX))
return (1);
return (pci_msi_device_blacklisted(dev));
}
/*
* Determine if MSI-X is blacklisted globally on this system. If MSI
* is blacklisted, assume that MSI-X is as well. Check for additional
* chipsets where MSI works but MSI-X does not.
*/
static int
pci_msix_blacklisted(void)
{
device_t dev;
if (!pci_honor_msi_blacklist)
return (0);
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL && pci_has_quirk(pci_get_devid(dev),
PCI_QUIRK_DISABLE_MSIX))
return (1);
return (pci_msi_blacklisted());
}
/*
* Attempt to allocate *count MSI messages. The actual number allocated is
* returned in *count. After this function returns, each message will be
* available to the driver as SYS_RES_IRQ resources starting at rid 1.
*/
int
pci_alloc_msi_method(device_t dev, device_t child, int *count)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
int actual, error, i, irqs[32];
uint16_t ctrl;
/* Don't let count == 0 get us into trouble. */
if (*count == 0)
return (EINVAL);
/* If rid 0 is allocated, then fail. */
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0);
if (rle != NULL && rle->res != NULL)
return (ENXIO);
/* Already have allocated messages? */
if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0)
return (ENXIO);
/* If MSI is blacklisted for this system, fail. */
if (pci_msi_blacklisted())
return (ENXIO);
/* MSI capability present? */
if (cfg->msi.msi_location == 0 || !pci_do_msi)
return (ENODEV);
if (bootverbose)
device_printf(child,
"attempting to allocate %d MSI vectors (%d supported)\n",
*count, cfg->msi.msi_msgnum);
/* Don't ask for more than the device supports. */
actual = min(*count, cfg->msi.msi_msgnum);
/* Don't ask for more than 32 messages. */
actual = min(actual, 32);
/* MSI requires power of 2 number of messages. */
if (!powerof2(actual))
return (EINVAL);
for (;;) {
/* Try to allocate N messages. */
error = PCIB_ALLOC_MSI(device_get_parent(dev), child, actual,
actual, irqs);
if (error == 0)
break;
if (actual == 1)
return (error);
/* Try N / 2. */
actual >>= 1;
}
/*
* We now have N actual messages mapped onto SYS_RES_IRQ
* resources in the irqs[] array, so add new resources
* starting at rid 1.
*/
for (i = 0; i < actual; i++)
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1,
irqs[i], irqs[i], 1);
if (bootverbose) {
if (actual == 1)
device_printf(child, "using IRQ %d for MSI\n", irqs[0]);
else {
int run;
/*
* Be fancy and try to print contiguous runs
* of IRQ values as ranges. 'run' is true if
* we are in a range.
*/
device_printf(child, "using IRQs %d", irqs[0]);
run = 0;
for (i = 1; i < actual; i++) {
/* Still in a run? */
if (irqs[i] == irqs[i - 1] + 1) {
run = 1;
continue;
}
/* Finish previous range. */
if (run) {
printf("-%d", irqs[i - 1]);
run = 0;
}
/* Start new range. */
printf(",%d", irqs[i]);
}
/* Unfinished range? */
if (run)
printf("-%d", irqs[actual - 1]);
printf(" for MSI\n");
}
}
/* Update control register with actual count. */
ctrl = cfg->msi.msi_ctrl;
ctrl &= ~PCIM_MSICTRL_MME_MASK;
ctrl |= (ffs(actual) - 1) << 4;
cfg->msi.msi_ctrl = ctrl;
pci_write_config(child, cfg->msi.msi_location + PCIR_MSI_CTRL, ctrl, 2);
/* Update counts of alloc'd messages. */
cfg->msi.msi_alloc = actual;
cfg->msi.msi_handlers = 0;
*count = actual;
return (0);
}
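/*
 * Illustrative sketch (not part of this file): single-message MSI
 * setup from a driver's attach path using the method above, falling
 * back to the legacy INTx interrupt at rid 0 if MSI is unavailable.
 * The softc layout, mydrv_intr() handler, and function name are
 * hypothetical.
 */
#if 0
static int
mydrv_setup_intr(device_t dev, struct mydrv_softc *sc)
{
	int count, rid;

	count = 1;
	if (pci_alloc_msi(dev, &count) == 0)
		rid = 1;	/* MSI messages start at SYS_RES_IRQ rid 1. */
	else
		rid = 0;	/* Fall back to the legacy INTx interrupt. */
	sc->sc_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
	    RF_ACTIVE | (rid == 0 ? RF_SHAREABLE : 0));
	if (sc->sc_irq == NULL)
		return (ENXIO);
	return (bus_setup_intr(dev, sc->sc_irq, INTR_TYPE_MISC | INTR_MPSAFE,
	    NULL, mydrv_intr, sc, &sc->sc_intrhand));
}
#endif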
/* Release the MSI messages associated with this device. */
int
pci_release_msi_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
struct resource_list_entry *rle;
int error, i, irqs[32];
/* Try MSI-X first. */
error = pci_release_msix(dev, child);
if (error != ENODEV)
return (error);
/* Do we have any messages to release? */
if (msi->msi_alloc == 0)
return (ENODEV);
KASSERT(msi->msi_alloc <= 32, ("more than 32 alloc'd messages"));
/* Make sure none of the resources are allocated. */
if (msi->msi_handlers > 0)
return (EBUSY);
for (i = 0; i < msi->msi_alloc; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing MSI resource"));
if (rle->res != NULL)
return (EBUSY);
irqs[i] = rle->start;
}
/* Update control register with 0 count. */
KASSERT(!(msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE),
("%s: MSI still enabled", __func__));
msi->msi_ctrl &= ~PCIM_MSICTRL_MME_MASK;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
/* Release the messages. */
PCIB_RELEASE_MSI(device_get_parent(dev), child, msi->msi_alloc, irqs);
for (i = 0; i < msi->msi_alloc; i++)
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
/* Update alloc count. */
msi->msi_alloc = 0;
msi->msi_addr = 0;
msi->msi_data = 0;
return (0);
}
/*
* Return the max supported MSI messages this device supports.
* Basically, assuming the MD code can alloc messages, this function
* should return the maximum value that pci_alloc_msi() can return.
* Thus, it is subject to the tunables, etc.
*/
int
pci_msi_count_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
if (pci_do_msi && msi->msi_location != 0)
return (msi->msi_msgnum);
return (0);
}
/* Free the pcicfgregs structure and all dependent data structures. */
int
pci_freecfg(struct pci_devinfo *dinfo)
{
struct devlist *devlist_head;
struct pci_map *pm, *next;
int i;
devlist_head = &pci_devq;
if (dinfo->cfg.vpd.vpd_reg) {
free(dinfo->cfg.vpd.vpd_ident, M_DEVBUF);
for (i = 0; i < dinfo->cfg.vpd.vpd_rocnt; i++)
free(dinfo->cfg.vpd.vpd_ros[i].value, M_DEVBUF);
free(dinfo->cfg.vpd.vpd_ros, M_DEVBUF);
for (i = 0; i < dinfo->cfg.vpd.vpd_wcnt; i++)
free(dinfo->cfg.vpd.vpd_w[i].value, M_DEVBUF);
free(dinfo->cfg.vpd.vpd_w, M_DEVBUF);
}
STAILQ_FOREACH_SAFE(pm, &dinfo->cfg.maps, pm_link, next) {
free(pm, M_DEVBUF);
}
STAILQ_REMOVE(devlist_head, dinfo, pci_devinfo, pci_links);
free(dinfo, M_DEVBUF);
/* increment the generation count */
pci_generation++;
/* we're losing one device */
pci_numdevs--;
return (0);
}
/*
* PCI power management
*/
int
pci_set_powerstate_method(device_t dev, device_t child, int state)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint16_t status;
int oldstate, highest, delay;
if (cfg->pp.pp_cap == 0)
return (EOPNOTSUPP);
/*
* Optimize away a request for no state change. While it would be OK to
* write to the hardware in theory, some devices have shown odd
* behavior when going from D3 -> D3.
*/
oldstate = pci_get_powerstate(child);
if (oldstate == state)
return (0);
/*
* The PCI power management specification states that after a state
* transition between PCI power states, system software must
* guarantee a minimal delay before the function accesses the device.
* Compute the worst case delay that we need to guarantee before we
* access the device. Many devices will be responsive much more
* quickly than this delay, but there are some that don't respond
* instantly to state changes. Transitions to/from D3 state require
* 10ms, while D2 requires 200us, and D0/1 require none. The delay
* is done below with DELAY rather than a sleeper function because
* this function can be called from contexts where we cannot sleep.
*/
highest = (oldstate > state) ? oldstate : state;
if (highest == PCI_POWERSTATE_D3)
delay = 10000;
else if (highest == PCI_POWERSTATE_D2)
delay = 200;
else
delay = 0;
status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2)
& ~PCIM_PSTAT_DMASK;
switch (state) {
case PCI_POWERSTATE_D0:
status |= PCIM_PSTAT_D0;
break;
case PCI_POWERSTATE_D1:
if ((cfg->pp.pp_cap & PCIM_PCAP_D1SUPP) == 0)
return (EOPNOTSUPP);
status |= PCIM_PSTAT_D1;
break;
case PCI_POWERSTATE_D2:
if ((cfg->pp.pp_cap & PCIM_PCAP_D2SUPP) == 0)
return (EOPNOTSUPP);
status |= PCIM_PSTAT_D2;
break;
case PCI_POWERSTATE_D3:
status |= PCIM_PSTAT_D3;
break;
default:
return (EINVAL);
}
if (bootverbose)
pci_printf(cfg, "Transition from D%d to D%d\n", oldstate,
state);
PCI_WRITE_CONFIG(dev, child, cfg->pp.pp_status, status, 2);
if (delay)
DELAY(delay);
return (0);
}
int
pci_get_powerstate_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint16_t status;
int result;
if (cfg->pp.pp_cap != 0) {
status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2);
switch (status & PCIM_PSTAT_DMASK) {
case PCIM_PSTAT_D0:
result = PCI_POWERSTATE_D0;
break;
case PCIM_PSTAT_D1:
result = PCI_POWERSTATE_D1;
break;
case PCIM_PSTAT_D2:
result = PCI_POWERSTATE_D2;
break;
case PCIM_PSTAT_D3:
result = PCI_POWERSTATE_D3;
break;
default:
result = PCI_POWERSTATE_UNKNOWN;
break;
}
} else {
/* No support, device is always at D0 */
result = PCI_POWERSTATE_D0;
}
return (result);
}
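/*
 * Illustrative sketch (not part of this file): how a driver's suspend
 * and resume methods might use the power-state accessors above.  The
 * mandated settle delay after a transition is handled inside
 * pci_set_powerstate_method(), so callers need not DELAY() themselves.
 * The function names are hypothetical.
 */
#if 0
static int
mydrv_suspend(device_t dev)
{
	if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3)
		pci_set_powerstate(dev, PCI_POWERSTATE_D3);
	return (0);
}

static int
mydrv_resume(device_t dev)
{
	pci_set_powerstate(dev, PCI_POWERSTATE_D0);
	return (0);
}
#endif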
/*
* Some convenience functions for PCI device drivers.
*/
static __inline void
pci_set_command_bit(device_t dev, device_t child, uint16_t bit)
{
uint16_t command;
command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2);
command |= bit;
PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2);
}
static __inline void
pci_clear_command_bit(device_t dev, device_t child, uint16_t bit)
{
uint16_t command;
command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2);
command &= ~bit;
PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2);
}
int
pci_enable_busmaster_method(device_t dev, device_t child)
{
pci_set_command_bit(dev, child, PCIM_CMD_BUSMASTEREN);
return (0);
}
int
pci_disable_busmaster_method(device_t dev, device_t child)
{
pci_clear_command_bit(dev, child, PCIM_CMD_BUSMASTEREN);
return (0);
}
int
pci_enable_io_method(device_t dev, device_t child, int space)
{
uint16_t bit;
switch(space) {
case SYS_RES_IOPORT:
bit = PCIM_CMD_PORTEN;
break;
case SYS_RES_MEMORY:
bit = PCIM_CMD_MEMEN;
break;
default:
return (EINVAL);
}
pci_set_command_bit(dev, child, bit);
return (0);
}
int
pci_disable_io_method(device_t dev, device_t child, int space)
{
uint16_t bit;
switch(space) {
case SYS_RES_IOPORT:
bit = PCIM_CMD_PORTEN;
break;
case SYS_RES_MEMORY:
bit = PCIM_CMD_MEMEN;
break;
default:
return (EINVAL);
}
pci_clear_command_bit(dev, child, bit);
return (0);
}
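/*
 * Illustrative sketch (not part of this file): the usual attach-time
 * use of the command-register helpers above, enabling bus mastering
 * and memory-space decoding before touching the device.  The function
 * name is hypothetical.
 */
#if 0
static int
mydrv_attach(device_t dev)
{
	pci_enable_busmaster(dev);
	pci_enable_io(dev, SYS_RES_MEMORY);
	/* ... BAR and interrupt setup would follow here ... */
	return (0);
}
#endif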
/*
* New style pci driver. Parent device is either a pci-host-bridge or a
* pci-pci-bridge. Both kinds are represented by instances of pcib.
*/
void
pci_print_verbose(struct pci_devinfo *dinfo)
{
if (bootverbose) {
pcicfgregs *cfg = &dinfo->cfg;
printf("found->\tvendor=0x%04x, dev=0x%04x, revid=0x%02x\n",
cfg->vendor, cfg->device, cfg->revid);
printf("\tdomain=%d, bus=%d, slot=%d, func=%d\n",
cfg->domain, cfg->bus, cfg->slot, cfg->func);
printf("\tclass=%02x-%02x-%02x, hdrtype=0x%02x, mfdev=%d\n",
cfg->baseclass, cfg->subclass, cfg->progif, cfg->hdrtype,
cfg->mfdev);
printf("\tcmdreg=0x%04x, statreg=0x%04x, cachelnsz=%d (dwords)\n",
cfg->cmdreg, cfg->statreg, cfg->cachelnsz);
printf("\tlattimer=0x%02x (%d ns), mingnt=0x%02x (%d ns), maxlat=0x%02x (%d ns)\n",
cfg->lattimer, cfg->lattimer * 30, cfg->mingnt,
cfg->mingnt * 250, cfg->maxlat, cfg->maxlat * 250);
if (cfg->intpin > 0)
printf("\tintpin=%c, irq=%d\n",
cfg->intpin +'a' -1, cfg->intline);
if (cfg->pp.pp_cap) {
uint16_t status;
status = pci_read_config(cfg->dev, cfg->pp.pp_status, 2);
printf("\tpowerspec %d supports D0%s%s D3 current D%d\n",
cfg->pp.pp_cap & PCIM_PCAP_SPEC,
cfg->pp.pp_cap & PCIM_PCAP_D1SUPP ? " D1" : "",
cfg->pp.pp_cap & PCIM_PCAP_D2SUPP ? " D2" : "",
status & PCIM_PSTAT_DMASK);
}
if (cfg->msi.msi_location) {
int ctrl;
ctrl = cfg->msi.msi_ctrl;
printf("\tMSI supports %d message%s%s%s\n",
cfg->msi.msi_msgnum,
(cfg->msi.msi_msgnum == 1) ? "" : "s",
(ctrl & PCIM_MSICTRL_64BIT) ? ", 64 bit" : "",
(ctrl & PCIM_MSICTRL_VECTOR) ? ", vector masks":"");
}
if (cfg->msix.msix_location) {
printf("\tMSI-X supports %d message%s ",
cfg->msix.msix_msgnum,
(cfg->msix.msix_msgnum == 1) ? "" : "s");
if (cfg->msix.msix_table_bar == cfg->msix.msix_pba_bar)
printf("in map 0x%x\n",
cfg->msix.msix_table_bar);
else
printf("in maps 0x%x and 0x%x\n",
cfg->msix.msix_table_bar,
cfg->msix.msix_pba_bar);
}
}
}
static int
pci_porten(device_t dev)
{
return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_PORTEN) != 0;
}
static int
pci_memen(device_t dev)
{
return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_MEMEN) != 0;
}
void
pci_read_bar(device_t dev, int reg, pci_addr_t *mapp, pci_addr_t *testvalp,
int *bar64)
{
struct pci_devinfo *dinfo;
pci_addr_t map, testval;
int ln2range;
uint16_t cmd;
/*
* The device ROM BAR is special. It is always a 32-bit
* memory BAR. Bit 0 is special and should not be set when
* sizing the BAR.
*/
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, reg)) {
map = pci_read_config(dev, reg, 4);
pci_write_config(dev, reg, 0xfffffffe, 4);
testval = pci_read_config(dev, reg, 4);
pci_write_config(dev, reg, map, 4);
*mapp = map;
*testvalp = testval;
if (bar64 != NULL)
*bar64 = 0;
return;
}
map = pci_read_config(dev, reg, 4);
ln2range = pci_maprange(map);
if (ln2range == 64)
map |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32;
/*
* Disable decoding via the command register before
* determining the BAR's length since we will be placing it in
* a weird state.
*/
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
pci_write_config(dev, PCIR_COMMAND,
cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2);
/*
* Determine the BAR's length by writing all 1's. The bottom
* log_2(size) bits of the BAR will stick as 0 when we read
* the value back.
*
* NB: according to the PCI Local Bus Specification, rev. 3.0:
* "Software writes 0FFFFFFFFh to both registers, reads them back,
* and combines the result into a 64-bit value." (section 6.2.5.1)
*
* Writes to both registers must be performed before attempting to
* read back the size value.
*/
testval = 0;
pci_write_config(dev, reg, 0xffffffff, 4);
if (ln2range == 64) {
pci_write_config(dev, reg + 4, 0xffffffff, 4);
testval |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32;
}
testval |= pci_read_config(dev, reg, 4);
/*
* Restore the original value of the BAR. We may have reprogrammed
* the BAR of the low-level console device and when booting verbose,
* we need the console device addressable.
*/
pci_write_config(dev, reg, map, 4);
if (ln2range == 64)
pci_write_config(dev, reg + 4, map >> 32, 4);
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
*mapp = map;
*testvalp = testval;
if (bar64 != NULL)
*bar64 = (ln2range == 64);
}
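/*
 * Illustrative sketch (not part of this file): the standard BAR-size
 * arithmetic implied by the comment above, for a 32-bit memory BAR
 * only.  A test value of 0xfffff000 read back after writing all 1's
 * means the low 12 bits are hard-wired to zero, i.e. a 4 KB BAR.
 */
#if 0
static uint32_t
bar_size_from_testval(uint32_t testval)
{
	/* Clear the low flag bits; the size is the two's complement. */
	return (~(testval & PCIM_BAR_MEM_BASE) + 1);
}
#endif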
static void
pci_write_bar(device_t dev, struct pci_map *pm, pci_addr_t base)
{
struct pci_devinfo *dinfo;
int ln2range;
/* The device ROM BAR is always a 32-bit memory BAR. */
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg))
ln2range = 32;
else
ln2range = pci_maprange(pm->pm_value);
pci_write_config(dev, pm->pm_reg, base, 4);
if (ln2range == 64)
pci_write_config(dev, pm->pm_reg + 4, base >> 32, 4);
pm->pm_value = pci_read_config(dev, pm->pm_reg, 4);
if (ln2range == 64)
pm->pm_value |= (pci_addr_t)pci_read_config(dev,
pm->pm_reg + 4, 4) << 32;
}
struct pci_map *
pci_find_bar(device_t dev, int reg)
{
struct pci_devinfo *dinfo;
struct pci_map *pm;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) {
if (pm->pm_reg == reg)
return (pm);
}
return (NULL);
}
int
pci_bar_enabled(device_t dev, struct pci_map *pm)
{
struct pci_devinfo *dinfo;
uint16_t cmd;
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) &&
!(pm->pm_value & PCIM_BIOS_ENABLE))
return (0);
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) || PCI_BAR_MEM(pm->pm_value))
return ((cmd & PCIM_CMD_MEMEN) != 0);
else
return ((cmd & PCIM_CMD_PORTEN) != 0);
}
struct pci_map *
pci_add_bar(device_t dev, int reg, pci_addr_t value, pci_addr_t size)
{
struct pci_devinfo *dinfo;
struct pci_map *pm, *prev;
dinfo = device_get_ivars(dev);
pm = malloc(sizeof(*pm), M_DEVBUF, M_WAITOK | M_ZERO);
pm->pm_reg = reg;
pm->pm_value = value;
pm->pm_size = size;
STAILQ_FOREACH(prev, &dinfo->cfg.maps, pm_link) {
KASSERT(prev->pm_reg != pm->pm_reg, ("duplicate map %02x",
reg));
if (STAILQ_NEXT(prev, pm_link) == NULL ||
STAILQ_NEXT(prev, pm_link)->pm_reg > pm->pm_reg)
break;
}
if (prev != NULL)
STAILQ_INSERT_AFTER(&dinfo->cfg.maps, prev, pm, pm_link);
else
STAILQ_INSERT_TAIL(&dinfo->cfg.maps, pm, pm_link);
return (pm);
}
static void
pci_restore_bars(device_t dev)
{
struct pci_devinfo *dinfo;
struct pci_map *pm;
int ln2range;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) {
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg))
ln2range = 32;
else
ln2range = pci_maprange(pm->pm_value);
pci_write_config(dev, pm->pm_reg, pm->pm_value, 4);
if (ln2range == 64)
pci_write_config(dev, pm->pm_reg + 4,
pm->pm_value >> 32, 4);
}
}
/*
* Add a resource based on a PCI map register. Return 1 if the map
* register is a 32-bit map register or 2 if it is a 64-bit register.
*/
static int
pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl,
int force, int prefetch)
{
struct pci_map *pm;
pci_addr_t base, map, testval;
pci_addr_t start, end, count;
int barlen, basezero, flags, maprange, mapsize, type;
uint16_t cmd;
struct resource *res;
/*
* The BAR may already exist if the device is a CardBus card
* whose CIS is stored in this BAR.
*/
pm = pci_find_bar(dev, reg);
if (pm != NULL) {
maprange = pci_maprange(pm->pm_value);
barlen = maprange == 64 ? 2 : 1;
return (barlen);
}
pci_read_bar(dev, reg, &map, &testval, NULL);
if (PCI_BAR_MEM(map)) {
type = SYS_RES_MEMORY;
if (map & PCIM_BAR_MEM_PREFETCH)
prefetch = 1;
} else
type = SYS_RES_IOPORT;
mapsize = pci_mapsize(testval);
base = pci_mapbase(map);
#ifdef __PCI_BAR_ZERO_VALID
basezero = 0;
#else
basezero = base == 0;
#endif
maprange = pci_maprange(map);
barlen = maprange == 64 ? 2 : 1;
/*
* For I/O registers, if bottom bit is set, and the next bit up
* isn't clear, we know we have a BAR that doesn't conform to the
* spec, so ignore it. Also, sanity check the size of the data
* areas against the type of resource involved: memory must be at
* least 16 bytes in size, while I/O ranges must be at least 4.
*/
if (PCI_BAR_IO(testval) && (testval & PCIM_BAR_IO_RESERVED) != 0)
return (barlen);
if ((type == SYS_RES_MEMORY && mapsize < 4) ||
(type == SYS_RES_IOPORT && mapsize < 2))
return (barlen);
/* Save a record of this BAR. */
pm = pci_add_bar(dev, reg, map, mapsize);
if (bootverbose) {
printf("\tmap[%02x]: type %s, range %2d, base %#jx, size %2d",
reg, pci_maptype(map), maprange, (uintmax_t)base, mapsize);
if (type == SYS_RES_IOPORT && !pci_porten(dev))
printf(", port disabled\n");
else if (type == SYS_RES_MEMORY && !pci_memen(dev))
printf(", memory disabled\n");
else
printf(", enabled\n");
}
/*
* If base is 0, then we have problems if this architecture does
* not allow that. It is best to ignore such entries for the
* moment. These will be allocated later if the driver specifically
* requests them. However, some removable buses look better when
* all resources are allocated, so allow '0' to be overridden.
*
* Similarly treat maps whose value is the same as the test value
* read back. These maps have had all f's written to them by the
* BIOS in an attempt to disable the resources.
*/
if (!force && (basezero || map == testval))
return (barlen);
if ((u_long)base != base) {
device_printf(bus,
"pci%d:%d:%d:%d bar %#x too many address bits",
pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev),
pci_get_function(dev), reg);
return (barlen);
}
/*
* This code theoretically does the right thing, but has
* undesirable side effects in some cases where peripherals
* respond oddly to having these bits enabled. Allow the user
* to turn them off (since pci_enable_io_modes is 1 by
* default).
*/
if (pci_enable_io_modes) {
/* Turn on resources that have been left off by a lazy BIOS */
if (type == SYS_RES_IOPORT && !pci_porten(dev)) {
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
cmd |= PCIM_CMD_PORTEN;
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
if (type == SYS_RES_MEMORY && !pci_memen(dev)) {
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
cmd |= PCIM_CMD_MEMEN;
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
} else {
if (type == SYS_RES_IOPORT && !pci_porten(dev))
return (barlen);
if (type == SYS_RES_MEMORY && !pci_memen(dev))
return (barlen);
}
count = (pci_addr_t)1 << mapsize;
flags = RF_ALIGNMENT_LOG2(mapsize);
if (prefetch)
flags |= RF_PREFETCHABLE;
if (basezero || base == pci_mapbase(testval) || pci_clear_bars) {
start = 0; /* Let the parent decide. */
end = ~0;
} else {
start = base;
end = base + count - 1;
}
resource_list_add(rl, type, reg, start, end, count);
/*
* Try to allocate the resource for this BAR from our parent
* so that this resource range is already reserved. The
* driver for this device will later inherit this resource in
* pci_alloc_resource().
*/
res = resource_list_reserve(rl, bus, dev, type, &reg, start, end, count,
flags);
if (pci_do_realloc_bars && res == NULL && (start != 0 || end != ~0)) {
/*
* If the allocation fails, try to allocate a resource for
* this BAR using any available range. The firmware felt
* it was important enough to assign a resource, so don't
* disable decoding if we can help it.
*/
resource_list_delete(rl, type, reg);
resource_list_add(rl, type, reg, 0, ~0, count);
res = resource_list_reserve(rl, bus, dev, type, &reg, 0, ~0,
count, flags);
}
if (res == NULL) {
/*
* If the allocation fails, delete the resource list entry
* and disable decoding for this device.
*
* If the driver requests this resource in the future,
* pci_reserve_map() will try to allocate a fresh
* resource range.
*/
resource_list_delete(rl, type, reg);
pci_disable_io(dev, type);
if (bootverbose)
device_printf(bus,
"pci%d:%d:%d:%d bar %#x failed to allocate\n",
pci_get_domain(dev), pci_get_bus(dev),
pci_get_slot(dev), pci_get_function(dev), reg);
} else {
start = rman_get_start(res);
pci_write_bar(dev, pm, start);
}
return (barlen);
}
/*
* For ATA devices we need to decide early what addressing mode to use.
* Legacy mode demands that the primary and secondary ATA ports sit at the
* same addresses that old ISA hardware did. This dictates that we use
* those addresses and ignore the BARs if we cannot set PCI native
* addressing mode.
*/
static void
pci_ata_maps(device_t bus, device_t dev, struct resource_list *rl, int force,
uint32_t prefetchmask)
{
int rid, type, progif;
#if 0
/* if this device supports PCI native addressing use it */
progif = pci_read_config(dev, PCIR_PROGIF, 1);
if ((progif & 0x8a) == 0x8a) {
if (pci_mapbase(pci_read_config(dev, PCIR_BAR(0), 4)) &&
pci_mapbase(pci_read_config(dev, PCIR_BAR(2), 4))) {
printf("Trying ATA native PCI addressing mode\n");
pci_write_config(dev, PCIR_PROGIF, progif | 0x05, 1);
}
}
#endif
progif = pci_read_config(dev, PCIR_PROGIF, 1);
type = SYS_RES_IOPORT;
if (progif & PCIP_STORAGE_IDE_MODEPRIM) {
pci_add_map(bus, dev, PCIR_BAR(0), rl, force,
prefetchmask & (1 << 0));
pci_add_map(bus, dev, PCIR_BAR(1), rl, force,
prefetchmask & (1 << 1));
} else {
rid = PCIR_BAR(0);
resource_list_add(rl, type, rid, 0x1f0, 0x1f7, 8);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x1f0,
0x1f7, 8, 0);
rid = PCIR_BAR(1);
resource_list_add(rl, type, rid, 0x3f6, 0x3f6, 1);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x3f6,
0x3f6, 1, 0);
}
if (progif & PCIP_STORAGE_IDE_MODESEC) {
pci_add_map(bus, dev, PCIR_BAR(2), rl, force,
prefetchmask & (1 << 2));
pci_add_map(bus, dev, PCIR_BAR(3), rl, force,
prefetchmask & (1 << 3));
} else {
rid = PCIR_BAR(2);
resource_list_add(rl, type, rid, 0x170, 0x177, 8);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x170,
0x177, 8, 0);
rid = PCIR_BAR(3);
resource_list_add(rl, type, rid, 0x376, 0x376, 1);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x376,
0x376, 1, 0);
}
pci_add_map(bus, dev, PCIR_BAR(4), rl, force,
prefetchmask & (1 << 4));
pci_add_map(bus, dev, PCIR_BAR(5), rl, force,
prefetchmask & (1 << 5));
}
static void
pci_assign_interrupt(device_t bus, device_t dev, int force_route)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
char tunable_name[64];
int irq;
/* Has to have an intpin to have an interrupt. */
if (cfg->intpin == 0)
return;
/* Let the user override the IRQ with a tunable. */
irq = PCI_INVALID_IRQ;
snprintf(tunable_name, sizeof(tunable_name),
"hw.pci%d.%d.%d.INT%c.irq",
cfg->domain, cfg->bus, cfg->slot, cfg->intpin + 'A' - 1);
if (TUNABLE_INT_FETCH(tunable_name, &irq) && (irq >= 255 || irq <= 0))
irq = PCI_INVALID_IRQ;
/*
* If we didn't get an IRQ via the tunable, then we either use the
* IRQ value in the intline register or we ask the bus to route an
* interrupt for us. If force_route is true, then we only use the
* value in the intline register if the bus was unable to assign an
* IRQ.
*/
if (!PCI_INTERRUPT_VALID(irq)) {
if (!PCI_INTERRUPT_VALID(cfg->intline) || force_route)
irq = PCI_ASSIGN_INTERRUPT(bus, dev);
if (!PCI_INTERRUPT_VALID(irq))
irq = cfg->intline;
}
/* If after all that we don't have an IRQ, just bail. */
if (!PCI_INTERRUPT_VALID(irq))
return;
/* Update the config register if it changed. */
if (irq != cfg->intline) {
cfg->intline = irq;
pci_write_config(dev, PCIR_INTLINE, irq, 1);
}
/* Add this IRQ as rid 0 interrupt resource. */
resource_list_add(&dinfo->resources, SYS_RES_IRQ, 0, irq, irq, 1);
}
/* Perform early OHCI takeover from SMM. */
static void
ohci_early_takeover(device_t self)
{
struct resource *res;
uint32_t ctl;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
ctl = bus_read_4(res, OHCI_CONTROL);
if (ctl & OHCI_IR) {
if (bootverbose)
printf("ohci early: "
"SMM active, request owner change\n");
bus_write_4(res, OHCI_COMMAND_STATUS, OHCI_OCR);
for (i = 0; (i < 100) && (ctl & OHCI_IR); i++) {
DELAY(1000);
ctl = bus_read_4(res, OHCI_CONTROL);
}
if (ctl & OHCI_IR) {
if (bootverbose)
printf("ohci early: "
"SMM does not respond, resetting\n");
bus_write_4(res, OHCI_CONTROL, OHCI_HCFS_RESET);
}
/* Disable interrupts */
bus_write_4(res, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
/* Perform early UHCI takeover from SMM. */
static void
uhci_early_takeover(device_t self)
{
struct resource *res;
int rid;
/*
* Set the PIRQD enable bit and switch off all the others. We don't
* want legacy support to interfere with us. XXX: Does this also mean
* that the BIOS won't touch the keyboard anymore if it is connected
* to the ports of the root hub?
*/
pci_write_config(self, PCI_LEGSUP, PCI_LEGSUP_USBPIRQDEN, 2);
/* Disable interrupts */
rid = PCI_UHCI_BASE_REG;
res = bus_alloc_resource_any(self, SYS_RES_IOPORT, &rid, RF_ACTIVE);
if (res != NULL) {
bus_write_2(res, UHCI_INTR, 0);
bus_release_resource(self, SYS_RES_IOPORT, rid, res);
}
}
/* Perform early EHCI takeover from SMM. */
static void
ehci_early_takeover(device_t self)
{
struct resource *res;
uint32_t cparams;
uint32_t eec;
uint8_t eecp;
uint8_t bios_sem;
uint8_t offs;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
cparams = bus_read_4(res, EHCI_HCCPARAMS);
/* Synchronise with the BIOS if it owns the controller. */
for (eecp = EHCI_HCC_EECP(cparams); eecp != 0;
eecp = EHCI_EECP_NEXT(eec)) {
eec = pci_read_config(self, eecp, 4);
if (EHCI_EECP_ID(eec) != EHCI_EC_LEGSUP) {
continue;
}
bios_sem = pci_read_config(self, eecp +
EHCI_LEGSUP_BIOS_SEM, 1);
if (bios_sem == 0) {
continue;
}
if (bootverbose)
printf("ehci early: "
"SMM active, request owner change\n");
pci_write_config(self, eecp + EHCI_LEGSUP_OS_SEM, 1, 1);
for (i = 0; (i < 100) && (bios_sem != 0); i++) {
DELAY(1000);
bios_sem = pci_read_config(self, eecp +
EHCI_LEGSUP_BIOS_SEM, 1);
}
if (bios_sem != 0) {
if (bootverbose)
printf("ehci early: "
"SMM does not respond\n");
}
/* Disable interrupts */
offs = EHCI_CAPLENGTH(bus_read_4(res, EHCI_CAPLEN_HCIVERSION));
bus_write_4(res, offs + EHCI_USBINTR, 0);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
/* Perform early XHCI takeover from SMM. */
static void
xhci_early_takeover(device_t self)
{
struct resource *res;
uint32_t cparams;
uint32_t eec;
uint8_t eecp;
uint8_t bios_sem;
uint8_t offs;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
cparams = bus_read_4(res, XHCI_HCSPARAMS0);
eec = -1;
/* Synchronise with the BIOS if it owns the controller. */
for (eecp = XHCI_HCS0_XECP(cparams) << 2; eecp != 0 && XHCI_XECP_NEXT(eec);
eecp += XHCI_XECP_NEXT(eec) << 2) {
eec = bus_read_4(res, eecp);
if (XHCI_XECP_ID(eec) != XHCI_ID_USB_LEGACY)
continue;
bios_sem = bus_read_1(res, eecp + XHCI_XECP_BIOS_SEM);
if (bios_sem == 0)
continue;
if (bootverbose)
printf("xhci early: "
"SMM active, request owner change\n");
bus_write_1(res, eecp + XHCI_XECP_OS_SEM, 1);
/* Wait a maximum of 5 seconds. */
for (i = 0; (i < 5000) && (bios_sem != 0); i++) {
DELAY(1000);
bios_sem = bus_read_1(res, eecp +
XHCI_XECP_BIOS_SEM);
}
if (bios_sem != 0) {
if (bootverbose)
printf("xhci early: "
"SMM does not respond\n");
}
/* Disable interrupts */
offs = bus_read_1(res, XHCI_CAPLENGTH);
bus_write_4(res, offs + XHCI_USBCMD, 0);
bus_read_4(res, offs + XHCI_USBSTS);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
static void
pci_reserve_secbus(device_t bus, device_t dev, pcicfgregs *cfg,
struct resource_list *rl)
{
struct resource *res;
char *cp;
rman_res_t start, end, count;
int rid, sec_bus, sec_reg, sub_bus, sub_reg, sup_bus;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
sub_reg = PCIR_SUBBUS_2;
break;
default:
return;
}
/*
* If the existing bus range is valid, attempt to reserve it
* from our parent. If this fails for any reason, clear the
* secbus and subbus registers.
*
* XXX: Should we reset sub_bus to sec_bus if it is < sec_bus?
* This would at least preserve the existing sec_bus if it is
* valid.
*/
sec_bus = PCI_READ_CONFIG(bus, dev, sec_reg, 1);
sub_bus = PCI_READ_CONFIG(bus, dev, sub_reg, 1);
/* Quirk handling. */
switch (pci_get_devid(dev)) {
case 0x12258086: /* Intel 82454KX/GX (Orion) */
sup_bus = pci_read_config(dev, 0x41, 1);
if (sup_bus != 0xff) {
sec_bus = sup_bus + 1;
sub_bus = sup_bus + 1;
PCI_WRITE_CONFIG(bus, dev, sec_reg, sec_bus, 1);
PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1);
}
break;
case 0x00dd10de:
/* Compaq R3000 BIOS sets wrong subordinate bus number. */
if ((cp = kern_getenv("smbios.planar.maker")) == NULL)
break;
if (strncmp(cp, "Compal", 6) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if ((cp = kern_getenv("smbios.planar.product")) == NULL)
break;
if (strncmp(cp, "08A0", 4) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if (sub_bus < 0xa) {
sub_bus = 0xa;
PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1);
}
break;
}
if (bootverbose)
printf("\tsecbus=%d, subbus=%d\n", sec_bus, sub_bus);
if (sec_bus > 0 && sub_bus >= sec_bus) {
start = sec_bus;
end = sub_bus;
count = end - start + 1;
resource_list_add(rl, PCI_RES_BUS, 0, 0, ~0, count);
/*
* If requested, clear secondary bus registers in
* bridge devices to force a complete renumbering
* rather than reserving the existing range. However,
* preserve the existing size.
*/
if (pci_clear_buses)
goto clear;
rid = 0;
res = resource_list_reserve(rl, bus, dev, PCI_RES_BUS, &rid,
start, end, count, 0);
if (res != NULL)
return;
if (bootverbose)
device_printf(bus,
"pci%d:%d:%d:%d secbus failed to allocate\n",
pci_get_domain(dev), pci_get_bus(dev),
pci_get_slot(dev), pci_get_function(dev));
}
clear:
PCI_WRITE_CONFIG(bus, dev, sec_reg, 0, 1);
PCI_WRITE_CONFIG(bus, dev, sub_reg, 0, 1);
}
static struct resource *
pci_alloc_secbus(device_t dev, device_t child, int *rid, rman_res_t start,
rman_res_t end, rman_res_t count, u_int flags)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
struct resource_list *rl;
struct resource *res;
int sec_reg, sub_reg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
rl = &dinfo->resources;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
sub_reg = PCIR_SUBBUS_2;
break;
default:
return (NULL);
}
if (*rid != 0)
return (NULL);
if (resource_list_find(rl, PCI_RES_BUS, *rid) == NULL)
resource_list_add(rl, PCI_RES_BUS, *rid, start, end, count);
if (!resource_list_reserved(rl, PCI_RES_BUS, *rid)) {
res = resource_list_reserve(rl, dev, child, PCI_RES_BUS, rid,
start, end, count, flags & ~RF_ACTIVE);
if (res == NULL) {
resource_list_delete(rl, PCI_RES_BUS, *rid);
device_printf(child, "allocating %ju bus%s failed\n",
count, count == 1 ? "" : "es");
return (NULL);
}
if (bootverbose)
device_printf(child,
"Lazy allocation of %ju bus%s at %ju\n", count,
count == 1 ? "" : "es", rman_get_start(res));
PCI_WRITE_CONFIG(dev, child, sec_reg, rman_get_start(res), 1);
PCI_WRITE_CONFIG(dev, child, sub_reg, rman_get_end(res), 1);
}
return (resource_list_alloc(rl, dev, child, PCI_RES_BUS, rid, start,
end, count, flags));
}
#endif
static int
pci_ea_bei_to_rid(device_t dev, int bei)
{
#ifdef PCI_IOV
struct pci_devinfo *dinfo;
int iov_pos;
struct pcicfg_iov *iov;
dinfo = device_get_ivars(dev);
iov = dinfo->cfg.iov;
if (iov != NULL)
iov_pos = iov->iov_pos;
else
iov_pos = 0;
#endif
/* Check if matches BAR */
if ((bei >= PCIM_EA_BEI_BAR_0) &&
(bei <= PCIM_EA_BEI_BAR_5))
return (PCIR_BAR(bei));
/* Check ROM */
if (bei == PCIM_EA_BEI_ROM)
return (PCIR_BIOS);
#ifdef PCI_IOV
/* Check if matches VF_BAR */
if ((iov != NULL) && (bei >= PCIM_EA_BEI_VF_BAR_0) &&
(bei <= PCIM_EA_BEI_VF_BAR_5))
return (PCIR_SRIOV_BAR(bei - PCIM_EA_BEI_VF_BAR_0) +
iov_pos);
#endif
return (-1);
}
int
pci_ea_is_enabled(device_t dev, int rid)
{
struct pci_ea_entry *ea;
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) {
if (pci_ea_bei_to_rid(dev, ea->eae_bei) == rid)
return ((ea->eae_flags & PCIM_EA_ENABLE) > 0);
}
return (0);
}
void
pci_add_resources_ea(device_t bus, device_t dev, int alloc_iov)
{
struct pci_ea_entry *ea;
struct pci_devinfo *dinfo;
pci_addr_t start, end, count;
struct resource_list *rl;
int type, flags, rid;
struct resource *res;
uint32_t tmp;
#ifdef PCI_IOV
struct pcicfg_iov *iov;
#endif
dinfo = device_get_ivars(dev);
rl = &dinfo->resources;
flags = 0;
#ifdef PCI_IOV
iov = dinfo->cfg.iov;
#endif
if (dinfo->cfg.ea.ea_location == 0)
return;
STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) {
/*
* TODO: Ignore the EA-BAR if it is not enabled.
* Currently the EA implementation only supports the
* situation where the EA structure contains
* predefined entries. If they are not enabled,
* leave them unallocated and proceed with the
* legacy-BAR mechanism.
*/
if ((ea->eae_flags & PCIM_EA_ENABLE) == 0)
continue;
switch ((ea->eae_flags & PCIM_EA_PP) >> PCIM_EA_PP_OFFSET) {
case PCIM_EA_P_MEM_PREFETCH:
case PCIM_EA_P_VF_MEM_PREFETCH:
flags = RF_PREFETCHABLE;
/* FALLTHROUGH */
case PCIM_EA_P_VF_MEM:
case PCIM_EA_P_MEM:
type = SYS_RES_MEMORY;
break;
case PCIM_EA_P_IO:
type = SYS_RES_IOPORT;
break;
default:
continue;
}
if (alloc_iov != 0) {
#ifdef PCI_IOV
/* Allocating IOV, confirm BEI matches */
if ((ea->eae_bei < PCIM_EA_BEI_VF_BAR_0) ||
(ea->eae_bei > PCIM_EA_BEI_VF_BAR_5))
continue;
#else
continue;
#endif
} else {
/* Allocating BAR, confirm BEI matches */
if (((ea->eae_bei < PCIM_EA_BEI_BAR_0) ||
(ea->eae_bei > PCIM_EA_BEI_BAR_5)) &&
(ea->eae_bei != PCIM_EA_BEI_ROM))
continue;
}
rid = pci_ea_bei_to_rid(dev, ea->eae_bei);
if (rid < 0)
continue;
/* Skip resources already allocated by EA */
if ((resource_list_find(rl, SYS_RES_MEMORY, rid) != NULL) ||
(resource_list_find(rl, SYS_RES_IOPORT, rid) != NULL))
continue;
start = ea->eae_base;
count = ea->eae_max_offset + 1;
#ifdef PCI_IOV
if (iov != NULL)
count = count * iov->iov_num_vfs;
#endif
end = start + count - 1;
if (count == 0)
continue;
resource_list_add(rl, type, rid, start, end, count);
res = resource_list_reserve(rl, bus, dev, type, &rid, start, end, count,
flags);
if (res == NULL) {
resource_list_delete(rl, type, rid);
/*
* Failed to allocate using EA, so disable the entry.
* Another allocation attempt will be made later,
* but this time using the legacy BAR registers.
*/
tmp = pci_read_config(dev, ea->eae_cfg_offset, 4);
tmp &= ~PCIM_EA_ENABLE;
pci_write_config(dev, ea->eae_cfg_offset, tmp, 4);
/*
* Disabling the entry might fail if it is hardwired.
* Re-read the flags to reflect the current status.
*/
ea->eae_flags = pci_read_config(dev, ea->eae_cfg_offset, 4);
continue;
}
/* As per specification, fill BAR with zeros */
pci_write_config(dev, rid, 0, 4);
}
}
void
pci_add_resources(device_t bus, device_t dev, int force, uint32_t prefetchmask)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
struct resource_list *rl;
const struct pci_quirk *q;
uint32_t devid;
int i;
dinfo = device_get_ivars(dev);
cfg = &dinfo->cfg;
rl = &dinfo->resources;
devid = (cfg->device << 16) | cfg->vendor;
/* Allocate resources using Enhanced Allocation */
pci_add_resources_ea(bus, dev, 0);
/* ATA devices need special map treatment */
if ((pci_get_class(dev) == PCIC_STORAGE) &&
(pci_get_subclass(dev) == PCIS_STORAGE_IDE) &&
((pci_get_progif(dev) & PCIP_STORAGE_IDE_MASTERDEV) ||
(!pci_read_config(dev, PCIR_BAR(0), 4) &&
!pci_read_config(dev, PCIR_BAR(2), 4))) )
pci_ata_maps(bus, dev, rl, force, prefetchmask);
else
for (i = 0; i < cfg->nummaps;) {
/* Skip resources already managed by EA */
if ((resource_list_find(rl, SYS_RES_MEMORY, PCIR_BAR(i)) != NULL) ||
(resource_list_find(rl, SYS_RES_IOPORT, PCIR_BAR(i)) != NULL) ||
pci_ea_is_enabled(dev, PCIR_BAR(i))) {
i++;
continue;
}
/*
* Skip quirked resources.
*/
for (q = &pci_quirks[0]; q->devid != 0; q++)
if (q->devid == devid &&
q->type == PCI_QUIRK_UNMAP_REG &&
q->arg1 == PCIR_BAR(i))
break;
if (q->devid != 0) {
i++;
continue;
}
i += pci_add_map(bus, dev, PCIR_BAR(i), rl, force,
prefetchmask & (1 << i));
}
/*
* Add additional, quirked resources.
*/
for (q = &pci_quirks[0]; q->devid != 0; q++)
if (q->devid == devid && q->type == PCI_QUIRK_MAP_REG)
pci_add_map(bus, dev, q->arg1, rl, force, 0);
if (cfg->intpin > 0 && PCI_INTERRUPT_VALID(cfg->intline)) {
#ifdef __PCI_REROUTE_INTERRUPT
/*
* Try to re-route interrupts. Sometimes the BIOS or
* firmware may leave bogus values in these registers.
* If the re-route fails, then just stick with what we
* have.
*/
pci_assign_interrupt(bus, dev, 1);
#else
pci_assign_interrupt(bus, dev, 0);
#endif
}
if (pci_usb_takeover && pci_get_class(dev) == PCIC_SERIALBUS &&
pci_get_subclass(dev) == PCIS_SERIALBUS_USB) {
if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_XHCI)
xhci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_EHCI)
ehci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_OHCI)
ohci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_UHCI)
uhci_early_takeover(dev);
}
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
/*
* Reserve resources for secondary bus ranges behind bridge
* devices.
*/
pci_reserve_secbus(bus, dev, cfg, rl);
#endif
}
static struct pci_devinfo *
pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
int slot, int func)
{
struct pci_devinfo *dinfo;
dinfo = pci_read_device(pcib, dev, domain, busno, slot, func);
if (dinfo != NULL)
pci_add_child(dev, dinfo);
return (dinfo);
}
void
pci_add_children(device_t dev, int domain, int busno)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
device_t pcib = device_get_parent(dev);
struct pci_devinfo *dinfo;
int maxslots;
int s, f, pcifunchigh;
uint8_t hdrtype;
int first_func;
/*
* Try to detect a device at slot 0, function 0. If it exists, try to
* enable ARI. We must enable ARI before detecting the rest of the
* functions on this bus as ARI changes the set of slots and functions
* that are legal on this bus.
*/
dinfo = pci_identify_function(pcib, dev, domain, busno, 0, 0);
if (dinfo != NULL && pci_enable_ari)
PCIB_TRY_ENABLE_ARI(pcib, dinfo->cfg.dev);
/*
* Start looking for new devices on slot 0 at function 1 because we
* just identified the device at slot 0, function 0.
*/
first_func = 1;
maxslots = PCIB_MAXSLOTS(pcib);
for (s = 0; s <= maxslots; s++, first_func = 0) {
pcifunchigh = 0;
f = 0;
DELAY(1);
hdrtype = REG(PCIR_HDRTYPE, 1);
if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
continue;
if (hdrtype & PCIM_MFDEV)
pcifunchigh = PCIB_MAXFUNCS(pcib);
for (f = first_func; f <= pcifunchigh; f++)
pci_identify_function(pcib, dev, domain, busno, s, f);
}
#undef REG
}
int
pci_rescan_method(device_t dev)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
device_t pcib = device_get_parent(dev);
- struct pci_softc *sc;
device_t child, *devlist, *unchanged;
int devcount, error, i, j, maxslots, oldcount;
int busno, domain, s, f, pcifunchigh;
uint8_t hdrtype;
/* No need to check for ARI on a rescan. */
error = device_get_children(dev, &devlist, &devcount);
if (error)
return (error);
if (devcount != 0) {
unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
M_NOWAIT | M_ZERO);
if (unchanged == NULL) {
free(devlist, M_TEMP);
return (ENOMEM);
}
} else
unchanged = NULL;
- sc = device_get_softc(dev);
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
maxslots = PCIB_MAXSLOTS(pcib);
for (s = 0; s <= maxslots; s++) {
/* If function 0 is not present, skip to the next slot. */
f = 0;
if (REG(PCIR_VENDOR, 2) == 0xffff)
continue;
pcifunchigh = 0;
hdrtype = REG(PCIR_HDRTYPE, 1);
if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
continue;
if (hdrtype & PCIM_MFDEV)
pcifunchigh = PCIB_MAXFUNCS(pcib);
for (f = 0; f <= pcifunchigh; f++) {
if (REG(PCIR_VENDOR, 2) == 0xffff)
continue;
/*
* Found a valid function. Check if a
* device_t for this device already exists.
*/
for (i = 0; i < devcount; i++) {
child = devlist[i];
if (child == NULL)
continue;
if (pci_get_slot(child) == s &&
pci_get_function(child) == f) {
unchanged[i] = child;
goto next_func;
}
}
pci_identify_function(pcib, dev, domain, busno, s, f);
next_func:;
}
}
/* Remove devices that are no longer present. */
for (i = 0; i < devcount; i++) {
if (unchanged[i] != NULL)
continue;
device_delete_child(dev, devlist[i]);
}
free(devlist, M_TEMP);
oldcount = devcount;
/* Try to attach the devices just added. */
error = device_get_children(dev, &devlist, &devcount);
if (error) {
free(unchanged, M_TEMP);
return (error);
}
for (i = 0; i < devcount; i++) {
for (j = 0; j < oldcount; j++) {
if (devlist[i] == unchanged[j])
goto next_device;
}
device_probe_and_attach(devlist[i]);
next_device:;
}
free(unchanged, M_TEMP);
free(devlist, M_TEMP);
return (0);
#undef REG
}
#ifdef PCI_IOV
device_t
pci_add_iov_child(device_t bus, device_t pf, uint16_t rid, uint16_t vid,
uint16_t did)
{
- struct pci_devinfo *pf_dinfo, *vf_dinfo;
+ struct pci_devinfo *vf_dinfo;
device_t pcib;
int busno, slot, func;
-
- pf_dinfo = device_get_ivars(pf);
pcib = device_get_parent(bus);
PCIB_DECODE_RID(pcib, rid, &busno, &slot, &func);
vf_dinfo = pci_fill_devinfo(pcib, bus, pci_get_domain(pcib), busno,
slot, func, vid, did);
vf_dinfo->cfg.flags |= PCICFG_VF;
pci_add_child(bus, vf_dinfo);
return (vf_dinfo->cfg.dev);
}
device_t
pci_create_iov_child_method(device_t bus, device_t pf, uint16_t rid,
uint16_t vid, uint16_t did)
{
return (pci_add_iov_child(bus, pf, rid, vid, did));
}
#endif
void
pci_add_child(device_t bus, struct pci_devinfo *dinfo)
{
dinfo->cfg.dev = device_add_child(bus, NULL, -1);
device_set_ivars(dinfo->cfg.dev, dinfo);
resource_list_init(&dinfo->resources);
pci_cfg_save(dinfo->cfg.dev, dinfo, 0);
pci_cfg_restore(dinfo->cfg.dev, dinfo);
pci_print_verbose(dinfo);
pci_add_resources(bus, dinfo->cfg.dev, 0, 0);
pci_child_added(dinfo->cfg.dev);
EVENTHANDLER_INVOKE(pci_add_device, dinfo->cfg.dev);
}
void
pci_child_added_method(device_t dev, device_t child)
{
}
static int
pci_probe(device_t dev)
{
device_set_desc(dev, "PCI bus");
/* Allow other subclasses to override this driver. */
return (BUS_PROBE_GENERIC);
}
int
pci_attach_common(device_t dev)
{
struct pci_softc *sc;
int busno, domain;
#ifdef PCI_DMA_BOUNDARY
int error, tag_valid;
#endif
#ifdef PCI_RES_BUS
int rid;
#endif
sc = device_get_softc(dev);
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
#ifdef PCI_RES_BUS
rid = 0;
sc->sc_bus = bus_alloc_resource(dev, PCI_RES_BUS, &rid, busno, busno,
1, 0);
if (sc->sc_bus == NULL) {
device_printf(dev, "failed to allocate bus number\n");
return (ENXIO);
}
#endif
if (bootverbose)
device_printf(dev, "domain=%d, physical bus=%d\n",
domain, busno);
#ifdef PCI_DMA_BOUNDARY
tag_valid = 0;
if (device_get_devclass(device_get_parent(device_get_parent(dev))) !=
devclass_find("pci")) {
error = bus_dma_tag_create(bus_get_dma_tag(dev), 1,
PCI_DMA_BOUNDARY, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED,
BUS_SPACE_MAXSIZE, 0, NULL, NULL, &sc->sc_dma_tag);
if (error)
device_printf(dev, "Failed to create DMA tag: %d\n",
error);
else
tag_valid = 1;
}
if (!tag_valid)
#endif
sc->sc_dma_tag = bus_get_dma_tag(dev);
return (0);
}
static int
pci_attach(device_t dev)
{
int busno, domain, error;
error = pci_attach_common(dev);
if (error)
return (error);
/*
* Since there can be multiple independently numbered PCI
* buses on systems with multiple PCI domains, we can't use
* the unit number to decide which bus we are probing. We ask
* the parent pcib what our domain and bus numbers are.
*/
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
pci_add_children(dev, domain, busno);
return (bus_generic_attach(dev));
}
static int
pci_detach(device_t dev)
{
#ifdef PCI_RES_BUS
struct pci_softc *sc;
#endif
int error;
error = bus_generic_detach(dev);
if (error)
return (error);
#ifdef PCI_RES_BUS
sc = device_get_softc(dev);
error = bus_release_resource(dev, PCI_RES_BUS, 0, sc->sc_bus);
if (error)
return (error);
#endif
return (device_delete_children(dev));
}
static void
pci_hint_device_unit(device_t dev, device_t child, const char *name, int *unitp)
{
int line, unit;
const char *at;
char me1[24], me2[32];
uint8_t b, s, f;
uint32_t d;
d = pci_get_domain(child);
b = pci_get_bus(child);
s = pci_get_slot(child);
f = pci_get_function(child);
snprintf(me1, sizeof(me1), "pci%u:%u:%u", b, s, f);
snprintf(me2, sizeof(me2), "pci%u:%u:%u:%u", d, b, s, f);
line = 0;
while (resource_find_dev(&line, name, &unit, "at", NULL) == 0) {
resource_string_value(name, unit, "at", &at);
if (strcmp(at, me1) != 0 && strcmp(at, me2) != 0)
continue; /* No match, try next candidate */
*unitp = unit;
return;
}
}
static void
pci_set_power_child(device_t dev, device_t child, int state)
{
device_t pcib;
int dstate;
/*
* Set the device to the given state. If the firmware suggests
* a different power state, use it instead. If power management
* is not present, the firmware is responsible for managing
* device power. Skip children who aren't attached since they
* are handled separately.
*/
pcib = device_get_parent(dev);
dstate = state;
if (device_is_attached(child) &&
PCIB_POWER_FOR_SLEEP(pcib, child, &dstate) == 0)
pci_set_powerstate(child, dstate);
}
int
pci_suspend_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
int error;
dinfo = device_get_ivars(child);
/*
* Save the PCI configuration space for the child and set the
* device in the appropriate power state for this sleep state.
*/
pci_cfg_save(child, dinfo, 0);
/* Suspend devices before potentially powering them down. */
error = bus_generic_suspend_child(dev, child);
if (error)
return (error);
if (pci_do_power_suspend)
pci_set_power_child(dev, child, PCI_POWERSTATE_D3);
return (0);
}
int
pci_resume_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
if (pci_do_power_resume)
pci_set_power_child(dev, child, PCI_POWERSTATE_D0);
dinfo = device_get_ivars(child);
pci_cfg_restore(child, dinfo);
if (!device_is_attached(child))
pci_cfg_save(child, dinfo, 1);
bus_generic_resume_child(dev, child);
return (0);
}
int
pci_resume(device_t dev)
{
device_t child, *devlist;
int error, i, numdevs;
if ((error = device_get_children(dev, &devlist, &numdevs)) != 0)
return (error);
/*
* Resume critical devices first, then everything else later.
*/
for (i = 0; i < numdevs; i++) {
child = devlist[i];
switch (pci_get_class(child)) {
case PCIC_DISPLAY:
case PCIC_MEMORY:
case PCIC_BRIDGE:
case PCIC_BASEPERIPH:
BUS_RESUME_CHILD(dev, child);
break;
}
}
for (i = 0; i < numdevs; i++) {
child = devlist[i];
switch (pci_get_class(child)) {
case PCIC_DISPLAY:
case PCIC_MEMORY:
case PCIC_BRIDGE:
case PCIC_BASEPERIPH:
break;
default:
BUS_RESUME_CHILD(dev, child);
}
}
free(devlist, M_TEMP);
return (0);
}
static void
pci_load_vendor_data(void)
{
caddr_t data;
void *ptr;
size_t sz;
data = preload_search_by_type("pci_vendor_data");
if (data != NULL) {
ptr = preload_fetch_addr(data);
sz = preload_fetch_size(data);
if (ptr != NULL && sz != 0) {
pci_vendordata = ptr;
pci_vendordata_size = sz;
/* terminate the database */
pci_vendordata[pci_vendordata_size] = '\n';
}
}
}
void
pci_driver_added(device_t dev, driver_t *driver)
{
int numdevs;
device_t *devlist;
device_t child;
struct pci_devinfo *dinfo;
int i;
if (bootverbose)
device_printf(dev, "driver added\n");
DEVICE_IDENTIFY(driver, dev);
if (device_get_children(dev, &devlist, &numdevs) != 0)
return;
for (i = 0; i < numdevs; i++) {
child = devlist[i];
if (device_get_state(child) != DS_NOTPRESENT)
continue;
dinfo = device_get_ivars(child);
pci_print_verbose(dinfo);
if (bootverbose)
pci_printf(&dinfo->cfg, "reprobing on driver added\n");
pci_cfg_restore(child, dinfo);
if (device_probe_and_attach(child) != 0)
pci_child_detached(dev, child);
}
free(devlist, M_TEMP);
}
int
pci_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
{
struct pci_devinfo *dinfo;
struct msix_table_entry *mte;
struct msix_vector *mv;
uint64_t addr;
uint32_t data;
void *cookie;
int error, rid;
error = bus_generic_setup_intr(dev, child, irq, flags, filter, intr,
arg, &cookie);
if (error)
return (error);
/* If this is not a direct child, just bail out. */
if (device_get_parent(child) != dev) {
*cookiep = cookie;
return(0);
}
rid = rman_get_rid(irq);
if (rid == 0) {
/* Make sure that INTx is enabled */
pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS);
} else {
/*
* Check to see if the interrupt is MSI or MSI-X.
* Ask our parent to map the MSI and give
* us the address and data register values.
* If we fail for some reason, tear down the
* interrupt handler.
*/
dinfo = device_get_ivars(child);
if (dinfo->cfg.msi.msi_alloc > 0) {
if (dinfo->cfg.msi.msi_addr == 0) {
KASSERT(dinfo->cfg.msi.msi_handlers == 0,
("MSI has handlers, but vectors not mapped"));
error = PCIB_MAP_MSI(device_get_parent(dev),
child, rman_get_start(irq), &addr, &data);
if (error)
goto bad;
dinfo->cfg.msi.msi_addr = addr;
dinfo->cfg.msi.msi_data = data;
}
if (dinfo->cfg.msi.msi_handlers == 0)
pci_enable_msi(child, dinfo->cfg.msi.msi_addr,
dinfo->cfg.msi.msi_data);
dinfo->cfg.msi.msi_handlers++;
} else {
KASSERT(dinfo->cfg.msix.msix_alloc > 0,
("No MSI or MSI-X interrupts allocated"));
KASSERT(rid <= dinfo->cfg.msix.msix_table_len,
("MSI-X index too high"));
mte = &dinfo->cfg.msix.msix_table[rid - 1];
KASSERT(mte->mte_vector != 0, ("no message vector"));
mv = &dinfo->cfg.msix.msix_vectors[mte->mte_vector - 1];
KASSERT(mv->mv_irq == rman_get_start(irq),
("IRQ mismatch"));
if (mv->mv_address == 0) {
KASSERT(mte->mte_handlers == 0,
("MSI-X table entry has handlers, but vector not mapped"));
error = PCIB_MAP_MSI(device_get_parent(dev),
child, rman_get_start(irq), &addr, &data);
if (error)
goto bad;
mv->mv_address = addr;
mv->mv_data = data;
}
/*
* The MSI-X table entry must be made valid by
* incrementing the mte_handlers before calling
* pci_enable_msix() and pci_resume_msix();
* otherwise the MSI-X rewrite table quirk will
* not work as expected.
*/
mte->mte_handlers++;
if (mte->mte_handlers == 1) {
pci_enable_msix(child, rid - 1, mv->mv_address,
mv->mv_data);
pci_unmask_msix(child, rid - 1);
}
}
/*
* Make sure that INTx is disabled if we are using MSI/MSI-X,
* unless the device is affected by PCI_QUIRK_MSI_INTX_BUG,
* in which case we "enable" INTx so MSI/MSI-X actually works.
*/
if (!pci_has_quirk(pci_get_devid(child),
PCI_QUIRK_MSI_INTX_BUG))
pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS);
else
pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS);
bad:
if (error) {
(void)bus_generic_teardown_intr(dev, child, irq,
cookie);
return (error);
}
}
*cookiep = cookie;
return (0);
}
int
pci_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
struct msix_table_entry *mte;
struct resource_list_entry *rle;
struct pci_devinfo *dinfo;
int error, rid;
if (irq == NULL || !(rman_get_flags(irq) & RF_ACTIVE))
return (EINVAL);
/* If this isn't a direct child, just bail out */
if (device_get_parent(child) != dev)
return(bus_generic_teardown_intr(dev, child, irq, cookie));
rid = rman_get_rid(irq);
if (rid == 0) {
/* Mask INTx */
pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS);
} else {
/*
* Check to see if the interrupt is MSI or MSI-X. If so,
* decrement the appropriate handlers count and mask the
* MSI-X message, or disable MSI messages if the count
* drops to 0.
*/
dinfo = device_get_ivars(child);
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, rid);
if (rle->res != irq)
return (EINVAL);
if (dinfo->cfg.msi.msi_alloc > 0) {
KASSERT(rid <= dinfo->cfg.msi.msi_alloc,
("MSI-X index too high"));
if (dinfo->cfg.msi.msi_handlers == 0)
return (EINVAL);
dinfo->cfg.msi.msi_handlers--;
if (dinfo->cfg.msi.msi_handlers == 0)
pci_disable_msi(child);
} else {
KASSERT(dinfo->cfg.msix.msix_alloc > 0,
("No MSI or MSI-X interrupts allocated"));
KASSERT(rid <= dinfo->cfg.msix.msix_table_len,
("MSI-X index too high"));
mte = &dinfo->cfg.msix.msix_table[rid - 1];
if (mte->mte_handlers == 0)
return (EINVAL);
mte->mte_handlers--;
if (mte->mte_handlers == 0)
pci_mask_msix(child, rid - 1);
}
}
error = bus_generic_teardown_intr(dev, child, irq, cookie);
if (rid > 0)
KASSERT(error == 0,
("%s: generic teardown failed for MSI/MSI-X", __func__));
return (error);
}
int
pci_print_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
int retval = 0;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
retval += bus_print_child_header(dev, child);
retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#jx");
retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
if (device_get_flags(dev))
retval += printf(" flags %#x", device_get_flags(dev));
retval += printf(" at device %d.%d", pci_get_slot(child),
pci_get_function(child));
retval += bus_print_child_domain(dev, child);
retval += bus_print_child_footer(dev, child);
return (retval);
}
static const struct
{
int class;
int subclass;
int report; /* 0 = bootverbose, 1 = always */
const char *desc;
} pci_nomatch_tab[] = {
{PCIC_OLD, -1, 1, "old"},
{PCIC_OLD, PCIS_OLD_NONVGA, 1, "non-VGA display device"},
{PCIC_OLD, PCIS_OLD_VGA, 1, "VGA-compatible display device"},
{PCIC_STORAGE, -1, 1, "mass storage"},
{PCIC_STORAGE, PCIS_STORAGE_SCSI, 1, "SCSI"},
{PCIC_STORAGE, PCIS_STORAGE_IDE, 1, "ATA"},
{PCIC_STORAGE, PCIS_STORAGE_FLOPPY, 1, "floppy disk"},
{PCIC_STORAGE, PCIS_STORAGE_IPI, 1, "IPI"},
{PCIC_STORAGE, PCIS_STORAGE_RAID, 1, "RAID"},
{PCIC_STORAGE, PCIS_STORAGE_ATA_ADMA, 1, "ATA (ADMA)"},
{PCIC_STORAGE, PCIS_STORAGE_SATA, 1, "SATA"},
{PCIC_STORAGE, PCIS_STORAGE_SAS, 1, "SAS"},
{PCIC_STORAGE, PCIS_STORAGE_NVM, 1, "NVM"},
{PCIC_NETWORK, -1, 1, "network"},
{PCIC_NETWORK, PCIS_NETWORK_ETHERNET, 1, "ethernet"},
{PCIC_NETWORK, PCIS_NETWORK_TOKENRING, 1, "token ring"},
{PCIC_NETWORK, PCIS_NETWORK_FDDI, 1, "fddi"},
{PCIC_NETWORK, PCIS_NETWORK_ATM, 1, "ATM"},
{PCIC_NETWORK, PCIS_NETWORK_ISDN, 1, "ISDN"},
{PCIC_DISPLAY, -1, 1, "display"},
{PCIC_DISPLAY, PCIS_DISPLAY_VGA, 1, "VGA"},
{PCIC_DISPLAY, PCIS_DISPLAY_XGA, 1, "XGA"},
{PCIC_DISPLAY, PCIS_DISPLAY_3D, 1, "3D"},
{PCIC_MULTIMEDIA, -1, 1, "multimedia"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_VIDEO, 1, "video"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_AUDIO, 1, "audio"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_TELE, 1, "telephony"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_HDA, 1, "HDA"},
{PCIC_MEMORY, -1, 1, "memory"},
{PCIC_MEMORY, PCIS_MEMORY_RAM, 1, "RAM"},
{PCIC_MEMORY, PCIS_MEMORY_FLASH, 1, "flash"},
{PCIC_BRIDGE, -1, 1, "bridge"},
{PCIC_BRIDGE, PCIS_BRIDGE_HOST, 1, "HOST-PCI"},
{PCIC_BRIDGE, PCIS_BRIDGE_ISA, 1, "PCI-ISA"},
{PCIC_BRIDGE, PCIS_BRIDGE_EISA, 1, "PCI-EISA"},
{PCIC_BRIDGE, PCIS_BRIDGE_MCA, 1, "PCI-MCA"},
{PCIC_BRIDGE, PCIS_BRIDGE_PCI, 1, "PCI-PCI"},
{PCIC_BRIDGE, PCIS_BRIDGE_PCMCIA, 1, "PCI-PCMCIA"},
{PCIC_BRIDGE, PCIS_BRIDGE_NUBUS, 1, "PCI-NuBus"},
{PCIC_BRIDGE, PCIS_BRIDGE_CARDBUS, 1, "PCI-CardBus"},
{PCIC_BRIDGE, PCIS_BRIDGE_RACEWAY, 1, "PCI-RACEway"},
{PCIC_SIMPLECOMM, -1, 1, "simple comms"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_UART, 1, "UART"}, /* could detect 16550 */
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_PAR, 1, "parallel port"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MULSER, 1, "multiport serial"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MODEM, 1, "generic modem"},
{PCIC_BASEPERIPH, -1, 0, "base peripheral"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_PIC, 1, "interrupt controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_DMA, 1, "DMA controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_TIMER, 1, "timer"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_RTC, 1, "realtime clock"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_PCIHOT, 1, "PCI hot-plug controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_SDHC, 1, "SD host controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_IOMMU, 1, "IOMMU"},
{PCIC_INPUTDEV, -1, 1, "input device"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_KEYBOARD, 1, "keyboard"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_DIGITIZER,1, "digitizer"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_MOUSE, 1, "mouse"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_SCANNER, 1, "scanner"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_GAMEPORT, 1, "gameport"},
{PCIC_DOCKING, -1, 1, "docking station"},
{PCIC_PROCESSOR, -1, 1, "processor"},
{PCIC_SERIALBUS, -1, 1, "serial bus"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_FW, 1, "FireWire"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_ACCESS, 1, "AccessBus"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_SSA, 1, "SSA"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_USB, 1, "USB"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_FC, 1, "Fibre Channel"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_SMBUS, 0, "SMBus"},
{PCIC_WIRELESS, -1, 1, "wireless controller"},
{PCIC_WIRELESS, PCIS_WIRELESS_IRDA, 1, "iRDA"},
{PCIC_WIRELESS, PCIS_WIRELESS_IR, 1, "IR"},
{PCIC_WIRELESS, PCIS_WIRELESS_RF, 1, "RF"},
{PCIC_INTELLIIO, -1, 1, "intelligent I/O controller"},
{PCIC_INTELLIIO, PCIS_INTELLIIO_I2O, 1, "I2O"},
{PCIC_SATCOM, -1, 1, "satellite communication"},
{PCIC_SATCOM, PCIS_SATCOM_TV, 1, "sat TV"},
{PCIC_SATCOM, PCIS_SATCOM_AUDIO, 1, "sat audio"},
{PCIC_SATCOM, PCIS_SATCOM_VOICE, 1, "sat voice"},
{PCIC_SATCOM, PCIS_SATCOM_DATA, 1, "sat data"},
{PCIC_CRYPTO, -1, 1, "encrypt/decrypt"},
{PCIC_CRYPTO, PCIS_CRYPTO_NETCOMP, 1, "network/computer crypto"},
{PCIC_CRYPTO, PCIS_CRYPTO_ENTERTAIN, 1, "entertainment crypto"},
{PCIC_DASP, -1, 0, "dasp"},
{PCIC_DASP, PCIS_DASP_DPIO, 1, "DPIO module"},
{PCIC_DASP, PCIS_DASP_PERFCNTRS, 1, "performance counters"},
{PCIC_DASP, PCIS_DASP_COMM_SYNC, 1, "communication synchronizer"},
{PCIC_DASP, PCIS_DASP_MGMT_CARD, 1, "signal processing management"},
{0, 0, 0, NULL}
};
void
pci_probe_nomatch(device_t dev, device_t child)
{
int i, report;
const char *cp, *scp;
char *device;
/*
* Look for a listing for this device in a loaded device database.
*/
report = 1;
if ((device = pci_describe_device(child)) != NULL) {
device_printf(dev, "<%s>", device);
free(device, M_DEVBUF);
} else {
/*
* Scan the class/subclass descriptions for a general
* description.
*/
cp = "unknown";
scp = NULL;
for (i = 0; pci_nomatch_tab[i].desc != NULL; i++) {
if (pci_nomatch_tab[i].class == pci_get_class(child)) {
if (pci_nomatch_tab[i].subclass == -1) {
cp = pci_nomatch_tab[i].desc;
report = pci_nomatch_tab[i].report;
} else if (pci_nomatch_tab[i].subclass ==
pci_get_subclass(child)) {
scp = pci_nomatch_tab[i].desc;
report = pci_nomatch_tab[i].report;
}
}
}
if (report || bootverbose) {
device_printf(dev, "<%s%s%s>",
cp ? cp : "",
((cp != NULL) && (scp != NULL)) ? ", " : "",
scp ? scp : "");
}
}
if (report || bootverbose) {
printf(" at device %d.%d (no driver attached)\n",
pci_get_slot(child), pci_get_function(child));
}
pci_cfg_save(child, device_get_ivars(child), 1);
}
void
pci_child_detached(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
/*
* Have to deallocate IRQs before releasing any MSI messages and
* have to release MSI messages before deallocating any memory
* BARs.
*/
if (resource_list_release_active(rl, dev, child, SYS_RES_IRQ) != 0)
pci_printf(&dinfo->cfg, "Device leaked IRQ resources\n");
if (dinfo->cfg.msi.msi_alloc != 0 || dinfo->cfg.msix.msix_alloc != 0) {
pci_printf(&dinfo->cfg, "Device leaked MSI vectors\n");
(void)pci_release_msi(child);
}
if (resource_list_release_active(rl, dev, child, SYS_RES_MEMORY) != 0)
pci_printf(&dinfo->cfg, "Device leaked memory resources\n");
if (resource_list_release_active(rl, dev, child, SYS_RES_IOPORT) != 0)
pci_printf(&dinfo->cfg, "Device leaked I/O resources\n");
#ifdef PCI_RES_BUS
if (resource_list_release_active(rl, dev, child, PCI_RES_BUS) != 0)
pci_printf(&dinfo->cfg, "Device leaked PCI bus numbers\n");
#endif
pci_cfg_save(child, dinfo, 1);
}
/*
* Parse the PCI device database, if loaded, and return a pointer to a
* description of the device.
*
* The database is flat text formatted as follows:
*
* Any line not in a valid format is ignored.
* Lines are terminated with newline '\n' characters.
*
* A VENDOR line consists of the 4 digit (hex) vendor code, a TAB, then
* the vendor name.
*
* A DEVICE line is entered immediately below the corresponding VENDOR ID.
* - devices cannot be listed without a corresponding VENDOR line.
* A DEVICE line consists of a TAB, the 4 digit (hex) device code,
* another TAB, then the device name.
*/
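/*
 * Illustrative example (not taken from the shipped database): a vendor
 * entry followed by one of its device entries would look like
 *
 * 1234<TAB>Example Vendor, Inc.
 * <TAB>abcd<TAB>Example Ethernet Controller
 *
 * where 1234 and abcd are hypothetical hex vendor and device codes.
 */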
/*
* Assuming (ptr) points to the beginning of a line in the database,
* return the vendor or device and description of the next entry.
* The value of (vendor) or (device) inappropriate for the entry type
* is set to -1. Returns nonzero at the end of the database.
*
* Note that this is not very robust in the face of corrupt data;
* we attempt to safeguard against this by spamming the end of the
* database with a newline when we initialise.
*/
static int
pci_describe_parse_line(char **ptr, int *vendor, int *device, char **desc)
{
char *cp = *ptr;
int left;
*device = -1;
*vendor = -1;
**desc = '\0';
for (;;) {
left = pci_vendordata_size - (cp - pci_vendordata);
if (left <= 0) {
*ptr = cp;
return(1);
}
/* vendor entry? */
if (*cp != '\t' &&
sscanf(cp, "%x\t%80[^\n]", vendor, *desc) == 2)
break;
/* device entry? */
if (*cp == '\t' &&
sscanf(cp, "%x\t%80[^\n]", device, *desc) == 2)
break;
/* skip to next line */
while (*cp != '\n' && left > 0) {
cp++;
left--;
}
if (*cp == '\n') {
cp++;
left--;
}
}
/* skip to next line */
while (*cp != '\n' && left > 0) {
cp++;
left--;
}
if (*cp == '\n' && left > 0)
cp++;
*ptr = cp;
return(0);
}
static char *
pci_describe_device(device_t dev)
{
int vendor, device;
char *desc, *vp, *dp, *line;
desc = vp = dp = NULL;
/*
* If we have no vendor data, we can't do anything.
*/
if (pci_vendordata == NULL)
goto out;
/*
* Scan the vendor data looking for this device
*/
line = pci_vendordata;
if ((vp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL)
goto out;
for (;;) {
if (pci_describe_parse_line(&line, &vendor, &device, &vp))
goto out;
if (vendor == pci_get_vendor(dev))
break;
}
if ((dp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL)
goto out;
for (;;) {
if (pci_describe_parse_line(&line, &vendor, &device, &dp)) {
*dp = 0;
break;
}
if (vendor != -1) {
*dp = 0;
break;
}
if (device == pci_get_device(dev))
break;
}
if (dp[0] == '\0')
snprintf(dp, 80, "0x%x", pci_get_device(dev));
if ((desc = malloc(strlen(vp) + strlen(dp) + 3, M_DEVBUF, M_NOWAIT)) !=
NULL)
sprintf(desc, "%s, %s", vp, dp);
out:
if (vp != NULL)
free(vp, M_DEVBUF);
if (dp != NULL)
free(dp, M_DEVBUF);
return(desc);
}
int
pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
switch (which) {
case PCI_IVAR_ETHADDR:
/*
* The generic accessor doesn't deal with failure, so
* we set the return value, then return an error.
*/
*((uint8_t **) result) = NULL;
return (EINVAL);
case PCI_IVAR_SUBVENDOR:
*result = cfg->subvendor;
break;
case PCI_IVAR_SUBDEVICE:
*result = cfg->subdevice;
break;
case PCI_IVAR_VENDOR:
*result = cfg->vendor;
break;
case PCI_IVAR_DEVICE:
*result = cfg->device;
break;
case PCI_IVAR_DEVID:
*result = (cfg->device << 16) | cfg->vendor;
break;
case PCI_IVAR_CLASS:
*result = cfg->baseclass;
break;
case PCI_IVAR_SUBCLASS:
*result = cfg->subclass;
break;
case PCI_IVAR_PROGIF:
*result = cfg->progif;
break;
case PCI_IVAR_REVID:
*result = cfg->revid;
break;
case PCI_IVAR_INTPIN:
*result = cfg->intpin;
break;
case PCI_IVAR_IRQ:
*result = cfg->intline;
break;
case PCI_IVAR_DOMAIN:
*result = cfg->domain;
break;
case PCI_IVAR_BUS:
*result = cfg->bus;
break;
case PCI_IVAR_SLOT:
*result = cfg->slot;
break;
case PCI_IVAR_FUNCTION:
*result = cfg->func;
break;
case PCI_IVAR_CMDREG:
*result = cfg->cmdreg;
break;
case PCI_IVAR_CACHELNSZ:
*result = cfg->cachelnsz;
break;
case PCI_IVAR_MINGNT:
if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) {
*result = -1;
return (EINVAL);
}
*result = cfg->mingnt;
break;
case PCI_IVAR_MAXLAT:
if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) {
*result = -1;
return (EINVAL);
}
*result = cfg->maxlat;
break;
case PCI_IVAR_LATTIMER:
*result = cfg->lattimer;
break;
default:
return (ENOENT);
}
return (0);
}
int
pci_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(child);
switch (which) {
case PCI_IVAR_INTPIN:
dinfo->cfg.intpin = value;
return (0);
case PCI_IVAR_ETHADDR:
case PCI_IVAR_SUBVENDOR:
case PCI_IVAR_SUBDEVICE:
case PCI_IVAR_VENDOR:
case PCI_IVAR_DEVICE:
case PCI_IVAR_DEVID:
case PCI_IVAR_CLASS:
case PCI_IVAR_SUBCLASS:
case PCI_IVAR_PROGIF:
case PCI_IVAR_REVID:
case PCI_IVAR_IRQ:
case PCI_IVAR_DOMAIN:
case PCI_IVAR_BUS:
case PCI_IVAR_SLOT:
case PCI_IVAR_FUNCTION:
return (EINVAL); /* disallow for now */
default:
return (ENOENT);
}
}
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#include <sys/cons.h>
/*
* List resources based on PCI map registers, for use from within ddb
*/
DB_SHOW_COMMAND(pciregs, db_pci_dump)
{
struct pci_devinfo *dinfo;
struct devlist *devlist_head;
struct pci_conf *p;
const char *name;
int i, error, none_count;
none_count = 0;
/* get the head of the device queue */
devlist_head = &pci_devq;
/*
* Go through the list of devices and print them out
*/
for (error = 0, i = 0,
dinfo = STAILQ_FIRST(devlist_head);
(dinfo != NULL) && (error == 0) && (i < pci_numdevs) && !db_pager_quit;
dinfo = STAILQ_NEXT(dinfo, pci_links), i++) {
/* Populate pd_name and pd_unit */
name = NULL;
if (dinfo->cfg.dev)
name = device_get_name(dinfo->cfg.dev);
p = &dinfo->conf;
db_printf("%s%d@pci%d:%d:%d:%d:\tclass=0x%06x card=0x%08x "
"chip=0x%08x rev=0x%02x hdr=0x%02x\n",
(name && *name) ? name : "none",
(name && *name) ? (int)device_get_unit(dinfo->cfg.dev) :
none_count++,
p->pc_sel.pc_domain, p->pc_sel.pc_bus, p->pc_sel.pc_dev,
p->pc_sel.pc_func, (p->pc_class << 16) |
(p->pc_subclass << 8) | p->pc_progif,
(p->pc_subdevice << 16) | p->pc_subvendor,
(p->pc_device << 16) | p->pc_vendor,
p->pc_revid, p->pc_hdr);
}
}
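/*
 * Example usage (values illustrative): at the ddb(4) prompt,
 * "show pciregs" walks the device queue and prints one line per
 * function, e.g.
 *
 * em0@pci0:0:25:0:	class=0x020000 card=0x12345678 chip=0x10d38086 rev=0x00 hdr=0x00
 */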
#endif /* DDB */
static struct resource *
pci_reserve_map(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int num,
u_int flags)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct resource_list *rl = &dinfo->resources;
struct resource *res;
struct pci_map *pm;
uint16_t cmd;
pci_addr_t map, testval;
int mapsize;
res = NULL;
/* If rid is managed by EA, ignore it */
if (pci_ea_is_enabled(child, *rid))
goto out;
pm = pci_find_bar(child, *rid);
if (pm != NULL) {
/* This is a BAR that we failed to allocate earlier. */
mapsize = pm->pm_size;
map = pm->pm_value;
} else {
/*
* Weed out the bogons, and figure out how large the
* BAR/map is. BARs that read back 0 here are bogus
* and unimplemented. Note: atapci devices in legacy
* mode are special and handled elsewhere in the code.
* If you have an atapci device in legacy mode and it
* fails here, that other code is broken.
*/
pci_read_bar(child, *rid, &map, &testval, NULL);
/*
* Determine the size of the BAR and ignore BARs with a size
* of 0. Device ROM BARs use a different mask value.
*/
if (PCIR_IS_BIOS(&dinfo->cfg, *rid))
mapsize = pci_romsize(testval);
else
mapsize = pci_mapsize(testval);
if (mapsize == 0)
goto out;
pm = pci_add_bar(child, *rid, map, mapsize);
}
if (PCI_BAR_MEM(map) || PCIR_IS_BIOS(&dinfo->cfg, *rid)) {
if (type != SYS_RES_MEMORY) {
if (bootverbose)
device_printf(dev,
"child %s requested type %d for rid %#x,"
" but the BAR says it is a memio\n",
device_get_nameunit(child), type, *rid);
goto out;
}
} else {
if (type != SYS_RES_IOPORT) {
if (bootverbose)
device_printf(dev,
"child %s requested type %d for rid %#x,"
" but the BAR says it is an ioport\n",
device_get_nameunit(child), type, *rid);
goto out;
}
}
/*
* For real BARs, we need to override the size that
* the driver requests, because that's what the BAR
* actually uses and we would otherwise have a
* situation where we might allocate the excess to
* another driver, which won't work.
*/
count = ((pci_addr_t)1 << mapsize) * num;
if (RF_ALIGNMENT(flags) < mapsize)
flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize);
if (PCI_BAR_MEM(map) && (map & PCIM_BAR_MEM_PREFETCH))
flags |= RF_PREFETCHABLE;
/*
* Allocate enough resource, and then write back the
* appropriate BAR for that resource.
*/
resource_list_add(rl, type, *rid, start, end, count);
res = resource_list_reserve(rl, dev, child, type, rid, start, end,
count, flags & ~RF_ACTIVE);
if (res == NULL) {
resource_list_delete(rl, type, *rid);
device_printf(child,
"%#jx bytes of rid %#x res %d failed (%#jx, %#jx).\n",
count, *rid, type, start, end);
goto out;
}
if (bootverbose)
device_printf(child,
"Lazy allocation of %#jx bytes rid %#x type %d at %#jx\n",
count, *rid, type, rman_get_start(res));
/* Disable decoding via the CMD register before updating the BAR */
cmd = pci_read_config(child, PCIR_COMMAND, 2);
pci_write_config(child, PCIR_COMMAND,
cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2);
map = rman_get_start(res);
pci_write_bar(child, pm, map);
/* Restore the original value of the CMD register */
pci_write_config(child, PCIR_COMMAND, cmd, 2);
out:
return (res);
}
struct resource *
pci_alloc_multi_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_long num,
u_int flags)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
struct resource_list_entry *rle;
struct resource *res;
pcicfgregs *cfg;
/*
* Perform lazy resource allocation
*/
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
cfg = &dinfo->cfg;
switch (type) {
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
case PCI_RES_BUS:
return (pci_alloc_secbus(dev, child, rid, start, end, count,
flags));
#endif
case SYS_RES_IRQ:
/*
* Can't alloc legacy interrupt once MSI messages have
* been allocated.
*/
if (*rid == 0 && (cfg->msi.msi_alloc > 0 ||
cfg->msix.msix_alloc > 0))
return (NULL);
/*
* If the child device doesn't have an interrupt
* routed and is deserving of an interrupt, try to
* assign it one.
*/
if (*rid == 0 && !PCI_INTERRUPT_VALID(cfg->intline) &&
(cfg->intpin != 0))
pci_assign_interrupt(dev, child, 0);
break;
case SYS_RES_IOPORT:
case SYS_RES_MEMORY:
#ifdef NEW_PCIB
/*
* PCI-PCI bridge I/O window resources are not BARs.
* For those allocations just pass the request up the
* tree.
*/
if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE) {
switch (*rid) {
case PCIR_IOBASEL_1:
case PCIR_MEMBASE_1:
case PCIR_PMBASEL_1:
/*
* XXX: Should we bother creating a resource
* list entry?
*/
return (bus_generic_alloc_resource(dev, child,
type, rid, start, end, count, flags));
}
}
#endif
/* Reserve resources for this BAR if needed. */
rle = resource_list_find(rl, type, *rid);
if (rle == NULL) {
res = pci_reserve_map(dev, child, type, rid, start, end,
count, num, flags);
if (res == NULL)
return (NULL);
}
}
return (resource_list_alloc(rl, dev, child, type, rid,
start, end, count, flags));
}
struct resource *
pci_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
#ifdef PCI_IOV
struct pci_devinfo *dinfo;
#endif
if (device_get_parent(child) != dev)
return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
type, rid, start, end, count, flags));
#ifdef PCI_IOV
dinfo = device_get_ivars(child);
if (dinfo->cfg.flags & PCICFG_VF) {
switch (type) {
/* VFs can't have I/O BARs. */
case SYS_RES_IOPORT:
return (NULL);
case SYS_RES_MEMORY:
return (pci_vf_alloc_mem_resource(dev, child, rid,
start, end, count, flags));
}
/* Fall through for other types of resource allocations. */
}
#endif
return (pci_alloc_multi_resource(dev, child, type, rid, start, end,
count, 1, flags));
}
int
pci_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
pcicfgregs *cfg;
if (device_get_parent(child) != dev)
return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
type, rid, r));
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
#ifdef PCI_IOV
if (dinfo->cfg.flags & PCICFG_VF) {
switch (type) {
/* VFs can't have I/O BARs. */
case SYS_RES_IOPORT:
return (EDOOFUS);
case SYS_RES_MEMORY:
return (pci_vf_release_mem_resource(dev, child, rid,
r));
}
/* Fall through for other types of resource allocations. */
}
#endif
#ifdef NEW_PCIB
/*
* PCI-PCI bridge I/O window resources are not BARs. For
* those allocations just pass the request up the tree.
*/
if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE &&
(type == SYS_RES_IOPORT || type == SYS_RES_MEMORY)) {
switch (rid) {
case PCIR_IOBASEL_1:
case PCIR_MEMBASE_1:
case PCIR_PMBASEL_1:
return (bus_generic_release_resource(dev, child, type,
rid, r));
}
}
#endif
rl = &dinfo->resources;
return (resource_list_release(rl, dev, child, type, rid, r));
}
int
pci_activate_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pci_devinfo *dinfo;
int error;
error = bus_generic_activate_resource(dev, child, type, rid, r);
if (error)
return (error);
/* Enable decoding in the command register when activating BARs. */
if (device_get_parent(child) == dev) {
/* Device ROMs need their decoding explicitly enabled. */
dinfo = device_get_ivars(child);
if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid))
pci_write_bar(child, pci_find_bar(child, rid),
rman_get_start(r) | PCIM_BIOS_ENABLE);
switch (type) {
case SYS_RES_IOPORT:
case SYS_RES_MEMORY:
error = PCI_ENABLE_IO(dev, child, type);
break;
}
}
return (error);
}
int
pci_deactivate_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
struct pci_devinfo *dinfo;
int error;
error = bus_generic_deactivate_resource(dev, child, type, rid, r);
if (error)
return (error);
/* Disable decoding for device ROMs. */
if (device_get_parent(child) == dev) {
dinfo = device_get_ivars(child);
if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid))
pci_write_bar(child, pci_find_bar(child, rid),
rman_get_start(r));
}
return (0);
}
void
pci_child_deleted(device_t dev, device_t child)
{
struct resource_list_entry *rle;
struct resource_list *rl;
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
EVENTHANDLER_INVOKE(pci_delete_device, child);
/* Turn off access to resources we're about to free */
if (bus_child_present(child) != 0) {
pci_write_config(child, PCIR_COMMAND, pci_read_config(child,
PCIR_COMMAND, 2) & ~(PCIM_CMD_MEMEN | PCIM_CMD_PORTEN), 2);
pci_disable_busmaster(child);
}
/* Free all allocated resources */
STAILQ_FOREACH(rle, rl, link) {
if (rle->res) {
if (rman_get_flags(rle->res) & RF_ACTIVE ||
resource_list_busy(rl, rle->type, rle->rid)) {
pci_printf(&dinfo->cfg,
"Resource still owned, oops. "
"(type=%d, rid=%d, addr=%lx)\n",
rle->type, rle->rid,
rman_get_start(rle->res));
bus_release_resource(child, rle->type, rle->rid,
rle->res);
}
resource_list_unreserve(rl, dev, child, rle->type,
rle->rid);
}
}
resource_list_free(rl);
pci_freecfg(dinfo);
}
void
pci_delete_resource(device_t dev, device_t child, int type, int rid)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
struct resource_list_entry *rle;
if (device_get_parent(child) != dev)
return;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
rle = resource_list_find(rl, type, rid);
if (rle == NULL)
return;
if (rle->res) {
if (rman_get_flags(rle->res) & RF_ACTIVE ||
resource_list_busy(rl, type, rid)) {
device_printf(dev, "delete_resource: "
"Resource still owned by child, oops. "
"(type=%d, rid=%d, addr=%jx)\n",
type, rid, rman_get_start(rle->res));
return;
}
resource_list_unreserve(rl, dev, child, type, rid);
}
resource_list_delete(rl, type, rid);
}
struct resource_list *
pci_get_resource_list(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
return (&dinfo->resources);
}
bus_dma_tag_t
pci_get_dma_tag(device_t bus, device_t dev)
{
struct pci_softc *sc = device_get_softc(bus);
return (sc->sc_dma_tag);
}
uint32_t
pci_read_config_method(device_t dev, device_t child, int reg, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
#ifdef PCI_IOV
/*
* SR-IOV VFs don't implement the VID or DID registers, so we have to
* emulate them here.
*/
if (cfg->flags & PCICFG_VF) {
if (reg == PCIR_VENDOR) {
switch (width) {
case 4:
return (cfg->device << 16 | cfg->vendor);
case 2:
return (cfg->vendor);
case 1:
return (cfg->vendor & 0xff);
default:
return (0xffffffff);
}
} else if (reg == PCIR_DEVICE) {
switch (width) {
/* Note that an unaligned 4-byte read is an error. */
case 2:
return (cfg->device);
case 1:
return (cfg->device & 0xff);
default:
return (0xffffffff);
}
}
}
#endif
return (PCIB_READ_CONFIG(device_get_parent(dev),
cfg->bus, cfg->slot, cfg->func, reg, width));
}
void
pci_write_config_method(device_t dev, device_t child, int reg,
uint32_t val, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
PCIB_WRITE_CONFIG(device_get_parent(dev),
cfg->bus, cfg->slot, cfg->func, reg, val, width);
}
int
pci_child_location_str_method(device_t dev, device_t child, char *buf,
size_t buflen)
{
snprintf(buf, buflen, "slot=%d function=%d dbsf=pci%d:%d:%d:%d",
pci_get_slot(child), pci_get_function(child), pci_get_domain(child),
pci_get_bus(child), pci_get_slot(child), pci_get_function(child));
return (0);
}
int
pci_child_pnpinfo_str_method(device_t dev, device_t child, char *buf,
size_t buflen)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
snprintf(buf, buflen, "vendor=0x%04x device=0x%04x subvendor=0x%04x "
"subdevice=0x%04x class=0x%02x%02x%02x", cfg->vendor, cfg->device,
cfg->subvendor, cfg->subdevice, cfg->baseclass, cfg->subclass,
cfg->progif);
return (0);
}
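/*
 * Example output (hypothetical device): the two methods above produce
 * strings such as
 *
 * "slot=0 function=1 dbsf=pci0:2:0:1"
 * "vendor=0x1234 device=0xabcd subvendor=0x1234 subdevice=0x0001 class=0x020000"
 *
 * for domain 0, bus 2, slot 0, function 1 of a class 0x02 (network)
 * device.
 */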
int
pci_assign_interrupt_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
return (PCIB_ROUTE_INTERRUPT(device_get_parent(dev), child,
cfg->intpin));
}
static void
pci_lookup(void *arg, const char *name, device_t *dev)
{
long val;
char *end;
int domain, bus, slot, func;
if (*dev != NULL)
return;
/*
* Accept pciconf-style selectors of either pciD:B:S:F or
* pciB:S:F. In the latter case, the domain is assumed to
* be zero.
*/
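/*
 * For example, "pci0:2:0:1" selects domain 0, bus 2, slot 0,
 * function 1; the short form "pci2:0:1" selects the same device
 * with the domain defaulting to 0.
 */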
if (strncmp(name, "pci", 3) != 0)
return;
val = strtol(name + 3, &end, 10);
if (val < 0 || val > INT_MAX || *end != ':')
return;
domain = val;
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX || *end != ':')
return;
bus = val;
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX)
return;
slot = val;
if (*end == ':') {
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX || *end != '\0')
return;
func = val;
} else if (*end == '\0') {
func = slot;
slot = bus;
bus = domain;
domain = 0;
} else
return;
if (domain > PCI_DOMAINMAX || bus > PCI_BUSMAX || slot > PCI_SLOTMAX ||
func > PCIE_ARI_FUNCMAX || (slot != 0 && func > PCI_FUNCMAX))
return;
*dev = pci_find_dbsf(domain, bus, slot, func);
}
static int
pci_modevent(module_t mod, int what, void *arg)
{
static struct cdev *pci_cdev;
static eventhandler_tag tag;
switch (what) {
case MOD_LOAD:
STAILQ_INIT(&pci_devq);
pci_generation = 0;
pci_cdev = make_dev(&pcicdev, 0, UID_ROOT, GID_WHEEL, 0644,
"pci");
pci_load_vendor_data();
tag = EVENTHANDLER_REGISTER(dev_lookup, pci_lookup, NULL,
1000);
break;
case MOD_UNLOAD:
if (tag != NULL)
EVENTHANDLER_DEREGISTER(dev_lookup, tag);
destroy_dev(pci_cdev);
break;
}
return (0);
}
static void
pci_cfg_restore_pcie(device_t dev, struct pci_devinfo *dinfo)
{
#define WREG(n, v) pci_write_config(dev, pos + (n), (v), 2)
struct pcicfg_pcie *cfg;
int version, pos;
cfg = &dinfo->cfg.pcie;
pos = cfg->pcie_location;
version = cfg->pcie_flags & PCIEM_FLAGS_VERSION;
WREG(PCIER_DEVICE_CTL, cfg->pcie_device_ctl);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ENDPOINT ||
cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT)
WREG(PCIER_LINK_CTL, cfg->pcie_link_ctl);
if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
(cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT &&
(cfg->pcie_flags & PCIEM_FLAGS_SLOT))))
WREG(PCIER_SLOT_CTL, cfg->pcie_slot_ctl);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ROOT_EC)
WREG(PCIER_ROOT_CTL, cfg->pcie_root_ctl);
if (version > 1) {
WREG(PCIER_DEVICE_CTL2, cfg->pcie_device_ctl2);
WREG(PCIER_LINK_CTL2, cfg->pcie_link_ctl2);
WREG(PCIER_SLOT_CTL2, cfg->pcie_slot_ctl2);
}
#undef WREG
}
static void
pci_cfg_restore_pcix(device_t dev, struct pci_devinfo *dinfo)
{
pci_write_config(dev, dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND,
dinfo->cfg.pcix.pcix_command, 2);
}
void
pci_cfg_restore(device_t dev, struct pci_devinfo *dinfo)
{
/*
* Restore the device to full power mode. We must do this
* before we restore the registers because moving from D3 to
* D0 will cause the chip's BARs and some other registers to
* be reset to some unknown power on reset values. Cut down
* the noise on boot by doing nothing if we are already in
* state D0.
*/
if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0)
pci_set_powerstate(dev, PCI_POWERSTATE_D0);
pci_write_config(dev, PCIR_COMMAND, dinfo->cfg.cmdreg, 2);
pci_write_config(dev, PCIR_INTLINE, dinfo->cfg.intline, 1);
pci_write_config(dev, PCIR_INTPIN, dinfo->cfg.intpin, 1);
pci_write_config(dev, PCIR_CACHELNSZ, dinfo->cfg.cachelnsz, 1);
pci_write_config(dev, PCIR_LATTIMER, dinfo->cfg.lattimer, 1);
pci_write_config(dev, PCIR_PROGIF, dinfo->cfg.progif, 1);
pci_write_config(dev, PCIR_REVID, dinfo->cfg.revid, 1);
switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
pci_write_config(dev, PCIR_MINGNT, dinfo->cfg.mingnt, 1);
pci_write_config(dev, PCIR_MAXLAT, dinfo->cfg.maxlat, 1);
break;
case PCIM_HDRTYPE_BRIDGE:
pci_write_config(dev, PCIR_SECLAT_1,
dinfo->cfg.bridge.br_seclat, 1);
pci_write_config(dev, PCIR_SUBBUS_1,
dinfo->cfg.bridge.br_subbus, 1);
pci_write_config(dev, PCIR_SECBUS_1,
dinfo->cfg.bridge.br_secbus, 1);
pci_write_config(dev, PCIR_PRIBUS_1,
dinfo->cfg.bridge.br_pribus, 1);
pci_write_config(dev, PCIR_BRIDGECTL_1,
dinfo->cfg.bridge.br_control, 2);
break;
case PCIM_HDRTYPE_CARDBUS:
pci_write_config(dev, PCIR_SECLAT_2,
dinfo->cfg.bridge.br_seclat, 1);
pci_write_config(dev, PCIR_SUBBUS_2,
dinfo->cfg.bridge.br_subbus, 1);
pci_write_config(dev, PCIR_SECBUS_2,
dinfo->cfg.bridge.br_secbus, 1);
pci_write_config(dev, PCIR_PRIBUS_2,
dinfo->cfg.bridge.br_pribus, 1);
pci_write_config(dev, PCIR_BRIDGECTL_2,
dinfo->cfg.bridge.br_control, 2);
break;
}
pci_restore_bars(dev);
/*
* Restore extended capabilities for PCI-Express and PCI-X
*/
if (dinfo->cfg.pcie.pcie_location != 0)
pci_cfg_restore_pcie(dev, dinfo);
if (dinfo->cfg.pcix.pcix_location != 0)
pci_cfg_restore_pcix(dev, dinfo);
/* Restore MSI and MSI-X configurations if they are present. */
if (dinfo->cfg.msi.msi_location != 0)
pci_resume_msi(dev);
if (dinfo->cfg.msix.msix_location != 0)
pci_resume_msix(dev);
#ifdef PCI_IOV
if (dinfo->cfg.iov != NULL)
pci_iov_cfg_restore(dev, dinfo);
#endif
}
static void
pci_cfg_save_pcie(device_t dev, struct pci_devinfo *dinfo)
{
#define RREG(n) pci_read_config(dev, pos + (n), 2)
struct pcicfg_pcie *cfg;
int version, pos;
cfg = &dinfo->cfg.pcie;
pos = cfg->pcie_location;
cfg->pcie_flags = RREG(PCIER_FLAGS);
version = cfg->pcie_flags & PCIEM_FLAGS_VERSION;
cfg->pcie_device_ctl = RREG(PCIER_DEVICE_CTL);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ENDPOINT ||
cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT)
cfg->pcie_link_ctl = RREG(PCIER_LINK_CTL);
if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
(cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT &&
(cfg->pcie_flags & PCIEM_FLAGS_SLOT))))
cfg->pcie_slot_ctl = RREG(PCIER_SLOT_CTL);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ROOT_EC)
cfg->pcie_root_ctl = RREG(PCIER_ROOT_CTL);
if (version > 1) {
cfg->pcie_device_ctl2 = RREG(PCIER_DEVICE_CTL2);
cfg->pcie_link_ctl2 = RREG(PCIER_LINK_CTL2);
cfg->pcie_slot_ctl2 = RREG(PCIER_SLOT_CTL2);
}
#undef RREG
}
static void
pci_cfg_save_pcix(device_t dev, struct pci_devinfo *dinfo)
{
dinfo->cfg.pcix.pcix_command = pci_read_config(dev,
dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND, 2);
}
void
pci_cfg_save(device_t dev, struct pci_devinfo *dinfo, int setstate)
{
uint32_t cls;
int ps;
/*
* Some drivers apparently write to these registers w/o updating our
* cached copy. No harm happens if we update the copy, so do so here
* so we can restore them. The COMMAND register is modified by the
* bus w/o updating the cache. This should represent the normally
* writable portion of the 'defined' part of type 0/1/2 headers.
*/
dinfo->cfg.vendor = pci_read_config(dev, PCIR_VENDOR, 2);
dinfo->cfg.device = pci_read_config(dev, PCIR_DEVICE, 2);
dinfo->cfg.cmdreg = pci_read_config(dev, PCIR_COMMAND, 2);
dinfo->cfg.intline = pci_read_config(dev, PCIR_INTLINE, 1);
dinfo->cfg.intpin = pci_read_config(dev, PCIR_INTPIN, 1);
dinfo->cfg.cachelnsz = pci_read_config(dev, PCIR_CACHELNSZ, 1);
dinfo->cfg.lattimer = pci_read_config(dev, PCIR_LATTIMER, 1);
dinfo->cfg.baseclass = pci_read_config(dev, PCIR_CLASS, 1);
dinfo->cfg.subclass = pci_read_config(dev, PCIR_SUBCLASS, 1);
dinfo->cfg.progif = pci_read_config(dev, PCIR_PROGIF, 1);
dinfo->cfg.revid = pci_read_config(dev, PCIR_REVID, 1);
switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_0, 2);
dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_0, 2);
dinfo->cfg.mingnt = pci_read_config(dev, PCIR_MINGNT, 1);
dinfo->cfg.maxlat = pci_read_config(dev, PCIR_MAXLAT, 1);
break;
case PCIM_HDRTYPE_BRIDGE:
dinfo->cfg.bridge.br_seclat = pci_read_config(dev,
PCIR_SECLAT_1, 1);
dinfo->cfg.bridge.br_subbus = pci_read_config(dev,
PCIR_SUBBUS_1, 1);
dinfo->cfg.bridge.br_secbus = pci_read_config(dev,
PCIR_SECBUS_1, 1);
dinfo->cfg.bridge.br_pribus = pci_read_config(dev,
PCIR_PRIBUS_1, 1);
dinfo->cfg.bridge.br_control = pci_read_config(dev,
PCIR_BRIDGECTL_1, 2);
break;
case PCIM_HDRTYPE_CARDBUS:
dinfo->cfg.bridge.br_seclat = pci_read_config(dev,
PCIR_SECLAT_2, 1);
dinfo->cfg.bridge.br_subbus = pci_read_config(dev,
PCIR_SUBBUS_2, 1);
dinfo->cfg.bridge.br_secbus = pci_read_config(dev,
PCIR_SECBUS_2, 1);
dinfo->cfg.bridge.br_pribus = pci_read_config(dev,
PCIR_PRIBUS_2, 1);
dinfo->cfg.bridge.br_control = pci_read_config(dev,
PCIR_BRIDGECTL_2, 2);
dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_2, 2);
dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_2, 2);
break;
}
if (dinfo->cfg.pcie.pcie_location != 0)
pci_cfg_save_pcie(dev, dinfo);
if (dinfo->cfg.pcix.pcix_location != 0)
pci_cfg_save_pcix(dev, dinfo);
#ifdef PCI_IOV
if (dinfo->cfg.iov != NULL)
pci_iov_cfg_save(dev, dinfo);
#endif
/*
* Don't set the power state for display devices, base peripherals and
* memory devices, since bad things happen when they are powered down.
* We should (a) have drivers that can easily detach and (b) use
* generic drivers for these devices so that some device actually
* attaches. We need to make sure that when we implement (a) we don't
* power the device down on a reattach.
*/
cls = pci_get_class(dev);
if (!setstate)
return;
switch (pci_do_power_nodriver)
{
case 0: /* NO powerdown at all */
return;
case 1: /* Conservative about what to power down */
if (cls == PCIC_STORAGE)
return;
/*FALLTHROUGH*/
case 2: /* Aggressive about what to power down */
if (cls == PCIC_DISPLAY || cls == PCIC_MEMORY ||
cls == PCIC_BASEPERIPH)
return;
/*FALLTHROUGH*/
case 3: /* Power down everything */
break;
}
/*
* PCI spec says we can only go into D3 state from D0 state.
* Transition from D[12] into D0 before going to D3 state.
*/
ps = pci_get_powerstate(dev);
if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3)
pci_set_powerstate(dev, PCI_POWERSTATE_D0);
if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3)
pci_set_powerstate(dev, PCI_POWERSTATE_D3);
}
/* Wrapper APIs suitable for device driver use. */
void
pci_save_state(device_t dev)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
pci_cfg_save(dev, dinfo, 0);
}
void
pci_restore_state(device_t dev)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
pci_cfg_restore(dev, dinfo);
}
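/*
 * Editorial note: the two wrappers above are the interface leaf
 * drivers are expected to use.  A minimal sketch of that usage
 * follows; the "foo" driver and its methods are hypothetical, only
 * pci_save_state(), pci_restore_state(), bus_generic_suspend() and
 * bus_generic_resume() are real APIs.
 */
static int
foo_suspend(device_t dev)
{

	/* Snapshot the config header, MSI/MSI-X and PCI-e/PCI-X state. */
	pci_save_state(dev);
	return (bus_generic_suspend(dev));
}

static int
foo_resume(device_t dev)
{

	/* Reprogram BARs, command register and saved capabilities. */
	pci_restore_state(dev);
	return (bus_generic_resume(dev));
}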
static int
pci_get_id_method(device_t dev, device_t child, enum pci_id_type type,
uintptr_t *id)
{
return (PCIB_GET_ID(device_get_parent(dev), child, type, id));
}
/* Find the upstream port of a given PCI device in a root complex. */
device_t
pci_find_pcie_root_port(device_t dev)
{
struct pci_devinfo *dinfo;
devclass_t pci_class;
device_t pcib, bus;
pci_class = devclass_find("pci");
KASSERT(device_get_devclass(device_get_parent(dev)) == pci_class,
("%s: non-pci device %s", __func__, device_get_nameunit(dev)));
/*
* Walk the bridge hierarchy until we find a PCI-e root
* port or a non-PCI device.
*/
for (;;) {
bus = device_get_parent(dev);
KASSERT(bus != NULL, ("%s: null parent of %s", __func__,
device_get_nameunit(dev)));
pcib = device_get_parent(bus);
KASSERT(pcib != NULL, ("%s: null bridge of %s", __func__,
device_get_nameunit(bus)));
/*
* pcib's parent must be a PCI bus for this to be a
* PCI-PCI bridge.
*/
if (device_get_devclass(device_get_parent(pcib)) != pci_class)
return (NULL);
dinfo = device_get_ivars(pcib);
if (dinfo->cfg.pcie.pcie_location != 0 &&
dinfo->cfg.pcie.pcie_type == PCIEM_TYPE_ROOT_PORT)
return (pcib);
dev = pcib;
}
}
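/*
 * Editorial note: an illustrative (hypothetical) caller of
 * pci_find_pcie_root_port(), e.g. to inspect the upstream link of the
 * root port a device sits under.  pcie_read_config() and
 * PCIER_LINK_STA are real; "foo_check_root_port" is made up.
 */
static void
foo_check_root_port(device_t dev)
{
	device_t rp;
	uint16_t lnksta;

	rp = pci_find_pcie_root_port(dev);
	if (rp == NULL) {
		device_printf(dev, "no PCI-e root port above this device\n");
		return;
	}
	lnksta = pcie_read_config(rp, PCIER_LINK_STA, 2);
	device_printf(dev, "root port %s link status 0x%04x\n",
	    device_get_nameunit(rp), lnksta);
}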
/*
* Wait for pending transactions to complete on a PCI-express function.
*
* The maximum delay is specified in milliseconds in max_delay. Note
* that this function may sleep.
*
* Returns true if the function is idle and false if the timeout is
* exceeded. If dev is not a PCI-express function, this returns true.
*/
bool
pcie_wait_for_pending_transactions(device_t dev, u_int max_delay)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint16_t sta;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (true);
sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
while (sta & PCIEM_STA_TRANSACTION_PND) {
if (max_delay == 0)
return (false);
/* Poll once every 100 milliseconds up to the timeout. */
if (max_delay > 100) {
pause_sbt("pcietp", 100 * SBT_1MS, 0, C_HARDCLOCK);
max_delay -= 100;
} else {
pause_sbt("pcietp", max_delay * SBT_1MS, 0,
C_HARDCLOCK);
max_delay = 0;
}
sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
}
return (true);
}
/*
* Determine the maximum Completion Timeout in microseconds.
*
* For non-PCI-express functions this returns 0.
*/
int
pcie_get_max_completion_timeout(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
/*
* Functions using the 1.x spec use the default timeout range of
* 50 microseconds to 50 milliseconds. Functions that do not
* support programmable timeouts also use this range.
*/
if ((dinfo->cfg.pcie.pcie_flags & PCIEM_FLAGS_VERSION) < 2 ||
(pci_read_config(dev, cap + PCIER_DEVICE_CAP2, 4) &
PCIEM_CAP2_COMP_TIMO_RANGES) == 0)
return (50 * 1000);
switch (pci_read_config(dev, cap + PCIER_DEVICE_CTL2, 2) &
PCIEM_CTL2_COMP_TIMO_VAL) {
case PCIEM_CTL2_COMP_TIMO_100US:
return (100);
case PCIEM_CTL2_COMP_TIMO_10MS:
return (10 * 1000);
case PCIEM_CTL2_COMP_TIMO_55MS:
return (55 * 1000);
case PCIEM_CTL2_COMP_TIMO_210MS:
return (210 * 1000);
case PCIEM_CTL2_COMP_TIMO_900MS:
return (900 * 1000);
case PCIEM_CTL2_COMP_TIMO_3500MS:
return (3500 * 1000);
case PCIEM_CTL2_COMP_TIMO_13S:
return (13 * 1000 * 1000);
case PCIEM_CTL2_COMP_TIMO_64S:
return (64 * 1000 * 1000);
default:
return (50 * 1000);
}
}
/*
* Perform a Function Level Reset (FLR) on a device.
*
* This function first waits for any pending transactions to complete
* within the timeout specified by max_delay. If transactions are
* still pending, the function will return false without attempting a
* reset.
*
* If dev is not a PCI-express function or does not support FLR, this
* function returns false.
*
* Note that no registers are saved or restored. The caller is
* responsible for saving and restoring any registers including
* PCI-standard registers via pci_save_state() and
* pci_restore_state().
*/
bool
pcie_flr(device_t dev, u_int max_delay, bool force)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint16_t cmd, ctl;
int compl_delay;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (false);
if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR))
return (false);
/*
* Disable busmastering to prevent generation of new
* transactions while waiting for the device to go idle. If
* the idle timeout fails, the command register is restored
* which will re-enable busmastering.
*/
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
pci_write_config(dev, PCIR_COMMAND, cmd & ~(PCIM_CMD_BUSMASTEREN), 2);
if (!pcie_wait_for_pending_transactions(dev, max_delay)) {
if (!force) {
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
return (false);
}
pci_printf(&dinfo->cfg,
"Resetting with transactions pending after %d ms\n",
max_delay);
/*
* Extend the post-FLR delay to cover the maximum
* Completion Timeout delay of anything in flight
* during the FLR delay. Enforce a minimum delay of
* at least 10ms.
*/
compl_delay = pcie_get_max_completion_timeout(dev) / 1000;
if (compl_delay < 10)
compl_delay = 10;
} else
compl_delay = 0;
/* Initiate the reset. */
ctl = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
pci_write_config(dev, cap + PCIER_DEVICE_CTL, ctl |
PCIEM_CTL_INITIATE_FLR, 2);
/* Wait for 100ms. */
pause_sbt("pcieflr", (100 + compl_delay) * SBT_1MS, 0, C_HARDCLOCK);
if (pci_read_config(dev, cap + PCIER_DEVICE_STA, 2) &
PCIEM_STA_TRANSACTION_PND)
pci_printf(&dinfo->cfg, "Transactions pending after FLR!\n");
return (true);
}
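/*
 * Editorial note: a sketch of a driver-initiated FLR.  As the comment
 * above pcie_flr() states, no registers are preserved, so the caller
 * brackets the reset with pci_save_state()/pci_restore_state().  The
 * 100 ms drain budget and "foo_reset" are arbitrary example choices.
 */
static int
foo_reset(device_t dev)
{

	pci_save_state(dev);
	if (!pcie_flr(dev, 100, false)) {
		/* No FLR capability, or transactions never drained. */
		pci_restore_state(dev);
		return (ENXIO);
	}
	pci_restore_state(dev);
	return (0);
}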
Index: head/sys/dev/pci/pci_pci.c
===================================================================
--- head/sys/dev/pci/pci_pci.c (revision 327172)
+++ head/sys/dev/pci/pci_pci.c (revision 327173)
@@ -1,2898 +1,2894 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1994,1995 Stefan Esser, Wolfgang StanglMeier
* Copyright (c) 2000 Michael Smith <msmith@freebsd.org>
* Copyright (c) 2000 BSDi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* PCI:PCI bridge support.
*/
#include "opt_pci.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pci_private.h>
#include <dev/pci/pcib_private.h>
#include "pcib_if.h"
static int pcib_probe(device_t dev);
static int pcib_suspend(device_t dev);
static int pcib_resume(device_t dev);
static int pcib_power_for_sleep(device_t pcib, device_t dev,
int *pstate);
static int pcib_ari_get_id(device_t pcib, device_t dev,
enum pci_id_type type, uintptr_t *id);
static uint32_t pcib_read_config(device_t dev, u_int b, u_int s,
u_int f, u_int reg, int width);
static void pcib_write_config(device_t dev, u_int b, u_int s,
u_int f, u_int reg, uint32_t val, int width);
static int pcib_ari_maxslots(device_t dev);
static int pcib_ari_maxfuncs(device_t dev);
static int pcib_try_enable_ari(device_t pcib, device_t dev);
static int pcib_ari_enabled(device_t pcib);
static void pcib_ari_decode_rid(device_t pcib, uint16_t rid,
int *bus, int *slot, int *func);
#ifdef PCI_HP
static void pcib_pcie_ab_timeout(void *arg);
static void pcib_pcie_cc_timeout(void *arg);
static void pcib_pcie_dll_timeout(void *arg);
#endif
static int pcib_request_feature_default(device_t pcib, device_t dev,
enum pci_feature feature);
static device_method_t pcib_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, pcib_probe),
DEVMETHOD(device_attach, pcib_attach),
DEVMETHOD(device_detach, pcib_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, pcib_suspend),
DEVMETHOD(device_resume, pcib_resume),
/* Bus interface */
DEVMETHOD(bus_child_present, pcib_child_present),
DEVMETHOD(bus_read_ivar, pcib_read_ivar),
DEVMETHOD(bus_write_ivar, pcib_write_ivar),
DEVMETHOD(bus_alloc_resource, pcib_alloc_resource),
#ifdef NEW_PCIB
DEVMETHOD(bus_adjust_resource, pcib_adjust_resource),
DEVMETHOD(bus_release_resource, pcib_release_resource),
#else
DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource),
DEVMETHOD(bus_release_resource, bus_generic_release_resource),
#endif
DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
/* pcib interface */
DEVMETHOD(pcib_maxslots, pcib_ari_maxslots),
DEVMETHOD(pcib_maxfuncs, pcib_ari_maxfuncs),
DEVMETHOD(pcib_read_config, pcib_read_config),
DEVMETHOD(pcib_write_config, pcib_write_config),
DEVMETHOD(pcib_route_interrupt, pcib_route_interrupt),
DEVMETHOD(pcib_alloc_msi, pcib_alloc_msi),
DEVMETHOD(pcib_release_msi, pcib_release_msi),
DEVMETHOD(pcib_alloc_msix, pcib_alloc_msix),
DEVMETHOD(pcib_release_msix, pcib_release_msix),
DEVMETHOD(pcib_map_msi, pcib_map_msi),
DEVMETHOD(pcib_power_for_sleep, pcib_power_for_sleep),
DEVMETHOD(pcib_get_id, pcib_ari_get_id),
DEVMETHOD(pcib_try_enable_ari, pcib_try_enable_ari),
DEVMETHOD(pcib_ari_enabled, pcib_ari_enabled),
DEVMETHOD(pcib_decode_rid, pcib_ari_decode_rid),
DEVMETHOD(pcib_request_feature, pcib_request_feature_default),
DEVMETHOD_END
};
static devclass_t pcib_devclass;
DEFINE_CLASS_0(pcib, pcib_driver, pcib_methods, sizeof(struct pcib_softc));
DRIVER_MODULE(pcib, pci, pcib_driver, pcib_devclass, NULL, NULL);
#if defined(NEW_PCIB) || defined(PCI_HP)
SYSCTL_DECL(_hw_pci);
#endif
#ifdef NEW_PCIB
static int pci_clear_pcib;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_pcib, CTLFLAG_RDTUN, &pci_clear_pcib, 0,
"Clear firmware-assigned resources for PCI-PCI bridge I/O windows.");
/*
* Is a resource from a child device sub-allocated from one of our
* resource managers?
*/
static int
pcib_is_resource_managed(struct pcib_softc *sc, int type, struct resource *r)
{
switch (type) {
#ifdef PCI_RES_BUS
case PCI_RES_BUS:
return (rman_is_region_manager(r, &sc->bus.rman));
#endif
case SYS_RES_IOPORT:
return (rman_is_region_manager(r, &sc->io.rman));
case SYS_RES_MEMORY:
/* Prefetchable resources may live in either memory rman. */
if (rman_get_flags(r) & RF_PREFETCHABLE &&
rman_is_region_manager(r, &sc->pmem.rman))
return (1);
return (rman_is_region_manager(r, &sc->mem.rman));
}
return (0);
}
static int
pcib_is_window_open(struct pcib_window *pw)
{
return (pw->valid && pw->base < pw->limit);
}
/*
* XXX: If RF_ACTIVE did not also imply allocating a bus space tag and
* handle for the resource, we could pass RF_ACTIVE up to the PCI bus
* when allocating the resource windows and rely on the PCI bus driver
* to do this for us.
*/
static void
pcib_activate_window(struct pcib_softc *sc, int type)
{
PCI_ENABLE_IO(device_get_parent(sc->dev), sc->dev, type);
}
static void
pcib_write_windows(struct pcib_softc *sc, int mask)
{
device_t dev;
uint32_t val;
dev = sc->dev;
if (sc->io.valid && mask & WIN_IO) {
val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
pci_write_config(dev, PCIR_IOBASEH_1,
sc->io.base >> 16, 2);
pci_write_config(dev, PCIR_IOLIMITH_1,
sc->io.limit >> 16, 2);
}
pci_write_config(dev, PCIR_IOBASEL_1, sc->io.base >> 8, 1);
pci_write_config(dev, PCIR_IOLIMITL_1, sc->io.limit >> 8, 1);
}
if (mask & WIN_MEM) {
pci_write_config(dev, PCIR_MEMBASE_1, sc->mem.base >> 16, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, sc->mem.limit >> 16, 2);
}
if (sc->pmem.valid && mask & WIN_PMEM) {
val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
pci_write_config(dev, PCIR_PMBASEH_1,
sc->pmem.base >> 32, 4);
pci_write_config(dev, PCIR_PMLIMITH_1,
sc->pmem.limit >> 32, 4);
}
pci_write_config(dev, PCIR_PMBASEL_1, sc->pmem.base >> 16, 2);
pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmem.limit >> 16, 2);
}
}
/*
* This is used to reject I/O port allocations that conflict with an
* ISA alias range.
*/
static int
pcib_is_isa_range(struct pcib_softc *sc, rman_res_t start, rman_res_t end,
rman_res_t count)
{
rman_res_t next_alias;
if (!(sc->bridgectl & PCIB_BCR_ISA_ENABLE))
return (0);
/* Only check fixed ranges for overlap. */
if (start + count - 1 != end)
return (0);
/* ISA aliases are only in the lower 64KB of I/O space. */
if (start >= 65536)
return (0);
/* Check for overlap with 0x000 - 0x0ff as a special case. */
if (start < 0x100)
goto alias;
/*
* If the start address is an alias, the range is an alias.
* Otherwise, compute the start of the next alias range and
* check if it is before the end of the candidate range.
*/
if ((start & 0x300) != 0)
goto alias;
next_alias = (start & ~0x3fful) | 0x100;
if (next_alias <= end)
goto alias;
return (0);
alias:
if (bootverbose)
device_printf(sc->dev,
"I/O range %#jx-%#jx overlaps with an ISA alias\n", start,
end);
return (1);
}
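/*
 * Worked example (editorial note): with ISA decoding enabled only the
 * first 0x100 bytes of each 0x400-byte block below 64K are usable;
 * offsets 0x100-0x3ff of every block alias legacy ISA ports.  A fixed
 * request for 0x3e8-0x3ef is rejected since (0x3e8 & 0x300) != 0,
 * while 0x4000-0x40ff is accepted because the next alias range starts
 * at 0x4100, past the end of the request.
 */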
static void
pcib_add_window_resources(struct pcib_window *w, struct resource **res,
int count)
{
struct resource **newarray;
int error, i;
newarray = malloc(sizeof(struct resource *) * (w->count + count),
M_DEVBUF, M_WAITOK);
if (w->res != NULL)
bcopy(w->res, newarray, sizeof(struct resource *) * w->count);
bcopy(res, newarray + w->count, sizeof(struct resource *) * count);
free(w->res, M_DEVBUF);
w->res = newarray;
w->count += count;
for (i = 0; i < count; i++) {
error = rman_manage_region(&w->rman, rman_get_start(res[i]),
rman_get_end(res[i]));
if (error)
panic("Failed to add resource to rman");
}
}
typedef void (nonisa_callback)(rman_res_t start, rman_res_t end, void *arg);
static void
pcib_walk_nonisa_ranges(rman_res_t start, rman_res_t end, nonisa_callback *cb,
void *arg)
{
rman_res_t next_end;
/*
* If start is within an ISA alias range, move up to the start
* of the next non-alias range. As a special case, addresses
* in the range 0x000 - 0x0ff should also be skipped since
* those are used for various system I/O devices in ISA
* systems.
*/
if (start <= 65535) {
if (start < 0x100 || (start & 0x300) != 0) {
start &= ~0x3ff;
start += 0x400;
}
}
/* ISA aliases are only in the lower 64KB of I/O space. */
while (start <= MIN(end, 65535)) {
next_end = MIN(start | 0xff, end);
cb(start, next_end, arg);
start += 0x400;
}
if (start <= end)
cb(start, end, arg);
}
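/*
 * Worked example (editorial note): walking 0xf000-0x1ffff invokes the
 * callback on 0xf000-0xf0ff, 0xf400-0xf4ff, ..., 0xfc00-0xfcff (the
 * non-aliased slice of each 0x400-byte block below 64K) and then once
 * on 0x10000-0x1ffff, where ISA aliasing no longer applies.
 */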
static void
count_ranges(rman_res_t start, rman_res_t end, void *arg)
{
int *countp;
countp = arg;
(*countp)++;
}
struct alloc_state {
struct resource **res;
struct pcib_softc *sc;
int count, error;
};
static void
alloc_ranges(rman_res_t start, rman_res_t end, void *arg)
{
struct alloc_state *as;
struct pcib_window *w;
int rid;
as = arg;
if (as->error != 0)
return;
w = &as->sc->io;
rid = w->reg;
if (bootverbose)
device_printf(as->sc->dev,
"allocating non-ISA range %#jx-%#jx\n", start, end);
as->res[as->count] = bus_alloc_resource(as->sc->dev, SYS_RES_IOPORT,
&rid, start, end, end - start + 1, 0);
if (as->res[as->count] == NULL)
as->error = ENXIO;
else
as->count++;
}
static int
pcib_alloc_nonisa_ranges(struct pcib_softc *sc, rman_res_t start, rman_res_t end)
{
struct alloc_state as;
int i, new_count;
/* First, see how many ranges we need. */
new_count = 0;
pcib_walk_nonisa_ranges(start, end, count_ranges, &new_count);
/* Second, allocate the ranges. */
as.res = malloc(sizeof(struct resource *) * new_count, M_DEVBUF,
M_WAITOK);
as.sc = sc;
as.count = 0;
as.error = 0;
pcib_walk_nonisa_ranges(start, end, alloc_ranges, &as);
if (as.error != 0) {
for (i = 0; i < as.count; i++)
bus_release_resource(sc->dev, SYS_RES_IOPORT,
sc->io.reg, as.res[i]);
free(as.res, M_DEVBUF);
return (as.error);
}
KASSERT(as.count == new_count, ("%s: count mismatch", __func__));
/* Third, add the ranges to the window. */
pcib_add_window_resources(&sc->io, as.res, as.count);
free(as.res, M_DEVBUF);
return (0);
}
static void
pcib_alloc_window(struct pcib_softc *sc, struct pcib_window *w, int type,
int flags, pci_addr_t max_address)
{
struct resource *res;
char buf[64];
int error, rid;
if (max_address != (rman_res_t)max_address)
max_address = ~0;
w->rman.rm_start = 0;
w->rman.rm_end = max_address;
w->rman.rm_type = RMAN_ARRAY;
snprintf(buf, sizeof(buf), "%s %s window",
device_get_nameunit(sc->dev), w->name);
w->rman.rm_descr = strdup(buf, M_DEVBUF);
error = rman_init(&w->rman);
if (error)
panic("Failed to initialize %s %s rman",
device_get_nameunit(sc->dev), w->name);
if (!pcib_is_window_open(w))
return;
if (w->base > max_address || w->limit > max_address) {
device_printf(sc->dev,
"initial %s window has too many bits, ignoring\n", w->name);
return;
}
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE)
(void)pcib_alloc_nonisa_ranges(sc, w->base, w->limit);
else {
rid = w->reg;
res = bus_alloc_resource(sc->dev, type, &rid, w->base, w->limit,
w->limit - w->base + 1, flags);
if (res != NULL)
pcib_add_window_resources(w, &res, 1);
}
if (w->res == NULL) {
device_printf(sc->dev,
"failed to allocate initial %s window: %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
w->base = max_address;
w->limit = 0;
pcib_write_windows(sc, w->mask);
return;
}
pcib_activate_window(sc, type);
}
/*
* Initialize I/O windows.
*/
static void
pcib_probe_windows(struct pcib_softc *sc)
{
pci_addr_t max;
device_t dev;
uint32_t val;
dev = sc->dev;
if (pci_clear_pcib) {
pcib_bridge_init(dev);
}
/* Determine if the I/O port window is implemented. */
val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if (val == 0) {
/*
* If 'val' is zero, then only 16-bits of I/O space
* are supported.
*/
pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
if (pci_read_config(dev, PCIR_IOBASEL_1, 1) != 0) {
sc->io.valid = 1;
pci_write_config(dev, PCIR_IOBASEL_1, 0, 1);
}
} else
sc->io.valid = 1;
/* Read the existing I/O port window. */
if (sc->io.valid) {
sc->io.reg = PCIR_IOBASEL_1;
sc->io.step = 12;
sc->io.mask = WIN_IO;
sc->io.name = "I/O port";
if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
sc->io.base = PCI_PPBIOBASE(
pci_read_config(dev, PCIR_IOBASEH_1, 2), val);
sc->io.limit = PCI_PPBIOLIMIT(
pci_read_config(dev, PCIR_IOLIMITH_1, 2),
pci_read_config(dev, PCIR_IOLIMITL_1, 1));
max = 0xffffffff;
} else {
sc->io.base = PCI_PPBIOBASE(0, val);
sc->io.limit = PCI_PPBIOLIMIT(0,
pci_read_config(dev, PCIR_IOLIMITL_1, 1));
max = 0xffff;
}
pcib_alloc_window(sc, &sc->io, SYS_RES_IOPORT, 0, max);
}
/* Read the existing memory window. */
sc->mem.valid = 1;
sc->mem.reg = PCIR_MEMBASE_1;
sc->mem.step = 20;
sc->mem.mask = WIN_MEM;
sc->mem.name = "memory";
sc->mem.base = PCI_PPBMEMBASE(0,
pci_read_config(dev, PCIR_MEMBASE_1, 2));
sc->mem.limit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
pcib_alloc_window(sc, &sc->mem, SYS_RES_MEMORY, 0, 0xffffffff);
/* Determine if the prefetchable memory window is implemented. */
val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if (val == 0) {
/*
* If 'val' is zero, then only 32-bits of memory space
* are supported.
*/
pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
if (pci_read_config(dev, PCIR_PMBASEL_1, 2) != 0) {
sc->pmem.valid = 1;
pci_write_config(dev, PCIR_PMBASEL_1, 0, 2);
}
} else
sc->pmem.valid = 1;
/* Read the existing prefetchable memory window. */
if (sc->pmem.valid) {
sc->pmem.reg = PCIR_PMBASEL_1;
sc->pmem.step = 20;
sc->pmem.mask = WIN_PMEM;
sc->pmem.name = "prefetch";
if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
sc->pmem.base = PCI_PPBMEMBASE(
pci_read_config(dev, PCIR_PMBASEH_1, 4), val);
sc->pmem.limit = PCI_PPBMEMLIMIT(
pci_read_config(dev, PCIR_PMLIMITH_1, 4),
pci_read_config(dev, PCIR_PMLIMITL_1, 2));
max = 0xffffffffffffffff;
} else {
sc->pmem.base = PCI_PPBMEMBASE(0, val);
sc->pmem.limit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_PMLIMITL_1, 2));
max = 0xffffffff;
}
pcib_alloc_window(sc, &sc->pmem, SYS_RES_MEMORY,
RF_PREFETCHABLE, max);
}
}
static void
pcib_release_window(struct pcib_softc *sc, struct pcib_window *w, int type)
{
device_t dev;
int error, i;
if (!w->valid)
return;
dev = sc->dev;
error = rman_fini(&w->rman);
if (error) {
device_printf(dev, "failed to release %s rman\n", w->name);
return;
}
free(__DECONST(char *, w->rman.rm_descr), M_DEVBUF);
for (i = 0; i < w->count; i++) {
error = bus_free_resource(dev, type, w->res[i]);
if (error)
device_printf(dev,
"failed to release %s resource: %d\n", w->name,
error);
}
free(w->res, M_DEVBUF);
}
static void
pcib_free_windows(struct pcib_softc *sc)
{
pcib_release_window(sc, &sc->pmem, SYS_RES_MEMORY);
pcib_release_window(sc, &sc->mem, SYS_RES_MEMORY);
pcib_release_window(sc, &sc->io, SYS_RES_IOPORT);
}
#ifdef PCI_RES_BUS
/*
* Allocate a suitable secondary bus for this bridge if needed and
* initialize the resource manager for the secondary bus range. Note
* that the minimum count is a desired value and this may allocate a
* smaller range.
*/
void
pcib_setup_secbus(device_t dev, struct pcib_secbus *bus, int min_count)
{
char buf[64];
int error, rid, sec_reg;
switch (pci_read_config(dev, PCIR_HDRTYPE, 1) & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
bus->sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
bus->sub_reg = PCIR_SUBBUS_2;
break;
default:
panic("not a PCI bridge");
}
bus->sec = pci_read_config(dev, sec_reg, 1);
bus->sub = pci_read_config(dev, bus->sub_reg, 1);
bus->dev = dev;
bus->rman.rm_start = 0;
bus->rman.rm_end = PCI_BUSMAX;
bus->rman.rm_type = RMAN_ARRAY;
snprintf(buf, sizeof(buf), "%s bus numbers", device_get_nameunit(dev));
bus->rman.rm_descr = strdup(buf, M_DEVBUF);
error = rman_init(&bus->rman);
if (error)
panic("Failed to initialize %s bus number rman",
device_get_nameunit(dev));
/*
* Allocate a bus range. This will return an existing bus range
* if one exists, or a new bus range if one does not.
*/
rid = 0;
bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
min_count, 0);
if (bus->res == NULL) {
/*
* Fall back to just allocating a range of a single bus
* number.
*/
bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
1, 0);
} else if (rman_get_size(bus->res) < min_count)
/*
* Attempt to grow the existing range to satisfy the
* minimum desired count.
*/
(void)bus_adjust_resource(dev, PCI_RES_BUS, bus->res,
rman_get_start(bus->res), rman_get_start(bus->res) +
min_count - 1);
/*
* Add the initial resource to the rman.
*/
if (bus->res != NULL) {
error = rman_manage_region(&bus->rman, rman_get_start(bus->res),
rman_get_end(bus->res));
if (error)
panic("Failed to add resource to rman");
bus->sec = rman_get_start(bus->res);
bus->sub = rman_get_end(bus->res);
}
}
void
pcib_free_secbus(device_t dev, struct pcib_secbus *bus)
{
int error;
error = rman_fini(&bus->rman);
if (error) {
device_printf(dev, "failed to release bus number rman\n");
return;
}
free(__DECONST(char *, bus->rman.rm_descr), M_DEVBUF);
error = bus_free_resource(dev, PCI_RES_BUS, bus->res);
if (error)
device_printf(dev,
"failed to release bus numbers resource: %d\n", error);
}
static struct resource *
pcib_suballoc_bus(struct pcib_secbus *bus, device_t child, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
res = rman_reserve_resource(&bus->rman, start, end, count, flags,
child);
if (res == NULL)
return (NULL);
if (bootverbose)
device_printf(bus->dev,
"allocated bus range (%ju-%ju) for rid %d of %s\n",
rman_get_start(res), rman_get_end(res), *rid,
pcib_child_name(child));
rman_set_rid(res, *rid);
return (res);
}
/*
* Attempt to grow the secondary bus range. This is much simpler than
* for I/O windows as the range can only be grown by increasing
* subbus.
*/
static int
pcib_grow_subbus(struct pcib_secbus *bus, rman_res_t new_end)
{
rman_res_t old_end;
int error;
old_end = rman_get_end(bus->res);
KASSERT(new_end > old_end, ("attempt to shrink subbus"));
error = bus_adjust_resource(bus->dev, PCI_RES_BUS, bus->res,
rman_get_start(bus->res), new_end);
if (error)
return (error);
if (bootverbose)
device_printf(bus->dev, "grew bus range to %ju-%ju\n",
rman_get_start(bus->res), rman_get_end(bus->res));
error = rman_manage_region(&bus->rman, old_end + 1,
rman_get_end(bus->res));
if (error)
panic("Failed to add resource to rman");
bus->sub = rman_get_end(bus->res);
pci_write_config(bus->dev, bus->sub_reg, bus->sub, 1);
return (0);
}
struct resource *
pcib_alloc_subbus(struct pcib_secbus *bus, device_t child, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
rman_res_t start_free, end_free, new_end;
/*
* First, see if the request can be satisfied by the existing
* bus range.
*/
res = pcib_suballoc_bus(bus, child, rid, start, end, count, flags);
if (res != NULL)
return (res);
/*
* Figure out a range to grow the bus range. First, find the
* first bus number after the last allocated bus in the rman and
* enforce that as a minimum starting point for the range.
*/
if (rman_last_free_region(&bus->rman, &start_free, &end_free) != 0 ||
end_free != bus->sub)
start_free = bus->sub + 1;
if (start_free < start)
start_free = start;
new_end = start_free + count - 1;
/*
* See if this new range would satisfy the request if it
* succeeds.
*/
if (new_end > end)
return (NULL);
/* Finally, attempt to grow the existing resource. */
if (bootverbose) {
device_printf(bus->dev,
"attempting to grow bus range for %ju buses\n", count);
printf("\tback candidate range: %ju-%ju\n", start_free,
new_end);
}
if (pcib_grow_subbus(bus, new_end) == 0)
return (pcib_suballoc_bus(bus, child, rid, start, end, count,
flags));
return (NULL);
}
#endif
#else
/*
* Is the prefetch window open (e.g., can we allocate memory in it?)
*/
static int
pcib_is_prefetch_open(struct pcib_softc *sc)
{
return (sc->pmembase > 0 && sc->pmembase < sc->pmemlimit);
}
/*
* Is the nonprefetch window open (e.g., can we allocate memory in it?)
*/
static int
pcib_is_nonprefetch_open(struct pcib_softc *sc)
{
return (sc->membase > 0 && sc->membase < sc->memlimit);
}
/*
* Is the I/O window open (e.g., can we allocate ports in it?)
*/
static int
pcib_is_io_open(struct pcib_softc *sc)
{
return (sc->iobase > 0 && sc->iobase < sc->iolimit);
}
/*
* Get current I/O decode.
*/
static void
pcib_get_io_decode(struct pcib_softc *sc)
{
device_t dev;
uint32_t iolow;
dev = sc->dev;
iolow = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
sc->iobase = PCI_PPBIOBASE(
pci_read_config(dev, PCIR_IOBASEH_1, 2), iolow);
else
sc->iobase = PCI_PPBIOBASE(0, iolow);
iolow = pci_read_config(dev, PCIR_IOLIMITL_1, 1);
if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
sc->iolimit = PCI_PPBIOLIMIT(
pci_read_config(dev, PCIR_IOLIMITH_1, 2), iolow);
else
sc->iolimit = PCI_PPBIOLIMIT(0, iolow);
}
/*
* Get current memory decode.
*/
static void
pcib_get_mem_decode(struct pcib_softc *sc)
{
device_t dev;
pci_addr_t pmemlow;
dev = sc->dev;
sc->membase = PCI_PPBMEMBASE(0,
pci_read_config(dev, PCIR_MEMBASE_1, 2));
sc->memlimit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
pmemlow = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
sc->pmembase = PCI_PPBMEMBASE(
pci_read_config(dev, PCIR_PMBASEH_1, 4), pmemlow);
else
sc->pmembase = PCI_PPBMEMBASE(0, pmemlow);
pmemlow = pci_read_config(dev, PCIR_PMLIMITL_1, 2);
if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
sc->pmemlimit = PCI_PPBMEMLIMIT(
pci_read_config(dev, PCIR_PMLIMITH_1, 4), pmemlow);
else
sc->pmemlimit = PCI_PPBMEMLIMIT(0, pmemlow);
}
/*
* Restore previous I/O decode.
*/
static void
pcib_set_io_decode(struct pcib_softc *sc)
{
device_t dev;
uint32_t iohi;
dev = sc->dev;
iohi = sc->iobase >> 16;
if (iohi > 0)
pci_write_config(dev, PCIR_IOBASEH_1, iohi, 2);
pci_write_config(dev, PCIR_IOBASEL_1, sc->iobase >> 8, 1);
iohi = sc->iolimit >> 16;
if (iohi > 0)
pci_write_config(dev, PCIR_IOLIMITH_1, iohi, 2);
pci_write_config(dev, PCIR_IOLIMITL_1, sc->iolimit >> 8, 1);
}
/*
* Restore previous memory decode.
*/
static void
pcib_set_mem_decode(struct pcib_softc *sc)
{
device_t dev;
pci_addr_t pmemhi;
dev = sc->dev;
pci_write_config(dev, PCIR_MEMBASE_1, sc->membase >> 16, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, sc->memlimit >> 16, 2);
pmemhi = sc->pmembase >> 32;
if (pmemhi > 0)
pci_write_config(dev, PCIR_PMBASEH_1, pmemhi, 4);
pci_write_config(dev, PCIR_PMBASEL_1, sc->pmembase >> 16, 2);
pmemhi = sc->pmemlimit >> 32;
if (pmemhi > 0)
pci_write_config(dev, PCIR_PMLIMITH_1, pmemhi, 4);
pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmemlimit >> 16, 2);
}
#endif
#ifdef PCI_HP
/*
* PCI-express HotPlug support.
*/
static int pci_enable_pcie_hp = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_pcie_hp, CTLFLAG_RDTUN,
&pci_enable_pcie_hp, 0,
"Enable support for native PCI-express HotPlug.");
static void
pcib_probe_hotplug(struct pcib_softc *sc)
{
device_t dev;
uint32_t link_cap;
uint16_t link_sta, slot_sta;
if (!pci_enable_pcie_hp)
return;
dev = sc->dev;
if (pci_find_cap(dev, PCIY_EXPRESS, NULL) != 0)
return;
if (!(pcie_read_config(dev, PCIER_FLAGS, 2) & PCIEM_FLAGS_SLOT))
return;
sc->pcie_slot_cap = pcie_read_config(dev, PCIER_SLOT_CAP, 4);
if ((sc->pcie_slot_cap & PCIEM_SLOT_CAP_HPC) == 0)
return;
link_cap = pcie_read_config(dev, PCIER_LINK_CAP, 4);
if ((link_cap & PCIEM_LINK_CAP_DL_ACTIVE) == 0)
return;
/*
* Some devices report that they have an MRL when they actually
* do not. Since they always report that the MRL is open, child
* devices would be ignored. Try to detect these devices and
* ignore their claim of HotPlug support.
*
* If there is an open MRL but the Data Link Layer is active,
* the MRL is not real.
*/
if ((sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP) != 0) {
link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
if ((slot_sta & PCIEM_SLOT_STA_MRLSS) != 0 &&
(link_sta & PCIEM_LINK_STA_DL_ACTIVE) != 0) {
return;
}
}
/*
* Now that we're sure we want to do hot plug, ask the
* firmware, if any, if that's OK.
*/
if (pcib_request_feature(dev, PCI_FEATURE_HP) != 0) {
if (bootverbose)
device_printf(dev, "Unable to activate hot plug feature.\n");
return;
}
sc->flags |= PCIB_HOTPLUG;
}
/*
* Send a HotPlug command to the slot control register. If this slot
* uses command completion interrupts and a previous command is still
* in progress, then the command is dropped. Once the previous
* command completes or times out, pcib_pcie_hotplug_update() will be
* invoked to post a new command based on the slot's state at that
* time.
*/
static void
pcib_pcie_hotplug_command(struct pcib_softc *sc, uint16_t val, uint16_t mask)
{
device_t dev;
uint16_t ctl, new;
dev = sc->dev;
if (sc->flags & PCIB_HOTPLUG_CMD_PENDING)
return;
ctl = pcie_read_config(dev, PCIER_SLOT_CTL, 2);
new = (ctl & ~mask) | val;
if (new == ctl)
return;
if (bootverbose)
device_printf(dev, "HotPlug command: %04x -> %04x\n", ctl, new);
pcie_write_config(dev, PCIER_SLOT_CTL, new, 2);
if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS) &&
(ctl & new) & PCIEM_SLOT_CTL_CCIE) {
sc->flags |= PCIB_HOTPLUG_CMD_PENDING;
if (!cold)
callout_reset(&sc->pcie_cc_timer, hz,
pcib_pcie_cc_timeout, sc);
}
}
static void
pcib_pcie_hotplug_command_completed(struct pcib_softc *sc)
{
device_t dev;
dev = sc->dev;
if (bootverbose)
device_printf(dev, "Command Completed\n");
if (!(sc->flags & PCIB_HOTPLUG_CMD_PENDING))
return;
callout_stop(&sc->pcie_cc_timer);
sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
wakeup(sc);
}
/*
* Returns true if a card is fully inserted from the user's
* perspective. It may not yet be ready for access, but the driver
* can now start enabling access if necessary.
*/
static bool
pcib_hotplug_inserted(struct pcib_softc *sc)
{
/* Pretend the card isn't present if a detach is forced. */
if (sc->flags & PCIB_DETACHING)
return (false);
/* Card must be present in the slot. */
if ((sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS) == 0)
return (false);
/* A power fault implicitly turns off power to the slot. */
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD)
return (false);
/* If the MRL is disengaged, the slot is powered off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP &&
(sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS) != 0)
return (false);
return (true);
}
/*
* Returns -1 if the card is fully inserted, powered, and ready for
* access. Otherwise, returns 0.
*/
static int
pcib_hotplug_present(struct pcib_softc *sc)
{
/* Card must be inserted. */
if (!pcib_hotplug_inserted(sc))
return (0);
/*
* Require the Electromechanical Interlock to be engaged if
* present.
*/
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP &&
(sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) == 0)
return (0);
/* Require the Data Link Layer to be active. */
if (!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE))
return (0);
return (-1);
}
static void
pcib_pcie_hotplug_update(struct pcib_softc *sc, uint16_t val, uint16_t mask,
bool schedule_task)
{
bool card_inserted, ei_engaged;
/* Clear DETACHING if Presence Detect has cleared. */
if ((sc->pcie_slot_sta & (PCIEM_SLOT_STA_PDC | PCIEM_SLOT_STA_PDS)) ==
PCIEM_SLOT_STA_PDC)
sc->flags &= ~PCIB_DETACHING;
card_inserted = pcib_hotplug_inserted(sc);
/* Turn the power indicator on if a card is inserted. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PIP) {
mask |= PCIEM_SLOT_CTL_PIC;
if (card_inserted)
val |= PCIEM_SLOT_CTL_PI_ON;
else if (sc->flags & PCIB_DETACH_PENDING)
val |= PCIEM_SLOT_CTL_PI_BLINK;
else
val |= PCIEM_SLOT_CTL_PI_OFF;
}
/* Turn the power on via the Power Controller if a card is inserted. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) {
mask |= PCIEM_SLOT_CTL_PCC;
if (card_inserted)
val |= PCIEM_SLOT_CTL_PC_ON;
else
val |= PCIEM_SLOT_CTL_PC_OFF;
}
/*
* If a card is inserted, enable the Electromechanical
* Interlock. If a card is not inserted (or we are in the
* process of detaching), disable the Electromechanical
* Interlock.
*/
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP) {
mask |= PCIEM_SLOT_CTL_EIC;
ei_engaged = (sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) != 0;
if (card_inserted != ei_engaged)
val |= PCIEM_SLOT_CTL_EIC;
}
/*
* Start a timer to see if the Data Link Layer times out.
* Note that we only start the timer if Presence Detect or MRL Sensor
* changed on this interrupt. Stop any scheduled timer if
* the Data Link Layer is active.
*/
if (card_inserted &&
!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE) &&
sc->pcie_slot_sta &
(PCIEM_SLOT_STA_MRLSC | PCIEM_SLOT_STA_PDC)) {
if (cold)
device_printf(sc->dev,
"Data Link Layer inactive\n");
else
callout_reset(&sc->pcie_dll_timer, hz,
pcib_pcie_dll_timeout, sc);
} else if (sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE)
callout_stop(&sc->pcie_dll_timer);
pcib_pcie_hotplug_command(sc, val, mask);
/*
* During attach the child "pci" device is added synchronously;
* otherwise, the task is scheduled to manage the child
* device.
*/
if (schedule_task &&
(pcib_hotplug_present(sc) != 0) != (sc->child != NULL))
taskqueue_enqueue(taskqueue_thread, &sc->pcie_hp_task);
}
static void
pcib_pcie_intr_hotplug(void *arg)
{
struct pcib_softc *sc;
device_t dev;
sc = arg;
dev = sc->dev;
sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
/* Clear the events just reported. */
pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
if (bootverbose)
device_printf(dev, "HotPlug interrupt: %#x\n",
sc->pcie_slot_sta);
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_ABP) {
if (sc->flags & PCIB_DETACH_PENDING) {
device_printf(dev,
"Attention Button Pressed: Detach Cancelled\n");
sc->flags &= ~PCIB_DETACH_PENDING;
callout_stop(&sc->pcie_ab_timer);
} else {
device_printf(dev,
"Attention Button Pressed: Detaching in 5 seconds\n");
sc->flags |= PCIB_DETACH_PENDING;
callout_reset(&sc->pcie_ab_timer, 5 * hz,
pcib_pcie_ab_timeout, sc);
}
}
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD)
device_printf(dev, "Power Fault Detected\n");
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSC)
device_printf(dev, "MRL Sensor Changed to %s\n",
sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS ? "open" :
"closed");
if (bootverbose && sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC)
device_printf(dev, "Presence Detect Changed to %s\n",
sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS ? "card present" :
"empty");
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_CC)
pcib_pcie_hotplug_command_completed(sc);
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_DLLSC) {
sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
if (bootverbose)
device_printf(dev,
"Data Link Layer State Changed to %s\n",
sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE ?
"active" : "inactive");
}
pcib_pcie_hotplug_update(sc, 0, 0, true);
}
static void
pcib_pcie_hotplug_task(void *context, int pending)
{
struct pcib_softc *sc;
device_t dev;
sc = context;
mtx_lock(&Giant);
dev = sc->dev;
if (pcib_hotplug_present(sc) != 0) {
if (sc->child == NULL) {
sc->child = device_add_child(dev, "pci", -1);
bus_generic_attach(dev);
}
} else {
if (sc->child != NULL) {
if (device_delete_child(dev, sc->child) == 0)
sc->child = NULL;
}
}
mtx_unlock(&Giant);
}
static void
pcib_pcie_ab_timeout(void *arg)
{
struct pcib_softc *sc;
- device_t dev;
sc = arg;
- dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
if (sc->flags & PCIB_DETACH_PENDING) {
sc->flags |= PCIB_DETACHING;
sc->flags &= ~PCIB_DETACH_PENDING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
}
}
static void
pcib_pcie_cc_timeout(void *arg)
{
struct pcib_softc *sc;
device_t dev;
uint16_t sta;
sc = arg;
dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
if (!(sta & PCIEM_SLOT_STA_CC)) {
device_printf(dev,
"HotPlug Command Timed Out - forcing detach\n");
sc->flags &= ~(PCIB_HOTPLUG_CMD_PENDING | PCIB_DETACH_PENDING);
sc->flags |= PCIB_DETACHING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
} else {
device_printf(dev,
"Missed HotPlug interrupt waiting for Command Completion\n");
pcib_pcie_intr_hotplug(sc);
}
}
static void
pcib_pcie_dll_timeout(void *arg)
{
struct pcib_softc *sc;
device_t dev;
uint16_t sta;
sc = arg;
dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
if (!(sta & PCIEM_LINK_STA_DL_ACTIVE)) {
device_printf(dev,
"Timed out waiting for Data Link Layer Active\n");
sc->flags |= PCIB_DETACHING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
} else if (sta != sc->pcie_link_sta) {
device_printf(dev,
"Missed HotPlug interrupt waiting for DLL Active\n");
pcib_pcie_intr_hotplug(sc);
}
}
static int
pcib_alloc_pcie_irq(struct pcib_softc *sc)
{
device_t dev;
int count, error, rid;
rid = -1;
dev = sc->dev;
/*
* For simplicity, only use MSI-X if there is a single message.
* To support a device with multiple messages we would have to
* use remap intr if the MSI number is not 0.
*/
count = pci_msix_count(dev);
if (count == 1) {
error = pci_alloc_msix(dev, &count);
if (error == 0)
rid = 1;
}
if (rid < 0 && pci_msi_count(dev) > 0) {
count = 1;
error = pci_alloc_msi(dev, &count);
if (error == 0)
rid = 1;
}
if (rid < 0)
rid = 0;
sc->pcie_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
RF_ACTIVE);
if (sc->pcie_irq == NULL) {
device_printf(dev,
"Failed to allocate interrupt for PCI-e events\n");
if (rid > 0)
pci_release_msi(dev);
return (ENXIO);
}
error = bus_setup_intr(dev, sc->pcie_irq, INTR_TYPE_MISC,
NULL, pcib_pcie_intr_hotplug, sc, &sc->pcie_ihand);
if (error) {
device_printf(dev, "Failed to setup PCI-e interrupt handler\n");
bus_release_resource(dev, SYS_RES_IRQ, rid, sc->pcie_irq);
if (rid > 0)
pci_release_msi(dev);
return (error);
}
return (0);
}
static int
pcib_release_pcie_irq(struct pcib_softc *sc)
{
device_t dev;
int error;
dev = sc->dev;
error = bus_teardown_intr(dev, sc->pcie_irq, sc->pcie_ihand);
if (error)
return (error);
error = bus_free_resource(dev, SYS_RES_IRQ, sc->pcie_irq);
if (error)
return (error);
return (pci_release_msi(dev));
}
static void
pcib_setup_hotplug(struct pcib_softc *sc)
{
device_t dev;
uint16_t mask, val;
dev = sc->dev;
callout_init(&sc->pcie_ab_timer, 0);
callout_init(&sc->pcie_cc_timer, 0);
callout_init(&sc->pcie_dll_timer, 0);
TASK_INIT(&sc->pcie_hp_task, 0, pcib_pcie_hotplug_task, sc);
/* Allocate IRQ. */
if (pcib_alloc_pcie_irq(sc) != 0)
return;
sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
/* Clear any events previously pending. */
pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
/* Enable HotPlug events. */
mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
val = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE | PCIEM_SLOT_CTL_PDCE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_APB)
val |= PCIEM_SLOT_CTL_ABPE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP)
val |= PCIEM_SLOT_CTL_PFDE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP)
val |= PCIEM_SLOT_CTL_MRLSCE;
if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS))
val |= PCIEM_SLOT_CTL_CCIE;
/* Turn the attention indicator off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
mask |= PCIEM_SLOT_CTL_AIC;
val |= PCIEM_SLOT_CTL_AI_OFF;
}
pcib_pcie_hotplug_update(sc, val, mask, false);
}
static int
pcib_detach_hotplug(struct pcib_softc *sc)
{
uint16_t mask, val;
int error;
/* Disable the card in the slot and force it to detach. */
if (sc->flags & PCIB_DETACH_PENDING) {
sc->flags &= ~PCIB_DETACH_PENDING;
callout_stop(&sc->pcie_ab_timer);
}
sc->flags |= PCIB_DETACHING;
if (sc->flags & PCIB_HOTPLUG_CMD_PENDING) {
callout_stop(&sc->pcie_cc_timer);
tsleep(sc, 0, "hpcmd", hz);
sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
}
/* Disable HotPlug events. */
mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
val = 0;
/* Turn the attention indicator off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
mask |= PCIEM_SLOT_CTL_AIC;
val |= PCIEM_SLOT_CTL_AI_OFF;
}
pcib_pcie_hotplug_update(sc, val, mask, false);
error = pcib_release_pcie_irq(sc);
if (error)
return (error);
taskqueue_drain(taskqueue_thread, &sc->pcie_hp_task);
callout_drain(&sc->pcie_ab_timer);
callout_drain(&sc->pcie_cc_timer);
callout_drain(&sc->pcie_dll_timer);
return (0);
}
#endif
/*
* Get current bridge configuration.
*/
static void
pcib_cfg_save(struct pcib_softc *sc)
{
#ifndef NEW_PCIB
device_t dev;
uint16_t command;
dev = sc->dev;
command = pci_read_config(dev, PCIR_COMMAND, 2);
if (command & PCIM_CMD_PORTEN)
pcib_get_io_decode(sc);
if (command & PCIM_CMD_MEMEN)
pcib_get_mem_decode(sc);
#endif
}
/*
* Restore previous bridge configuration.
*/
static void
pcib_cfg_restore(struct pcib_softc *sc)
{
- device_t dev;
#ifndef NEW_PCIB
uint16_t command;
#endif
- dev = sc->dev;
#ifdef NEW_PCIB
pcib_write_windows(sc, WIN_IO | WIN_MEM | WIN_PMEM);
#else
- command = pci_read_config(dev, PCIR_COMMAND, 2);
+ command = pci_read_config(sc->dev, PCIR_COMMAND, 2);
if (command & PCIM_CMD_PORTEN)
pcib_set_io_decode(sc);
if (command & PCIM_CMD_MEMEN)
pcib_set_mem_decode(sc);
#endif
}
/*
* Generic device interface
*/
static int
pcib_probe(device_t dev)
{
if ((pci_get_class(dev) == PCIC_BRIDGE) &&
(pci_get_subclass(dev) == PCIS_BRIDGE_PCI)) {
device_set_desc(dev, "PCI-PCI bridge");
return(-10000);
}
return(ENXIO);
}
void
pcib_attach_common(device_t dev)
{
struct pcib_softc *sc;
struct sysctl_ctx_list *sctx;
struct sysctl_oid *soid;
int comma;
sc = device_get_softc(dev);
sc->dev = dev;
/*
* Get current bridge configuration.
*/
sc->domain = pci_get_domain(dev);
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1);
sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
#endif
sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2);
pcib_cfg_save(sc);
/*
* The primary bus register should always be the bus of the
* parent.
*/
sc->pribus = pci_get_bus(dev);
pci_write_config(dev, PCIR_PRIBUS_1, sc->pribus, 1);
/*
* Setup sysctl reporting nodes
*/
sctx = device_get_sysctl_ctx(dev);
soid = device_get_sysctl_tree(dev);
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "domain",
CTLFLAG_RD, &sc->domain, 0, "Domain number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "pribus",
CTLFLAG_RD, &sc->pribus, 0, "Primary bus number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "secbus",
CTLFLAG_RD, &sc->bus.sec, 0, "Secondary bus number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "subbus",
CTLFLAG_RD, &sc->bus.sub, 0, "Subordinate bus number");
/*
* Quirk handling.
*/
switch (pci_get_devid(dev)) {
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
case 0x12258086: /* Intel 82454KX/GX (Orion) */
{
uint8_t supbus;
supbus = pci_read_config(dev, 0x41, 1);
if (supbus != 0xff) {
sc->bus.sec = supbus + 1;
sc->bus.sub = supbus + 1;
}
break;
}
#endif
/*
* The i82380FB mobile docking controller is a PCI-PCI bridge,
* and it is a subtractive bridge. However, the ProgIf is wrong
* so the normal setting of PCIB_SUBTRACTIVE bit doesn't
* happen. There are also Toshiba and Cavium ThunderX bridges
* that behave this way.
*/
case 0xa002177d: /* Cavium ThunderX */
case 0x124b8086: /* Intel 82380FB Mobile */
case 0x060513d7: /* Toshiba ???? */
sc->flags |= PCIB_SUBTRACTIVE;
break;
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
/* Compaq R3000 BIOS sets wrong subordinate bus number. */
case 0x00dd10de:
{
char *cp;
if ((cp = kern_getenv("smbios.planar.maker")) == NULL)
break;
if (strncmp(cp, "Compal", 6) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if ((cp = kern_getenv("smbios.planar.product")) == NULL)
break;
if (strncmp(cp, "08A0", 4) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if (sc->bus.sub < 0xa) {
pci_write_config(dev, PCIR_SUBBUS_1, 0xa, 1);
sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
}
break;
}
#endif
}
if (pci_msi_device_blacklisted(dev))
sc->flags |= PCIB_DISABLE_MSI;
if (pci_msix_device_blacklisted(dev))
sc->flags |= PCIB_DISABLE_MSIX;
/*
* Intel 815, 845 and other chipsets say they are PCI-PCI bridges,
* but have a ProgIF of 0x80. The 82801 family (AA, AB, BAM/CAM,
* BA/CA/DB and E) PCI bridges are HUB-PCI bridges, in Intelese.
* This means they act as if they were subtractively decoding
* bridges and pass all transactions. Mark them and real ProgIf 1
* parts as subtractive.
*/
if ((pci_get_devid(dev) & 0xff00ffff) == 0x24008086 ||
pci_read_config(dev, PCIR_PROGIF, 1) == PCIP_BRIDGE_PCI_SUBTRACTIVE)
sc->flags |= PCIB_SUBTRACTIVE;
#ifdef PCI_HP
pcib_probe_hotplug(sc);
#endif
#ifdef NEW_PCIB
#ifdef PCI_RES_BUS
pcib_setup_secbus(dev, &sc->bus, 1);
#endif
pcib_probe_windows(sc);
#endif
#ifdef PCI_HP
if (sc->flags & PCIB_HOTPLUG)
pcib_setup_hotplug(sc);
#endif
if (bootverbose) {
device_printf(dev, " domain %d\n", sc->domain);
device_printf(dev, " secondary bus %d\n", sc->bus.sec);
device_printf(dev, " subordinate bus %d\n", sc->bus.sub);
#ifdef NEW_PCIB
if (pcib_is_window_open(&sc->io))
device_printf(dev, " I/O decode 0x%jx-0x%jx\n",
(uintmax_t)sc->io.base, (uintmax_t)sc->io.limit);
if (pcib_is_window_open(&sc->mem))
device_printf(dev, " memory decode 0x%jx-0x%jx\n",
(uintmax_t)sc->mem.base, (uintmax_t)sc->mem.limit);
if (pcib_is_window_open(&sc->pmem))
device_printf(dev, " prefetched decode 0x%jx-0x%jx\n",
(uintmax_t)sc->pmem.base, (uintmax_t)sc->pmem.limit);
#else
if (pcib_is_io_open(sc))
device_printf(dev, " I/O decode 0x%x-0x%x\n",
sc->iobase, sc->iolimit);
if (pcib_is_nonprefetch_open(sc))
device_printf(dev, " memory decode 0x%jx-0x%jx\n",
(uintmax_t)sc->membase, (uintmax_t)sc->memlimit);
if (pcib_is_prefetch_open(sc))
device_printf(dev, " prefetched decode 0x%jx-0x%jx\n",
(uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
#endif
if (sc->bridgectl & (PCIB_BCR_ISA_ENABLE | PCIB_BCR_VGA_ENABLE) ||
sc->flags & PCIB_SUBTRACTIVE) {
device_printf(dev, " special decode ");
comma = 0;
if (sc->bridgectl & PCIB_BCR_ISA_ENABLE) {
printf("ISA");
comma = 1;
}
if (sc->bridgectl & PCIB_BCR_VGA_ENABLE) {
printf("%sVGA", comma ? ", " : "");
comma = 1;
}
if (sc->flags & PCIB_SUBTRACTIVE)
printf("%ssubtractive", comma ? ", " : "");
printf("\n");
}
}
/*
* Always enable busmastering on bridges so that transactions
* initiated on the secondary bus are passed through to the
* primary bus.
*/
pci_enable_busmaster(dev);
}
#ifdef PCI_HP
static int
pcib_present(struct pcib_softc *sc)
{
if (sc->flags & PCIB_HOTPLUG)
return (pcib_hotplug_present(sc) != 0);
return (1);
}
#endif
int
pcib_attach_child(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->bus.sec == 0) {
/* no secondary bus; we should have fixed this */
return(0);
}
#ifdef PCI_HP
if (!pcib_present(sc)) {
/* An empty HotPlug slot, so don't add a PCI bus yet. */
return (0);
}
#endif
sc->child = device_add_child(dev, "pci", -1);
return (bus_generic_attach(dev));
}
int
pcib_attach(device_t dev)
{
pcib_attach_common(dev);
return (pcib_attach_child(dev));
}
int
pcib_detach(device_t dev)
{
#if defined(PCI_HP) || defined(NEW_PCIB)
struct pcib_softc *sc;
#endif
int error;
#if defined(PCI_HP) || defined(NEW_PCIB)
sc = device_get_softc(dev);
#endif
error = bus_generic_detach(dev);
if (error)
return (error);
#ifdef PCI_HP
if (sc->flags & PCIB_HOTPLUG) {
error = pcib_detach_hotplug(sc);
if (error)
return (error);
}
#endif
error = device_delete_children(dev);
if (error)
return (error);
#ifdef NEW_PCIB
pcib_free_windows(sc);
#ifdef PCI_RES_BUS
pcib_free_secbus(dev, &sc->bus);
#endif
#endif
return (0);
}
int
pcib_suspend(device_t dev)
{
pcib_cfg_save(device_get_softc(dev));
return (bus_generic_suspend(dev));
}
int
pcib_resume(device_t dev)
{
pcib_cfg_restore(device_get_softc(dev));
return (bus_generic_resume(dev));
}
void
pcib_bridge_init(device_t dev)
{
pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
pci_write_config(dev, PCIR_IOBASEH_1, 0xffff, 2);
pci_write_config(dev, PCIR_IOLIMITL_1, 0, 1);
pci_write_config(dev, PCIR_IOLIMITH_1, 0, 2);
pci_write_config(dev, PCIR_MEMBASE_1, 0xffff, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, 0, 2);
pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
pci_write_config(dev, PCIR_PMBASEH_1, 0xffffffff, 4);
pci_write_config(dev, PCIR_PMLIMITL_1, 0, 2);
pci_write_config(dev, PCIR_PMLIMITH_1, 0, 4);
}
int
pcib_child_present(device_t dev, device_t child)
{
#ifdef PCI_HP
struct pcib_softc *sc = device_get_softc(dev);
int retval;
retval = bus_child_present(dev);
if (retval != 0 && sc->flags & PCIB_HOTPLUG)
retval = pcib_hotplug_present(sc);
return (retval);
#else
return (bus_child_present(dev));
#endif
}
int
pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct pcib_softc *sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_DOMAIN:
*result = sc->domain;
return(0);
case PCIB_IVAR_BUS:
*result = sc->bus.sec;
return(0);
}
return(ENOENT);
}
int
pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
switch (which) {
case PCIB_IVAR_DOMAIN:
return(EINVAL);
case PCIB_IVAR_BUS:
return(EINVAL);
}
return(ENOENT);
}
#ifdef NEW_PCIB
/*
* Attempt to allocate a resource from the existing resources assigned
* to a window.
*/
static struct resource *
pcib_suballoc_resource(struct pcib_softc *sc, struct pcib_window *w,
device_t child, int type, int *rid, rman_res_t start, rman_res_t end,
rman_res_t count, u_int flags)
{
struct resource *res;
if (!pcib_is_window_open(w))
return (NULL);
res = rman_reserve_resource(&w->rman, start, end, count,
flags & ~RF_ACTIVE, child);
if (res == NULL)
return (NULL);
if (bootverbose)
device_printf(sc->dev,
"allocated %s range (%#jx-%#jx) for rid %x of %s\n",
w->name, rman_get_start(res), rman_get_end(res), *rid,
pcib_child_name(child));
rman_set_rid(res, *rid);
/*
* If the resource should be active, pass that request up the
* tree. This assumes the parent drivers can handle
* activating sub-allocated resources.
*/
if (flags & RF_ACTIVE) {
if (bus_activate_resource(child, type, *rid, res) != 0) {
rman_release_resource(res);
return (NULL);
}
}
return (res);
}
/* Allocate a fresh resource range for an unconfigured window. */
static int
pcib_alloc_new_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
rman_res_t base, limit, wmask;
int rid;
/*
* If this is an I/O window on a bridge with ISA enable set
* and the start address is below 64k, then try to allocate an
* initial window of 0x1000 bytes long starting at address
* 0xf000 and walking down. Note that if the original request
* was larger than the non-aliased range size of 0x100 our
* caller would have raised the start address up to 64k
* already.
*/
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
start < 65536) {
for (base = 0xf000; (long)base >= 0; base -= 0x1000) {
limit = base + 0xfff;
/*
* Skip ranges that wouldn't work for the
* original request. Note that the actual
* window that overlaps are the non-alias
* ranges within [base, limit], so this isn't
* quite a simple comparison.
*/
if (start + count > limit - 0x400)
continue;
if (base == 0) {
/*
* The first open region for the window at
* 0 is 0x400-0x4ff.
*/
if (end - count + 1 < 0x400)
continue;
} else {
if (end - count + 1 < base)
continue;
}
if (pcib_alloc_nonisa_ranges(sc, base, limit) == 0) {
w->base = base;
w->limit = limit;
return (0);
}
}
return (ENOSPC);
}
wmask = ((rman_res_t)1 << w->step) - 1;
if (RF_ALIGNMENT(flags) < w->step) {
flags &= ~RF_ALIGNMENT_MASK;
flags |= RF_ALIGNMENT_LOG2(w->step);
}
start &= ~wmask;
end |= wmask;
count = roundup2(count, (rman_res_t)1 << w->step);
rid = w->reg;
res = bus_alloc_resource(sc->dev, type, &rid, start, end, count,
flags & ~RF_ACTIVE);
if (res == NULL)
return (ENOSPC);
pcib_add_window_resources(w, &res, 1);
pcib_activate_window(sc, type);
w->base = rman_get_start(res);
w->limit = rman_get_end(res);
return (0);
}
/* Try to expand an existing window to the requested base and limit. */
static int
pcib_expand_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t base, rman_res_t limit)
{
struct resource *res;
int error, i, force_64k_base;
KASSERT(base <= w->base && limit >= w->limit,
("attempting to shrink window"));
/*
* XXX: pcib_grow_window() doesn't try to do this anyway and
* the error handling for all the edge cases would be tedious.
*/
KASSERT(limit == w->limit || base == w->base,
("attempting to grow both ends of a window"));
/*
* Yet more special handling for requests to expand an I/O
* window behind an ISA-enabled bridge. Since I/O windows
* have to grow in 0x1000 increments and the end of the 0xffff
* range is an alias, growing a window below 64k will always
* result in allocating new resources and never adjusting an
* existing resource.
*/
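/*
* For instance (hypothetical values), growing an I/O window that
* currently spans 0xe000-0xefff down to a new base of 0xd000 does not
* adjust the existing resource; pcib_alloc_nonisa_ranges() is asked
* for the non-alias sub-ranges within 0xd000-0xdfff instead, and only
* on success is w->base moved down.
*/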
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
(limit <= 65535 || (base <= 65535 && base != w->base))) {
KASSERT(limit == w->limit || limit <= 65535,
("attempting to grow both ends across 64k ISA alias"));
if (base != w->base)
error = pcib_alloc_nonisa_ranges(sc, base, w->base - 1);
else
error = pcib_alloc_nonisa_ranges(sc, w->limit + 1,
limit);
if (error == 0) {
w->base = base;
w->limit = limit;
}
return (error);
}
/*
* Find the existing resource to adjust. Usually there is only one,
* but for an ISA-enabled bridge we might be growing the I/O window
* above 64k and need to find the existing resource that maps all
* of the area above 64k.
*/
for (i = 0; i < w->count; i++) {
if (rman_get_end(w->res[i]) == w->limit)
break;
}
KASSERT(i != w->count, ("did not find existing resource"));
res = w->res[i];
/*
* Usually the resource we found should match the window's
* existing range. The one exception is the ISA-enabled case
* mentioned above in which case the resource should start at
* 64k.
*/
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
w->base <= 65535) {
KASSERT(rman_get_start(res) == 65536,
("existing resource mismatch"));
force_64k_base = 1;
} else {
KASSERT(w->base == rman_get_start(res),
("existing resource mismatch"));
force_64k_base = 0;
}
error = bus_adjust_resource(sc->dev, type, res, force_64k_base ?
rman_get_start(res) : base, limit);
if (error)
return (error);
/* Add the newly allocated region to the resource manager. */
if (w->base != base) {
error = rman_manage_region(&w->rman, base, w->base - 1);
w->base = base;
} else {
error = rman_manage_region(&w->rman, w->limit + 1, limit);
w->limit = limit;
}
if (error) {
if (bootverbose)
device_printf(sc->dev,
"failed to expand %s resource manager\n", w->name);
(void)bus_adjust_resource(sc->dev, type, res, force_64k_base ?
rman_get_start(res) : w->base, w->limit);
}
return (error);
}
/*
* Attempt to grow a window to make room for a given resource request.
*/
static int
pcib_grow_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
rman_res_t align, start_free, end_free, front, back, wmask;
int error;
/*
* Clamp the desired resource range to the maximum address
* this window supports. Reject impossible requests.
*
* For I/O port requests behind a bridge with the ISA enable
* bit set, force large allocations to start above 64k.
*/
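/*
* Example with made-up numbers: with ISA enable set, an I/O request
* for 0x200 bytes starting at 0x1000 is bumped to start at 0x10000,
* since no 0x200-byte contiguous span below 64k is free of ISA
* aliases.
*/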
if (!w->valid)
return (EINVAL);
if (sc->bridgectl & PCIB_BCR_ISA_ENABLE && count > 0x100 &&
start < 65536)
start = 65536;
if (end > w->rman.rm_end)
end = w->rman.rm_end;
if (start + count - 1 > end || start + count < start)
return (EINVAL);
wmask = ((rman_res_t)1 << w->step) - 1;
/*
* If there is no resource at all, just try to allocate enough
* aligned space for this resource.
*/
if (w->res == NULL) {
error = pcib_alloc_new_window(sc, w, type, start, end, count,
flags);
if (error) {
if (bootverbose)
device_printf(sc->dev,
"failed to allocate initial %s window (%#jx-%#jx,%#jx)\n",
w->name, start, end, count);
return (error);
}
if (bootverbose)
device_printf(sc->dev,
"allocated initial %s window of %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
goto updatewin;
}
/*
* See if growing the window would help. Compute the minimum
* amount of address space needed on both the front and back
* ends of the existing window to satisfy the allocation.
*
* For each end, build a candidate region adjusting for the
* required alignment, etc. If there is a free region at the
* edge of the window, grow from the inner edge of the free
* region. Otherwise grow from the window boundary.
*
* Growing an I/O window below 64k for a bridge with the ISA
* enable bit doesn't require any special magic as the step
* size of an I/O window (1k) always includes multiple
* non-alias ranges when it is grown in either direction.
*
* XXX: Special case: if w->res is completely empty and the
* request size is larger than w->res, we should find the
* optimal aligned buffer containing w->res and allocate that.
*/
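/*
* Worked example with made-up numbers: assume a memory window of
* 0xc0100000-0xc01fffff (w->step = 20, so wmask = 0xfffff) with no
* free space left inside it, and a 1MB request for anywhere in
* 0xc0000000-0xdfffffff with 1MB alignment.  On the front side,
* end_free ends up at w->base - 1 = 0xc00fffff, so front becomes
* 0xc0000000, which is in bounds and already wmask-aligned, and is
* then converted into the grow amount w->base - front = 0x100000.
* The back side works out to 0x100000 as well, and the loop below
* tries the smaller (here equal) candidate first.
*/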
if (bootverbose)
device_printf(sc->dev,
"attempting to grow %s window for (%#jx-%#jx,%#jx)\n",
w->name, start, end, count);
align = (rman_res_t)1 << RF_ALIGNMENT(flags);
if (start < w->base) {
if (rman_first_free_region(&w->rman, &start_free, &end_free) !=
0 || start_free != w->base)
end_free = w->base;
if (end_free > end)
end_free = end + 1;
/* Move end_free down until it is properly aligned. */
end_free &= ~(align - 1);
end_free--;
front = end_free - (count - 1);
/*
* The resource would now be allocated at (front,
* end_free). Ensure that fits in the (start, end)
* bounds. end_free is checked above. If 'front' is
* ok, ensure it is properly aligned for this window.
* Also check for underflow.
*/
if (front >= start && front <= end_free) {
if (bootverbose)
printf("\tfront candidate range: %#jx-%#jx\n",
front, end_free);
front &= ~wmask;
front = w->base - front;
} else
front = 0;
} else
front = 0;
if (end > w->limit) {
if (rman_last_free_region(&w->rman, &start_free, &end_free) !=
0 || end_free != w->limit)
start_free = w->limit + 1;
if (start_free < start)
start_free = start;
/* Move start_free up until it is properly aligned. */
start_free = roundup2(start_free, align);
back = start_free + count - 1;
/*
* The resource would now be allocated at (start_free,
* back). Ensure that fits in the (start, end)
* bounds. start_free is checked above. If 'back' is
* ok, ensure it is properly aligned for this window.
* Also check for overflow.
*/
if (back <= end && start_free <= back) {
if (bootverbose)
printf("\tback candidate range: %#jx-%#jx\n",
start_free, back);
back |= wmask;
back -= w->limit;
} else
back = 0;
} else
back = 0;
/*
* Try to allocate the smallest needed region first.
* If that fails, fall back to the other region.
*/
error = ENOSPC;
while (front != 0 || back != 0) {
if (front != 0 && (front <= back || back == 0)) {
error = pcib_expand_window(sc, w, type, w->base - front,
w->limit);
if (error == 0)
break;
front = 0;
} else {
error = pcib_expand_window(sc, w, type, w->base,
w->limit + back);
if (error == 0)
break;
back = 0;
}
}
if (error)
return (error);
if (bootverbose)
device_printf(sc->dev, "grew %s window to %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
updatewin:
/* Write the new window. */
KASSERT((w->base & wmask) == 0, ("start address is not aligned"));
KASSERT((w->limit & wmask) == wmask, ("end address is not aligned"));
pcib_write_windows(sc, w->mask);
return (0);
}
/*
* We have to trap resource allocation requests and ensure that the bridge
* is set up to handle them, or is at least capable of doing so.
*/
struct resource *
pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct pcib_softc *sc;
struct resource *r;
sc = device_get_softc(dev);
/*
* VGA resources are decoded iff the VGA enable bit is set in
* the bridge control register. VGA resources do not fall into
* the resource windows and are passed up to the parent.
*/
if ((type == SYS_RES_IOPORT && pci_is_vga_ioport_range(start, end)) ||
(type == SYS_RES_MEMORY && pci_is_vga_memory_range(start, end))) {
if (sc->bridgectl & PCIB_BCR_VGA_ENABLE)
return (bus_generic_alloc_resource(dev, child, type,
rid, start, end, count, flags));
else
return (NULL);
}
switch (type) {
#ifdef PCI_RES_BUS
case PCI_RES_BUS:
return (pcib_alloc_subbus(&sc->bus, child, rid, start, end,
count, flags));
#endif
case SYS_RES_IOPORT:
if (pcib_is_isa_range(sc, start, end, count))
return (NULL);
r = pcib_suballoc_resource(sc, &sc->io, child, type, rid, start,
end, count, flags);
if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
break;
if (pcib_grow_window(sc, &sc->io, type, start, end, count,
flags) == 0)
r = pcib_suballoc_resource(sc, &sc->io, child, type,
rid, start, end, count, flags);
break;
case SYS_RES_MEMORY:
/*
* For prefetchable resources, prefer the prefetchable
* memory window, but fall back to the regular memory
* window if that fails. Try both windows before
* attempting to grow a window in case the firmware
* has used a range in the regular memory window to
* map a prefetchable BAR.
*/
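/*
* Hypothetical example: if the firmware placed a device's
* prefetchable BAR inside the regular memory window at, say,
* 0xc0200000, the fallback suballocation from sc->mem below succeeds
* and no prefetchable window has to be grown for it.
*/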
if (flags & RF_PREFETCHABLE) {
r = pcib_suballoc_resource(sc, &sc->pmem, child, type,
rid, start, end, count, flags);
if (r != NULL)
break;
}
r = pcib_suballoc_resource(sc, &sc->mem, child, type, rid,
start, end, count, flags);
if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
break;
if (flags & RF_PREFETCHABLE) {
if (pcib_grow_window(sc, &sc->pmem, type, start, end,
count, flags) == 0) {
r = pcib_suballoc_resource(sc, &sc->pmem, child,
type, rid, start, end, count, flags);
if (r != NULL)
break;
}
}
if (pcib_grow_window(sc, &sc->mem, type, start, end, count,
flags & ~RF_PREFETCHABLE) == 0)
r = pcib_suballoc_resource(sc, &sc->mem, child, type,
rid, start, end, count, flags);
break;
default:
return (bus_generic_alloc_resource(dev, child, type, rid,
start, end, count, flags));
}
/*
* If attempts to suballocate from the window fail but this is a
* subtractive bridge, pass the request up the tree.
*/
if (sc->flags & PCIB_SUBTRACTIVE && r == NULL)
return (bus_generic_alloc_resource(dev, child, type, rid,
start, end, count, flags));
return (r);
}
int
pcib_adjust_resource(device_t bus, device_t child, int type, struct resource *r,
rman_res_t start, rman_res_t end)
{
struct pcib_softc *sc;
sc = device_get_softc(bus);
if (pcib_is_resource_managed(sc, type, r))
return (rman_adjust_resource(r, start, end));
return (bus_generic_adjust_resource(bus, child, type, r, start, end));
}
int
pcib_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pcib_softc *sc;
int error;
sc = device_get_softc(dev);
if (pcib_is_resource_managed(sc, type, r)) {
if (rman_get_flags(r) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type, rid, r);
if (error)
return (error);
}
return (rman_release_resource(r));
}
return (bus_generic_release_resource(dev, child, type, rid, r));
}
#else
/*
* We have to trap resource allocation requests and ensure that the bridge
* is set up to handle them, or is at least capable of doing so.
*/
struct resource *
pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct pcib_softc *sc = device_get_softc(dev);
const char *name, *suffix;
int ok;
/*
* Fail the allocation for this range if it's not supported.
*/
name = device_get_nameunit(child);
if (name == NULL) {
name = "";
suffix = "";
} else
suffix = " ";
switch (type) {
case SYS_RES_IOPORT:
ok = 0;
if (!pcib_is_io_open(sc))
break;
ok = (start >= sc->iobase && end <= sc->iolimit);
/*
* Make sure we allow access to VGA I/O addresses when the
* bridge has the "VGA Enable" bit set.
*/
if (!ok && pci_is_vga_ioport_range(start, end))
ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
if (!ok) {
if (start < sc->iobase)
start = sc->iobase;
if (end > sc->iolimit)
end = sc->iolimit;
if (start < end)
ok = 1;
}
} else {
ok = 1;
#if 0
/*
* If we overlap with the subtractive range, then
* pick the upper range to use.
*/
if (start < sc->iolimit && end > sc->iobase)
start = sc->iolimit + 1;
#endif
}
if (end < start) {
device_printf(dev, "ioport: end (%jx) < start (%jx)\n",
end, start);
start = 0;
end = 0;
ok = 0;
}
if (!ok) {
device_printf(dev, "%s%srequested unsupported I/O "
"range 0x%jx-0x%jx (decoding 0x%x-0x%x)\n",
name, suffix, start, end, sc->iobase, sc->iolimit);
return (NULL);
}
if (bootverbose)
device_printf(dev,
"%s%srequested I/O range 0x%jx-0x%jx: in range\n",
name, suffix, start, end);
break;
case SYS_RES_MEMORY:
ok = 0;
if (pcib_is_nonprefetch_open(sc))
ok = ok || (start >= sc->membase && end <= sc->memlimit);
if (pcib_is_prefetch_open(sc))
ok = ok || (start >= sc->pmembase && end <= sc->pmemlimit);
/*
* Make sure we allow access to VGA memory addresses when the
* bridge has the "VGA Enable" bit set.
*/
if (!ok && pci_is_vga_memory_range(start, end))
ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
if (!ok) {
ok = 1;
if (flags & RF_PREFETCHABLE) {
if (pcib_is_prefetch_open(sc)) {
if (start < sc->pmembase)
start = sc->pmembase;
if (end > sc->pmemlimit)
end = sc->pmemlimit;
} else {
ok = 0;
}
} else { /* non-prefetchable */
if (pcib_is_nonprefetch_open(sc)) {
if (start < sc->membase)
start = sc->membase;
if (end > sc->memlimit)
end = sc->memlimit;
} else {
ok = 0;
}
}
}
} else if (!ok) {
ok = 1; /* subtractive bridge: always ok */
#if 0
if (pcib_is_nonprefetch_open(sc)) {
if (start < sc->memlimit && end > sc->membase)
start = sc->memlimit + 1;
}
if (pcib_is_prefetch_open(sc)) {
if (start < sc->pmemlimit && end > sc->pmembase)
start = sc->pmemlimit + 1;
}
#endif
}
if (end < start) {
device_printf(dev, "memory: end (%jx) < start (%jx)\n",
end, start);
start = 0;
end = 0;
ok = 0;
}
if (!ok && bootverbose)
device_printf(dev,
"%s%srequested unsupported memory range %#jx-%#jx "
"(decoding %#jx-%#jx, %#jx-%#jx)\n",
name, suffix, start, end,
(uintmax_t)sc->membase, (uintmax_t)sc->memlimit,
(uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
if (!ok)
return (NULL);
if (bootverbose)
device_printf(dev,"%s%srequested memory range "
"0x%jx-0x%jx: good\n",
name, suffix, start, end);
break;
default:
break;
}
/*
* Bridge is OK decoding this resource, so pass it up.
*/
return (bus_generic_alloc_resource(dev, child, type, rid, start, end,
count, flags));
}
#endif
/*
* If ARI is enabled on this downstream port, translate the function number
* to the non-ARI slot/function. The downstream port will convert it back in
* hardware. If ARI is not enabled, slot and func are not modified.
*/
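/*
* For example (numbers made up): an ARI function number of 37 is
* passed down as slot 4, function 5 (37 == 4 * 8 + 5), which occupies
* the same RID bits a conventional device/function pair would.
*/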
static __inline void
pcib_xlate_ari(device_t pcib, int bus, int *slot, int *func)
{
struct pcib_softc *sc;
int ari_func;
sc = device_get_softc(pcib);
ari_func = *func;
if (sc->flags & PCIB_ENABLE_ARI) {
KASSERT(*slot == 0,
("Non-zero slot number with ARI enabled!"));
*slot = PCIE_ARI_SLOT(ari_func);
*func = PCIE_ARI_FUNC(ari_func);
}
}
static void
pcib_enable_ari(struct pcib_softc *sc, uint32_t pcie_pos)
{
uint32_t ctl2;
ctl2 = pci_read_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, 4);
ctl2 |= PCIEM_CTL2_ARI;
pci_write_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, ctl2, 4);
sc->flags |= PCIB_ENABLE_ARI;
}
/*
* PCIB interface.
*/
int
pcib_maxslots(device_t dev)
{
return (PCI_SLOTMAX);
}
static int
pcib_ari_maxslots(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->flags & PCIB_ENABLE_ARI)
return (PCIE_ARI_SLOTMAX);
else
return (PCI_SLOTMAX);
}
static int
pcib_ari_maxfuncs(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->flags & PCIB_ENABLE_ARI)
return (PCIE_ARI_FUNCMAX);
else
return (PCI_FUNCMAX);
}
static void
pcib_ari_decode_rid(device_t pcib, uint16_t rid, int *bus, int *slot,
int *func)
{
struct pcib_softc *sc;
sc = device_get_softc(pcib);
*bus = PCI_RID2BUS(rid);
if (sc->flags & PCIB_ENABLE_ARI) {
*slot = PCIE_ARI_RID2SLOT(rid);
*func = PCIE_ARI_RID2FUNC(rid);
} else {
*slot = PCI_RID2SLOT(rid);
*func = PCI_RID2FUNC(rid);
}
}
/*
* Our parent is a PCI bus; its parent must support the pcib interface.
*/
static uint32_t
pcib_read_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, int width)
{
#ifdef PCI_HP
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (!pcib_present(sc)) {
switch (width) {
case 2:
return (0xffff);
case 1:
return (0xff);
default:
return (0xffffffff);
}
}
#endif
pcib_xlate_ari(dev, b, &s, &f);
return(PCIB_READ_CONFIG(device_get_parent(device_get_parent(dev)), b, s,
f, reg, width));
}
static void
pcib_write_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, uint32_t val, int width)
{
#ifdef PCI_HP
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (!pcib_present(sc))
return;
#endif
pcib_xlate_ari(dev, b, &s, &f);
PCIB_WRITE_CONFIG(device_get_parent(device_get_parent(dev)), b, s, f,
reg, val, width);
}
/*
* Route an interrupt across a PCI bridge.
*/
int
pcib_route_interrupt(device_t pcib, device_t dev, int pin)
{
device_t bus;
int parent_intpin;
int intnum;
/*
* The PCI standard defines a swizzle of the child-side device/intpin to
* the parent-side intpin as follows.
*
* device = device on child bus
* child_intpin = intpin on child bus slot (0-3)
* parent_intpin = intpin on parent bus slot (0-3)
*
* parent_intpin = (device + child_intpin) % 4
*/
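/*
* Worked example: a device in slot 3 asserting INTB (pin 2) yields
* parent_intpin = (3 + (2 - 1)) % 4 = 0, i.e. it is routed as INTA
* (pin 1) on the parent bus below.
*/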
parent_intpin = (pci_get_slot(dev) + (pin - 1)) % 4;
/*
* Our parent is a PCI bus. Its parent must export the pcib interface,
* which includes the ability to route interrupts.
*/
bus = device_get_parent(pcib);
intnum = PCIB_ROUTE_INTERRUPT(device_get_parent(bus), pcib, parent_intpin + 1);
if (PCI_INTERRUPT_VALID(intnum) && bootverbose) {
device_printf(pcib, "slot %d INT%c is routed to irq %d\n",
pci_get_slot(dev), 'A' + pin - 1, intnum);
}
return(intnum);
}
/* Pass request to alloc MSI/MSI-X messages up to the parent bridge. */
int
pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
{
struct pcib_softc *sc = device_get_softc(pcib);
device_t bus;
if (sc->flags & PCIB_DISABLE_MSI)
return (ENXIO);
bus = device_get_parent(pcib);
return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount,
irqs));
}
/* Pass request to release MSI/MSI-X messages up to the parent bridge. */
int
pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs));
}
/* Pass request to alloc an MSI-X message up to the parent bridge. */
int
pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
{
struct pcib_softc *sc = device_get_softc(pcib);
device_t bus;
if (sc->flags & PCIB_DISABLE_MSIX)
return (ENXIO);
bus = device_get_parent(pcib);
return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq));
}
/* Pass request to release an MSI-X message up to the parent bridge. */
int
pcib_release_msix(device_t pcib, device_t dev, int irq)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq));
}
/* Pass request to map MSI/MSI-X message up to parent bridge. */
int
pcib_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr,
uint32_t *data)
{
device_t bus;
int error;
bus = device_get_parent(pcib);
error = PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data);
if (error)
return (error);
pci_ht_map_msi(pcib, *addr);
return (0);
}
/* Pass request for device power state up to parent bridge. */
int
pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_POWER_FOR_SLEEP(bus, dev, pstate));
}
static int
pcib_ari_enabled(device_t pcib)
{
struct pcib_softc *sc;
sc = device_get_softc(pcib);
return ((sc->flags & PCIB_ENABLE_ARI) != 0);
}
static int
pcib_ari_get_id(device_t pcib, device_t dev, enum pci_id_type type,
uintptr_t *id)
{
struct pcib_softc *sc;
device_t bus_dev;
uint8_t bus, slot, func;
if (type != PCI_ID_RID) {
bus_dev = device_get_parent(pcib);
return (PCIB_GET_ID(device_get_parent(bus_dev), dev, type, id));
}
sc = device_get_softc(pcib);
if (sc->flags & PCIB_ENABLE_ARI) {
bus = pci_get_bus(dev);
func = pci_get_function(dev);
*id = (PCI_ARI_RID(bus, func));
} else {
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
*id = (PCI_RID(bus, slot, func));
}
return (0);
}
/*
* Check that the downstream port (pcib) and the endpoint device (dev) both
* support ARI. If so, enable it and return 0; otherwise return an error.
*/
static int
pcib_try_enable_ari(device_t pcib, device_t dev)
{
struct pcib_softc *sc;
int error;
uint32_t cap2;
int ari_cap_off;
uint32_t ari_ver;
uint32_t pcie_pos;
sc = device_get_softc(pcib);
/*
* ARI is controlled in a register in the PCIe capability structure.
* If the downstream port does not have the PCIe capability structure
* then it does not support ARI.
*/
error = pci_find_cap(pcib, PCIY_EXPRESS, &pcie_pos);
if (error != 0)
return (ENODEV);
/* Check that the PCIe port advertises ARI support. */
cap2 = pci_read_config(pcib, pcie_pos + PCIER_DEVICE_CAP2, 4);
if (!(cap2 & PCIEM_CAP2_ARI))
return (ENODEV);
/*
* Check that the endpoint device advertises ARI support via the ARI
* extended capability structure.
*/
error = pci_find_extcap(dev, PCIZ_ARI, &ari_cap_off);
if (error != 0)
return (ENODEV);
/*
* Finally, check that the endpoint device supports the same version
* of ARI that we do.
*/
ari_ver = pci_read_config(dev, ari_cap_off, 4);
if (PCI_EXTCAP_VER(ari_ver) != PCIB_SUPPORTED_ARI_VER) {
if (bootverbose)
device_printf(pcib,
"Unsupported version of ARI (%d) detected\n",
PCI_EXTCAP_VER(ari_ver));
return (ENXIO);
}
pcib_enable_ari(sc, pcie_pos);
return (0);
}
int
pcib_request_feature_allow(device_t pcib, device_t dev,
enum pci_feature feature)
{
/*
* There is no host firmware to negotiate with, so we allow
* every valid feature requested.
*/
switch (feature) {
case PCI_FEATURE_AER:
case PCI_FEATURE_HP:
break;
default:
return (EINVAL);
}
return (0);
}
int
pcib_request_feature(device_t dev, enum pci_feature feature)
{
/*
* Invoke PCIB_REQUEST_FEATURE of this bridge first in case
* the firmware overrides the method of PCI-PCI bridges.
*/
return (PCIB_REQUEST_FEATURE(dev, dev, feature));
}
/*
* Pass the request to use this PCI feature up the tree. Either there is
* firmware, such as ACPI, that is using this feature and will approve (or deny)
* the request to take it over, or the platform has no such firmware, in which case
* the request will be approved. If the request is approved, the OS is expected
* to make use of the feature or render it harmless.
*/
static int
pcib_request_feature_default(device_t pcib, device_t dev,
enum pci_feature feature)
{
device_t bus;
/*
* Our parent is necessarily a pci bus. Its parent will either be
* another pci bridge (which passes it up) or a host bridge that can
* approve or reject the request.
*/
bus = device_get_parent(pcib);
return (PCIB_REQUEST_FEATURE(device_get_parent(bus), dev, feature));
}
Index: head/sys/dev/smc/if_smc_fdt.c
===================================================================
--- head/sys/dev/smc/if_smc_fdt.c (revision 327172)
+++ head/sys/dev/smc/if_smc_fdt.c (revision 327173)
@@ -1,135 +1,126 @@
/*-
* Copyright (c) 2008 Benno Rice
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <dev/smc/if_smcvar.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include "miibus_if.h"
static int smc_fdt_probe(device_t);
static int smc_fdt_attach(device_t);
static int smc_fdt_detach(device_t);
static int
smc_fdt_probe(device_t dev)
{
struct smc_softc *sc;
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_is_compatible(dev, "smsc,lan91c111")) {
sc = device_get_softc(dev);
sc->smc_usemem = 1;
if (smc_probe(dev) != 0) {
return (ENXIO);
}
return (0);
}
return (ENXIO);
}
static int
smc_fdt_attach(device_t dev)
{
- int err;
- struct smc_softc *sc;
- sc = device_get_softc(dev);
-
- err = smc_attach(dev);
- if (err) {
- return (err);
- }
-
- return (0);
+ return smc_attach(dev);
}
static int
smc_fdt_detach(device_t dev)
{
smc_detach(dev);
return (0);
}
static device_method_t smc_fdt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, smc_fdt_probe),
DEVMETHOD(device_attach, smc_fdt_attach),
DEVMETHOD(device_detach, smc_fdt_detach),
/* MII interface */
DEVMETHOD(miibus_readreg, smc_miibus_readreg),
DEVMETHOD(miibus_writereg, smc_miibus_writereg),
DEVMETHOD(miibus_statchg, smc_miibus_statchg),
{ 0, 0 }
};
static driver_t smc_fdt_driver = {
"smc",
smc_fdt_methods,
sizeof(struct smc_softc),
};
extern devclass_t smc_devclass;
DRIVER_MODULE(smc, simplebus, smc_fdt_driver, smc_devclass, 0, 0);
DRIVER_MODULE(miibus, smc, miibus_driver, miibus_devclass, 0, 0);
MODULE_DEPEND(smc, fdt, 1, 1, 1);
MODULE_DEPEND(smc, ether, 1, 1, 1);
MODULE_DEPEND(smc, miibus, 1, 1, 1);
Index: head/sys/dev/uart/uart_bus_acpi.c
===================================================================
--- head/sys/dev/uart/uart_bus_acpi.c (revision 327172)
+++ head/sys/dev/uart/uart_bus_acpi.c (revision 327173)
@@ -1,127 +1,125 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2001 M. Warner Losh. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <isa/isavar.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_bus.h>
#include <dev/uart/uart_cpu_acpi.h>
#ifdef __aarch64__
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
#include <dev/acpica/acpivar.h>
#endif
static int uart_acpi_probe(device_t dev);
static device_method_t uart_acpi_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, uart_acpi_probe),
DEVMETHOD(device_attach, uart_bus_attach),
DEVMETHOD(device_detach, uart_bus_detach),
DEVMETHOD(device_resume, uart_bus_resume),
{ 0, 0 }
};
static driver_t uart_acpi_driver = {
uart_driver_name,
uart_acpi_methods,
sizeof(struct uart_softc),
};
#if defined(__i386__) || defined(__amd64__)
static struct isa_pnp_id acpi_ns8250_ids[] = {
{0x0005d041, "Standard PC COM port"}, /* PNP0500 */
{0x0105d041, "16550A-compatible COM port"}, /* PNP0501 */
{0x0205d041, "Multiport serial device (non-intelligent 16550)"}, /* PNP0502 */
{0x1005d041, "Generic IRDA-compatible device"}, /* PNP0510 */
{0x1105d041, "Generic IRDA-compatible device"}, /* PNP0511 */
{0x04f0235c, "Wacom Tablet PC Screen"}, /* WACF004 */
{0x0ef0235c, "Wacom Tablet PC Screen 00e"}, /* WACF00e */
{0xe502aa1a, "Wacom Tablet at FuS Lifebook T"}, /* FUJ02E5 */
{0}
};
#endif
#ifdef __aarch64__
static struct uart_class *
uart_acpi_find_device(device_t dev)
{
struct acpi_uart_compat_data **cd;
ACPI_HANDLE h;
if ((h = acpi_get_handle(dev)) == NULL)
return (NULL);
SET_FOREACH(cd, uart_acpi_class_and_device_set) {
if (acpi_MatchHid(h, (*cd)->hid)) {
return ((*cd)->clas);
}
}
return (NULL);
}
#endif
static int
uart_acpi_probe(device_t dev)
{
struct uart_softc *sc;
- device_t parent;
- parent = device_get_parent(dev);
sc = device_get_softc(dev);
#if defined(__i386__) || defined(__amd64__)
- if (!ISA_PNP_PROBE(parent, dev, acpi_ns8250_ids)) {
+ if (!ISA_PNP_PROBE(device_get_parent(dev), dev, acpi_ns8250_ids)) {
sc->sc_class = &uart_ns8250_class;
return (uart_bus_probe(dev, 0, 0, 0, 0, 0));
}
/* Add checks for non-ns8250 IDs here. */
#elif defined(__aarch64__)
if ((sc->sc_class = uart_acpi_find_device(dev)) != NULL)
return (uart_bus_probe(dev, 2, 0, 0, 0, 0));
#endif
return (ENXIO);
}
DRIVER_MODULE(uart, acpi, uart_acpi_driver, uart_devclass, 0, 0);
Index: head/sys/dev/uart/uart_dev_pl011.c
===================================================================
--- head/sys/dev/uart/uart_dev_pl011.c (revision 327172)
+++ head/sys/dev/uart/uart_dev_pl011.c (revision 327173)
@@ -1,592 +1,590 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Semihalf.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_acpi.h"
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_cpu.h>
#ifdef FDT
#include <dev/uart/uart_cpu_fdt.h>
#include <dev/ofw/ofw_bus.h>
#endif
#include <dev/uart/uart_bus.h>
#include "uart_if.h"
#ifdef DEV_ACPI
#include <dev/uart/uart_cpu_acpi.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
#include <contrib/dev/acpica/include/actables.h>
#endif
#include <sys/kdb.h>
/* PL011 UART registers and masks */
#define UART_DR 0x00 /* Data register */
#define DR_FE (1 << 8) /* Framing error */
#define DR_PE (1 << 9) /* Parity error */
#define DR_BE (1 << 10) /* Break error */
#define DR_OE (1 << 11) /* Overrun error */
#define UART_FR 0x06 /* Flag register */
#define FR_RXFE (1 << 4) /* Receive FIFO/reg empty */
#define FR_TXFF (1 << 5) /* Transmit FIFO/reg full */
#define FR_RXFF (1 << 6) /* Receive FIFO/reg full */
#define FR_TXFE (1 << 7) /* Transmit FIFO/reg empty */
#define UART_IBRD 0x09 /* Integer baud rate register */
#define IBRD_BDIVINT 0xffff /* Significant part of int. divisor value */
#define UART_FBRD 0x0a /* Fractional baud rate register */
#define FBRD_BDIVFRAC 0x3f /* Significant part of frac. divisor value */
#define UART_LCR_H 0x0b /* Line control register */
#define LCR_H_WLEN8 (0x3 << 5)
#define LCR_H_WLEN7 (0x2 << 5)
#define LCR_H_WLEN6 (0x1 << 5)
#define LCR_H_FEN (1 << 4) /* FIFO mode enable */
#define LCR_H_STP2 (1 << 3) /* 2 stop frames at the end */
#define LCR_H_EPS (1 << 2) /* Even parity select */
#define LCR_H_PEN (1 << 1) /* Parity enable */
#define UART_CR 0x0c /* Control register */
#define CR_RXE (1 << 9) /* Receive enable */
#define CR_TXE (1 << 8) /* Transmit enable */
#define CR_UARTEN (1 << 0) /* UART enable */
#define UART_IFLS 0x0d /* FIFO level select register */
#define IFLS_RX_SHIFT 3 /* RX level in bits [5:3] */
#define IFLS_TX_SHIFT 0 /* TX level in bits [2:0] */
#define IFLS_MASK 0x07 /* RX/TX level is 3 bits */
#define IFLS_LVL_1_8th 0 /* Interrupt at 1/8 full */
#define IFLS_LVL_2_8th 1 /* Interrupt at 1/4 full */
#define IFLS_LVL_4_8th 2 /* Interrupt at 1/2 full */
#define IFLS_LVL_6_8th 3 /* Interrupt at 3/4 full */
#define IFLS_LVL_7_8th 4 /* Interrupt at 7/8 full */
#define UART_IMSC 0x0e /* Interrupt mask set/clear register */
#define IMSC_MASK_ALL 0x7ff /* Mask all interrupts */
#define UART_RIS 0x0f /* Raw interrupt status register */
#define UART_RXREADY (1 << 4) /* RX buffer full */
#define UART_TXEMPTY (1 << 5) /* TX buffer empty */
#define RIS_RTIM (1 << 6) /* Receive timeout */
#define RIS_FE (1 << 7) /* Framing error interrupt status */
#define RIS_PE (1 << 8) /* Parity error interrupt status */
#define RIS_BE (1 << 9) /* Break error interrupt status */
#define RIS_OE (1 << 10) /* Overrun interrupt status */
#define UART_MIS 0x10 /* Masked interrupt status register */
#define UART_ICR 0x11 /* Interrupt clear register */
#define UART_PIDREG_0 0x3f8 /* Peripheral ID register 0 */
#define UART_PIDREG_1 0x3f9 /* Peripheral ID register 1 */
#define UART_PIDREG_2 0x3fa /* Peripheral ID register 2 */
#define UART_PIDREG_3 0x3fb /* Peripheral ID register 3 */
/*
* The hardware FIFOs are 16 bytes each on rev 2 and earlier hardware, 32 bytes
* on rev 3 and later. We configure them to interrupt when 3/4 full/empty. For
* RX we set the size to the full hardware capacity so that the uart core
* allocates enough buffer space to hold a complete fifo full of incoming data.
* For TX, we need to limit the size to the capacity we know will be available
* when the interrupt occurs; uart_core will feed exactly that many bytes to
* uart_pl011_bus_transmit() which must consume them all.
*/
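/*
* For example, on rev 3 hardware with 32-byte FIFOs and the 1/4-full
* TX interrupt level selected below, 24 slots are guaranteed free
* when the TX interrupt fires, hence FIFO_TX_SIZE_R3 is 24; rev 2
* hardware with 16-byte FIFOs leaves 12 free, hence FIFO_TX_SIZE_R2
* is 12.
*/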
#define FIFO_RX_SIZE_R2 16
#define FIFO_TX_SIZE_R2 12
#define FIFO_RX_SIZE_R3 32
#define FIFO_TX_SIZE_R3 24
#define FIFO_IFLS_BITS ((IFLS_LVL_6_8th << IFLS_RX_SHIFT) | (IFLS_LVL_2_8th))
/*
* FIXME: the actual register size is SoC-dependent; we need to handle it
*/
#define __uart_getreg(bas, reg) \
bus_space_read_4((bas)->bst, (bas)->bsh, uart_regofs(bas, reg))
#define __uart_setreg(bas, reg, value) \
bus_space_write_4((bas)->bst, (bas)->bsh, uart_regofs(bas, reg), value)
/*
* Low-level UART interface.
*/
static int uart_pl011_probe(struct uart_bas *bas);
static void uart_pl011_init(struct uart_bas *bas, int, int, int, int);
static void uart_pl011_term(struct uart_bas *bas);
static void uart_pl011_putc(struct uart_bas *bas, int);
static int uart_pl011_rxready(struct uart_bas *bas);
static int uart_pl011_getc(struct uart_bas *bas, struct mtx *);
static struct uart_ops uart_pl011_ops = {
.probe = uart_pl011_probe,
.init = uart_pl011_init,
.term = uart_pl011_term,
.putc = uart_pl011_putc,
.rxready = uart_pl011_rxready,
.getc = uart_pl011_getc,
};
static int
uart_pl011_probe(struct uart_bas *bas)
{
return (0);
}
static void
uart_pl011_param(struct uart_bas *bas, int baudrate, int databits, int stopbits,
int parity)
{
uint32_t ctrl, line;
uint32_t baud;
/*
* Zero all settings to make sure the
* UART is disabled and not configured.
*/
ctrl = line = 0x0;
__uart_setreg(bas, UART_CR, ctrl);
/* Now that the UART is disabled we may set up the line */
switch (databits) {
case 7:
line |= LCR_H_WLEN7;
break;
case 6:
line |= LCR_H_WLEN6;
break;
case 8:
default:
line |= LCR_H_WLEN8;
break;
}
if (stopbits == 2)
line |= LCR_H_STP2;
else
line &= ~LCR_H_STP2;
if (parity)
line |= LCR_H_PEN;
else
line &= ~LCR_H_PEN;
line |= LCR_H_FEN;
/* Configure the rest */
ctrl |= (CR_RXE | CR_TXE | CR_UARTEN);
if (bas->rclk != 0 && baudrate != 0) {
baud = bas->rclk * 4 / baudrate;
__uart_setreg(bas, UART_IBRD, ((uint32_t)(baud >> 6)) & IBRD_BDIVINT);
__uart_setreg(bas, UART_FBRD, (uint32_t)(baud & 0x3F) & FBRD_BDIVFRAC);
}
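/*
* Worked example, assuming a hypothetical 24 MHz reference clock at
* 115200 baud: baud = 24000000 * 4 / 115200 = 833, so IBRD is written
* as 833 >> 6 = 13 and FBRD as 833 & 0x3f = 1, i.e. a divisor of
* roughly 13 + 1/64, matching 24e6 / (16 * 115200) = 13.02.
*/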
/* Add config. to line before reenabling UART */
__uart_setreg(bas, UART_LCR_H, (__uart_getreg(bas, UART_LCR_H) &
~0xff) | line);
/* Set rx and tx fifo levels. */
__uart_setreg(bas, UART_IFLS, FIFO_IFLS_BITS);
__uart_setreg(bas, UART_CR, ctrl);
}
static void
uart_pl011_init(struct uart_bas *bas, int baudrate, int databits, int stopbits,
int parity)
{
/* Mask all interrupts */
__uart_setreg(bas, UART_IMSC, __uart_getreg(bas, UART_IMSC) &
~IMSC_MASK_ALL);
uart_pl011_param(bas, baudrate, databits, stopbits, parity);
}
static void
uart_pl011_term(struct uart_bas *bas)
{
}
static void
uart_pl011_putc(struct uart_bas *bas, int c)
{
/* Wait while the TX FIFO is full, then push the character. */
while (__uart_getreg(bas, UART_FR) & FR_TXFF)
;
__uart_setreg(bas, UART_DR, c & 0xff);
}
static int
uart_pl011_rxready(struct uart_bas *bas)
{
return !(__uart_getreg(bas, UART_FR) & FR_RXFE);
}
static int
uart_pl011_getc(struct uart_bas *bas, struct mtx *hwmtx)
{
int c;
while (!uart_pl011_rxready(bas))
;
c = __uart_getreg(bas, UART_DR) & 0xff;
return (c);
}
/*
* High-level UART interface.
*/
struct uart_pl011_softc {
struct uart_softc base;
uint16_t imsc; /* Interrupt mask */
};
static int uart_pl011_bus_attach(struct uart_softc *);
static int uart_pl011_bus_detach(struct uart_softc *);
static int uart_pl011_bus_flush(struct uart_softc *, int);
static int uart_pl011_bus_getsig(struct uart_softc *);
static int uart_pl011_bus_ioctl(struct uart_softc *, int, intptr_t);
static int uart_pl011_bus_ipend(struct uart_softc *);
static int uart_pl011_bus_param(struct uart_softc *, int, int, int, int);
static int uart_pl011_bus_probe(struct uart_softc *);
static int uart_pl011_bus_receive(struct uart_softc *);
static int uart_pl011_bus_setsig(struct uart_softc *, int);
static int uart_pl011_bus_transmit(struct uart_softc *);
static void uart_pl011_bus_grab(struct uart_softc *);
static void uart_pl011_bus_ungrab(struct uart_softc *);
static kobj_method_t uart_pl011_methods[] = {
KOBJMETHOD(uart_attach, uart_pl011_bus_attach),
KOBJMETHOD(uart_detach, uart_pl011_bus_detach),
KOBJMETHOD(uart_flush, uart_pl011_bus_flush),
KOBJMETHOD(uart_getsig, uart_pl011_bus_getsig),
KOBJMETHOD(uart_ioctl, uart_pl011_bus_ioctl),
KOBJMETHOD(uart_ipend, uart_pl011_bus_ipend),
KOBJMETHOD(uart_param, uart_pl011_bus_param),
KOBJMETHOD(uart_probe, uart_pl011_bus_probe),
KOBJMETHOD(uart_receive, uart_pl011_bus_receive),
KOBJMETHOD(uart_setsig, uart_pl011_bus_setsig),
KOBJMETHOD(uart_transmit, uart_pl011_bus_transmit),
KOBJMETHOD(uart_grab, uart_pl011_bus_grab),
KOBJMETHOD(uart_ungrab, uart_pl011_bus_ungrab),
{ 0, 0 }
};
static struct uart_class uart_pl011_class = {
"uart_pl011",
uart_pl011_methods,
sizeof(struct uart_pl011_softc),
.uc_ops = &uart_pl011_ops,
.uc_range = 0x48,
.uc_rclk = 0,
.uc_rshift = 2
};
#ifdef FDT
static struct ofw_compat_data compat_data[] = {
{"arm,pl011", (uintptr_t)&uart_pl011_class},
{NULL, (uintptr_t)NULL},
};
UART_FDT_CLASS_AND_DEVICE(compat_data);
#endif
#ifdef DEV_ACPI
static struct acpi_uart_compat_data acpi_compat_data[] = {
{"ARMH0011", &uart_pl011_class, ACPI_DBG2_ARM_PL011},
{NULL, NULL, 0},
};
UART_ACPI_CLASS_AND_DEVICE(acpi_compat_data);
#endif
static int
uart_pl011_bus_attach(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Enable interrupts */
psc->imsc = (UART_RXREADY | RIS_RTIM | UART_TXEMPTY);
__uart_setreg(bas, UART_IMSC, psc->imsc);
/* Clear interrupts */
__uart_setreg(bas, UART_ICR, IMSC_MASK_ALL);
return (0);
}
static int
uart_pl011_bus_detach(struct uart_softc *sc)
{
return (0);
}
static int
uart_pl011_bus_flush(struct uart_softc *sc, int what)
{
return (0);
}
static int
uart_pl011_bus_getsig(struct uart_softc *sc)
{
return (0);
}
static int
uart_pl011_bus_ioctl(struct uart_softc *sc, int request, intptr_t data)
{
- struct uart_bas *bas;
int error;
- bas = &sc->sc_bas;
error = 0;
uart_lock(sc->sc_hwmtx);
switch (request) {
case UART_IOCTL_BREAK:
break;
case UART_IOCTL_BAUD:
*(int*)data = 115200;
break;
default:
error = EINVAL;
break;
}
uart_unlock(sc->sc_hwmtx);
return (error);
}
static int
uart_pl011_bus_ipend(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
uint32_t ints;
int ipend;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
ints = __uart_getreg(bas, UART_MIS);
ipend = 0;
if (ints & (UART_RXREADY | RIS_RTIM))
ipend |= SER_INT_RXREADY;
if (ints & RIS_BE)
ipend |= SER_INT_BREAK;
if (ints & RIS_OE)
ipend |= SER_INT_OVERRUN;
if (ints & UART_TXEMPTY) {
if (sc->sc_txbusy)
ipend |= SER_INT_TXIDLE;
/* Disable TX interrupt */
__uart_setreg(bas, UART_IMSC, psc->imsc & ~UART_TXEMPTY);
}
uart_unlock(sc->sc_hwmtx);
return (ipend);
}
static int
uart_pl011_bus_param(struct uart_softc *sc, int baudrate, int databits,
int stopbits, int parity)
{
uart_lock(sc->sc_hwmtx);
uart_pl011_param(&sc->sc_bas, baudrate, databits, stopbits, parity);
uart_unlock(sc->sc_hwmtx);
return (0);
}
static int
uart_pl011_bus_probe(struct uart_softc *sc)
{
uint8_t hwrev;
#ifdef FDT
pcell_t node;
uint32_t periphid;
/*
* The FIFO sizes vary depending on hardware; rev 2 and below have 16
* byte FIFOs, rev 3 and up are 32 byte. The hardware rev is in the
* primecell periphid register, but we get a bit of drama, as always,
* with the bcm2835 (rpi), which claims to be rev 3, but has 16 byte
* FIFOs. We check for both the old freebsd-historic and the proper
* bindings-defined compatible strings for bcm2835, and also check the
* workaround the linux drivers use for rpi3, which is to override the
* primecell periphid register value with a property.
*/
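/*
* For instance, with a made-up property value of 0x00241011 the
* revision field is (0x00241011 >> 20) & 0x0f = 2, which selects the
* 16-byte FIFO sizes below.
*/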
if (ofw_bus_is_compatible(sc->sc_dev, "brcm,bcm2835-pl011") ||
ofw_bus_is_compatible(sc->sc_dev, "broadcom,bcm2835-uart")) {
hwrev = 2;
} else {
node = ofw_bus_get_node(sc->sc_dev);
if (OF_getencprop(node, "arm,primecell-periphid", &periphid,
sizeof(periphid)) > 0) {
hwrev = (periphid >> 20) & 0x0f;
} else {
hwrev = __uart_getreg(&sc->sc_bas, UART_PIDREG_2) >> 4;
}
}
#else
hwrev = __uart_getreg(&sc->sc_bas, UART_PIDREG_2) >> 4;
#endif
if (hwrev <= 2) {
sc->sc_rxfifosz = FIFO_RX_SIZE_R2;
sc->sc_txfifosz = FIFO_TX_SIZE_R2;
} else {
sc->sc_rxfifosz = FIFO_RX_SIZE_R3;
sc->sc_txfifosz = FIFO_TX_SIZE_R3;
}
device_set_desc(sc->sc_dev, "PrimeCell UART (PL011)");
return (0);
}
static int
uart_pl011_bus_receive(struct uart_softc *sc)
{
struct uart_bas *bas;
uint32_t ints, xc;
int rx;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
for (;;) {
ints = __uart_getreg(bas, UART_FR);
if (ints & FR_RXFE)
break;
if (uart_rx_full(sc)) {
sc->sc_rxbuf[sc->sc_rxput] = UART_STAT_OVERRUN;
break;
}
xc = __uart_getreg(bas, UART_DR);
rx = xc & 0xff;
if (xc & DR_FE)
rx |= UART_STAT_FRAMERR;
if (xc & DR_PE)
rx |= UART_STAT_PARERR;
uart_rx_put(sc, rx);
}
uart_unlock(sc->sc_hwmtx);
return (0);
}
static int
uart_pl011_bus_setsig(struct uart_softc *sc, int sig)
{
return (0);
}
static int
uart_pl011_bus_transmit(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
int i;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
for (i = 0; i < sc->sc_txdatasz; i++) {
__uart_setreg(bas, UART_DR, sc->sc_txbuf[i]);
uart_barrier(bas);
}
/* Mark busy and enable TX interrupt */
sc->sc_txbusy = 1;
__uart_setreg(bas, UART_IMSC, psc->imsc);
uart_unlock(sc->sc_hwmtx);
return (0);
}
static void
uart_pl011_bus_grab(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Disable interrupts on switch to polling */
uart_lock(sc->sc_hwmtx);
__uart_setreg(bas, UART_IMSC, psc->imsc & ~IMSC_MASK_ALL);
uart_unlock(sc->sc_hwmtx);
}
static void
uart_pl011_bus_ungrab(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Switch to using interrupts while not grabbed */
uart_lock(sc->sc_hwmtx);
__uart_setreg(bas, UART_IMSC, psc->imsc);
uart_unlock(sc->sc_hwmtx);
}
Index: head/sys/dev/uart/uart_dev_snps.c
===================================================================
--- head/sys/dev/uart/uart_dev_snps.c (revision 327172)
+++ head/sys/dev/uart/uart_dev_snps.c (revision 327173)
@@ -1,285 +1,283 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_bus.h>
#include <dev/uart/uart_cpu_fdt.h>
#include <dev/uart/uart_dev_ns8250.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#ifdef EXT_RESOURCES
#include <dev/extres/clk/clk.h>
#include <dev/extres/hwreset/hwreset.h>
#endif
#include "uart_if.h"
struct snps_softc {
struct ns8250_softc ns8250;
#ifdef EXT_RESOURCES
clk_t baudclk;
clk_t apb_pclk;
hwreset_t reset;
#endif
};
static int
snps_uart_attach(struct uart_softc *uart_sc)
{
struct snps_softc *sc;
sc = (struct snps_softc *)uart_sc;
/* The UART requires the USR register to be read when IIR reports busy */
sc->ns8250.busy_detect = 1;
return (ns8250_bus_attach(uart_sc));
}
static kobj_method_t snps_methods[] = {
KOBJMETHOD(uart_probe, ns8250_bus_probe),
KOBJMETHOD(uart_attach, snps_uart_attach),
KOBJMETHOD(uart_detach, ns8250_bus_detach),
KOBJMETHOD(uart_flush, ns8250_bus_flush),
KOBJMETHOD(uart_getsig, ns8250_bus_getsig),
KOBJMETHOD(uart_ioctl, ns8250_bus_ioctl),
KOBJMETHOD(uart_ipend, ns8250_bus_ipend),
KOBJMETHOD(uart_param, ns8250_bus_param),
KOBJMETHOD(uart_receive, ns8250_bus_receive),
KOBJMETHOD(uart_setsig, ns8250_bus_setsig),
KOBJMETHOD(uart_transmit, ns8250_bus_transmit),
KOBJMETHOD(uart_grab, ns8250_bus_grab),
KOBJMETHOD(uart_ungrab, ns8250_bus_ungrab),
KOBJMETHOD_END
};
struct uart_class uart_snps_class = {
"snps",
snps_methods,
sizeof(struct snps_softc),
.uc_ops = &uart_ns8250_ops,
.uc_range = 8,
.uc_rclk = 0,
};
static struct ofw_compat_data compat_data[] = {
{ "snps,dw-apb-uart", (uintptr_t)&uart_snps_class },
{ NULL, (uintptr_t)NULL }
};
UART_FDT_CLASS(compat_data);
#ifdef EXT_RESOURCES
static int
snps_get_clocks(device_t dev, clk_t *baudclk, clk_t *apb_pclk)
{
- struct snps_softc *sc;
- sc = device_get_softc(dev);
*baudclk = NULL;
*apb_pclk = NULL;
/*
* The baud clock is either named "baudclk", or there is a single
* unnamed clock.
*/
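/*
* A hypothetical node could express this either as
*   clocks = <&baud>;
* or as
*   clocks = <&baud>, <&apb>; clock-names = "baudclk", "apb_pclk";
* and both forms are handled below.
*/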
if (clk_get_by_ofw_name(dev, 0, "baudclk", baudclk) != 0 &&
clk_get_by_ofw_index(dev, 0, 0, baudclk) != 0)
return (ENOENT);
/* APB peripheral clock is optional */
(void)clk_get_by_ofw_name(dev, 0, "apb_pclk", apb_pclk);
return (0);
}
#endif
static int
snps_probe(device_t dev)
{
struct snps_softc *sc;
struct uart_class *uart_class;
phandle_t node;
uint32_t shift, iowidth, clock;
uint64_t freq;
int error;
#ifdef EXT_RESOURCES
clk_t baudclk, apb_pclk;
hwreset_t reset;
#endif
if (!ofw_bus_status_okay(dev))
return (ENXIO);
uart_class = (struct uart_class *)ofw_bus_search_compatible(dev,
compat_data)->ocd_data;
if (uart_class == NULL)
return (ENXIO);
freq = 0;
sc = device_get_softc(dev);
sc->ns8250.base.sc_class = uart_class;
node = ofw_bus_get_node(dev);
if (OF_getencprop(node, "reg-shift", &shift, sizeof(shift)) <= 0)
shift = 0;
if (OF_getencprop(node, "reg-io-width", &iowidth, sizeof(iowidth)) <= 0)
iowidth = 1;
if (OF_getencprop(node, "clock-frequency", &clock, sizeof(clock)) <= 0)
clock = 0;
#ifdef EXT_RESOURCES
if (hwreset_get_by_ofw_idx(dev, 0, 0, &reset) == 0) {
error = hwreset_deassert(reset);
if (error != 0) {
device_printf(dev, "cannot de-assert reset\n");
return (error);
}
}
if (snps_get_clocks(dev, &baudclk, &apb_pclk) == 0) {
error = clk_enable(baudclk);
if (error != 0) {
device_printf(dev, "cannot enable baud clock\n");
return (error);
}
if (apb_pclk != NULL) {
error = clk_enable(apb_pclk);
if (error != 0) {
device_printf(dev,
"cannot enable peripheral clock\n");
return (error);
}
}
if (clock == 0) {
error = clk_get_freq(baudclk, &freq);
if (error != 0) {
device_printf(dev, "cannot get frequency\n");
return (error);
}
clock = (uint32_t)freq;
}
}
#endif
if (bootverbose && clock == 0)
device_printf(dev, "could not determine frequency\n");
error = uart_bus_probe(dev, (int)shift, (int)iowidth, (int)clock, 0, 0);
if (error != 0)
return (error);
#ifdef EXT_RESOURCES
/* XXX uart_bus_probe has changed the softc, so refresh it */
sc = device_get_softc(dev);
/* Store clock and reset handles for detach */
sc->baudclk = baudclk;
sc->apb_pclk = apb_pclk;
sc->reset = reset;
#endif
return (0);
}
static int
snps_detach(device_t dev)
{
#ifdef EXT_RESOURCES
struct snps_softc *sc;
clk_t baudclk, apb_pclk;
hwreset_t reset;
#endif
int error;
#ifdef EXT_RESOURCES
sc = device_get_softc(dev);
baudclk = sc->baudclk;
apb_pclk = sc->apb_pclk;
reset = sc->reset;
#endif
error = uart_bus_detach(dev);
if (error != 0)
return (error);
#ifdef EXT_RESOURCES
if (reset != NULL) {
error = hwreset_assert(reset);
if (error != 0) {
device_printf(dev, "cannot assert reset\n");
return (error);
}
hwreset_release(reset);
}
if (apb_pclk != NULL) {
error = clk_release(apb_pclk);
if (error != 0) {
device_printf(dev, "cannot release peripheral clock\n");
return (error);
}
}
if (baudclk != NULL) {
error = clk_release(baudclk);
if (error != 0) {
device_printf(dev, "cannot release baud clock\n");
return (error);
}
}
#endif
return (0);
}
static device_method_t snps_bus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, snps_probe),
DEVMETHOD(device_attach, uart_bus_attach),
DEVMETHOD(device_detach, snps_detach),
DEVMETHOD_END
};
static driver_t snps_uart_driver = {
uart_driver_name,
snps_bus_methods,
sizeof(struct snps_softc)
};
DRIVER_MODULE(uart_snps, simplebus, snps_uart_driver, uart_devclass, 0, 0);
Index: head/sys/dev/usb/controller/dwc_otg_fdt.c
===================================================================
--- head/sys/dev/usb/controller/dwc_otg_fdt.c (revision 327172)
+++ head/sys/dev/usb/controller/dwc_otg_fdt.c (revision 327173)
@@ -1,226 +1,225 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <dev/usb/controller/dwc_otg.h>
#include <dev/usb/controller/dwc_otg_fdt.h>
static device_probe_t dwc_otg_probe;
static struct ofw_compat_data compat_data[] = {
{ "synopsys,designware-hs-otg2", 1 },
{ "snps,dwc2", 1 },
{ NULL, 0 }
};
static int
dwc_otg_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (!ofw_bus_search_compatible(dev, compat_data)->ocd_data)
return (ENXIO);
device_set_desc(dev, "DWC OTG 2.0 integrated USB controller");
return (BUS_PROBE_DEFAULT);
}
int
dwc_otg_attach(device_t dev)
{
struct dwc_otg_fdt_softc *sc = device_get_softc(dev);
char usb_mode[24];
int err;
int rid;
/* initialise some bus fields */
sc->sc_otg.sc_bus.parent = dev;
sc->sc_otg.sc_bus.devices = sc->sc_otg.sc_devices;
sc->sc_otg.sc_bus.devices_max = DWC_OTG_MAX_DEVICES;
sc->sc_otg.sc_bus.dma_bits = 32;
/* get USB mode, if any */
if (OF_getprop(ofw_bus_get_node(dev), "dr_mode",
&usb_mode, sizeof(usb_mode)) > 0) {
/* ensure proper zero termination */
usb_mode[sizeof(usb_mode) - 1] = 0;
if (strcasecmp(usb_mode, "host") == 0)
sc->sc_otg.sc_mode = DWC_MODE_HOST;
else if (strcasecmp(usb_mode, "peripheral") == 0)
sc->sc_otg.sc_mode = DWC_MODE_DEVICE;
else if (strcasecmp(usb_mode, "otg") != 0) {
device_printf(dev, "Invalid FDT dr_mode: %s\n",
usb_mode);
}
}
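/*
 * If the FDT omits "dr_mode" or sets it to "otg", sc_mode is left at
 * its zeroed softc default, which selects the driver's dual-role
 * (OTG) operation.
 */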
/* get all DMA memory */
if (usb_bus_mem_alloc_all(&sc->sc_otg.sc_bus,
USB_GET_DMA_TAG(dev), NULL)) {
return (ENOMEM);
}
rid = 0;
sc->sc_otg.sc_io_res =
bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (!(sc->sc_otg.sc_io_res)) {
err = ENOMEM;
goto error;
}
sc->sc_otg.sc_io_tag = rman_get_bustag(sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_hdl = rman_get_bushandle(sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_size = rman_get_size(sc->sc_otg.sc_io_res);
/*
* the brcm,bcm2708-usb FDT node provides two interrupts;
* we need only the second one (VC_USB)
*/
rid = ofw_bus_is_compatible(dev, "brcm,bcm2708-usb") ? 1 : 0;
sc->sc_otg.sc_irq_res =
bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE);
if (sc->sc_otg.sc_irq_res == NULL)
goto error;
sc->sc_otg.sc_bus.bdev = device_add_child(dev, "usbus", -1);
if (sc->sc_otg.sc_bus.bdev == NULL)
goto error;
device_set_ivars(sc->sc_otg.sc_bus.bdev, &sc->sc_otg.sc_bus);
err = bus_setup_intr(dev, sc->sc_otg.sc_irq_res, INTR_TYPE_TTY | INTR_MPSAFE,
&dwc_otg_filter_interrupt, &dwc_otg_interrupt, sc, &sc->sc_otg.sc_intr_hdl);
if (err) {
sc->sc_otg.sc_intr_hdl = NULL;
goto error;
}
err = dwc_otg_init(&sc->sc_otg);
if (err == 0) {
err = device_probe_and_attach(sc->sc_otg.sc_bus.bdev);
}
if (err)
goto error;
return (0);
error:
dwc_otg_detach(dev);
return (ENXIO);
}
int
dwc_otg_detach(device_t dev)
{
struct dwc_otg_fdt_softc *sc = device_get_softc(dev);
- int err;
/* during module unload there are lots of leftover children */
device_delete_children(dev);
if (sc->sc_otg.sc_irq_res && sc->sc_otg.sc_intr_hdl) {
/*
* only call dwc_otg_uninit() after dwc_otg_init()
*/
dwc_otg_uninit(&sc->sc_otg);
- err = bus_teardown_intr(dev, sc->sc_otg.sc_irq_res,
+ bus_teardown_intr(dev, sc->sc_otg.sc_irq_res,
sc->sc_otg.sc_intr_hdl);
sc->sc_otg.sc_intr_hdl = NULL;
}
/* free IRQ channel, if any */
if (sc->sc_otg.sc_irq_res) {
bus_release_resource(dev, SYS_RES_IRQ, 0,
sc->sc_otg.sc_irq_res);
sc->sc_otg.sc_irq_res = NULL;
}
/* free memory resource, if any */
if (sc->sc_otg.sc_io_res) {
bus_release_resource(dev, SYS_RES_MEMORY, 0,
sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_res = NULL;
}
usb_bus_mem_free_all(&sc->sc_otg.sc_bus, NULL);
return (0);
}
static device_method_t dwc_otg_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, dwc_otg_probe),
DEVMETHOD(device_attach, dwc_otg_attach),
DEVMETHOD(device_detach, dwc_otg_detach),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, bus_generic_resume),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD_END
};
driver_t dwc_otg_driver = {
.name = "dwcotg",
.methods = dwc_otg_methods,
.size = sizeof(struct dwc_otg_fdt_softc),
};
static devclass_t dwc_otg_devclass;
DRIVER_MODULE(dwcotg, simplebus, dwc_otg_driver, dwc_otg_devclass, 0, 0);
MODULE_DEPEND(dwcotg, usb, 1, 1, 1);
Index: head/sys/dev/usb/controller/ehci.c
===================================================================
--- head/sys/dev/usb/controller/ehci.c (revision 327172)
+++ head/sys/dev/usb/controller/ehci.c (revision 327173)
@@ -1,3975 +1,3971 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
* Copyright (c) 2004 The NetBSD Foundation, Inc. All rights reserved.
* Copyright (c) 2004 Lennart Augustsson. All rights reserved.
* Copyright (c) 2004 Charles M. Hannum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB Enhanced Host Controller Driver, a.k.a. USB 2.0 controller.
*
* The EHCI 0.96 spec can be found at
* http://developer.intel.com/technology/usb/download/ehci-r096.pdf
* The EHCI 1.0 spec can be found at
* http://developer.intel.com/technology/usb/download/ehci-r10.pdf
* and the USB 2.0 spec at
* http://www.usb.org/developers/docs/usb_20.zip
*
*/
/*
* TODO:
* 1) command failures are not recovered correctly
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR ehcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/ehci.h>
#include <dev/usb/controller/ehcireg.h>
#define EHCI_BUS2SC(bus) \
((ehci_softc_t *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((ehci_softc_t *)0)->sc_bus))))
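/*
 * EHCI_BUS2SC() recovers the softc from a pointer to its embedded
 * "sc_bus" member; it is, in effect, the container-of idiom:
 *
 * (ehci_softc_t *)((char *)(bus) - offsetof(ehci_softc_t, sc_bus))
 */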
#ifdef USB_DEBUG
static int ehcidebug = 0;
static int ehcinohighspeed = 0;
static int ehciiaadbug = 0;
static int ehcilostintrbug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, ehci, CTLFLAG_RW, 0, "USB ehci");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, debug, CTLFLAG_RWTUN,
&ehcidebug, 0, "Debug level");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, no_hs, CTLFLAG_RWTUN,
&ehcinohighspeed, 0, "Disable High Speed USB");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, iaadbug, CTLFLAG_RWTUN,
&ehciiaadbug, 0, "Enable doorbell bug workaround");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, lostintrbug, CTLFLAG_RWTUN,
&ehcilostintrbug, 0, "Enable lost interrupt bug workaround");
static void ehci_dump_regs(ehci_softc_t *sc);
static void ehci_dump_sqh(ehci_softc_t *sc, ehci_qh_t *sqh);
#endif
#define EHCI_INTR_ENDPT 1
static const struct usb_bus_methods ehci_bus_methods;
static const struct usb_pipe_methods ehci_device_bulk_methods;
static const struct usb_pipe_methods ehci_device_ctrl_methods;
static const struct usb_pipe_methods ehci_device_intr_methods;
static const struct usb_pipe_methods ehci_device_isoc_fs_methods;
static const struct usb_pipe_methods ehci_device_isoc_hs_methods;
static void ehci_do_poll(struct usb_bus *);
static void ehci_device_done(struct usb_xfer *, usb_error_t);
static uint8_t ehci_check_transfer(struct usb_xfer *);
static void ehci_timeout(void *);
static void ehci_poll_timeout(void *);
static void ehci_root_intr(ehci_softc_t *sc);
struct ehci_std_temp {
ehci_softc_t *sc;
struct usb_page_cache *pc;
ehci_qtd_t *td;
ehci_qtd_t *td_next;
uint32_t average;
uint32_t qtd_status;
uint32_t len;
uint16_t max_frame_size;
uint8_t shortpkt;
uint8_t auto_data_toggle;
uint8_t setup_alt_next;
uint8_t last_frame;
};
void
ehci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
uint32_t i;
cb(bus, &sc->sc_hw.pframes_pc, &sc->sc_hw.pframes_pg,
sizeof(uint32_t) * EHCI_FRAMELIST_COUNT, EHCI_FRAMELIST_ALIGN);
cb(bus, &sc->sc_hw.terminate_pc, &sc->sc_hw.terminate_pg,
sizeof(struct ehci_qh_sub), EHCI_QH_ALIGN);
cb(bus, &sc->sc_hw.async_start_pc, &sc->sc_hw.async_start_pg,
sizeof(ehci_qh_t), EHCI_QH_ALIGN);
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.intr_start_pc + i,
sc->sc_hw.intr_start_pg + i,
sizeof(ehci_qh_t), EHCI_QH_ALIGN);
}
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.isoc_hs_start_pc + i,
sc->sc_hw.isoc_hs_start_pg + i,
sizeof(ehci_itd_t), EHCI_ITD_ALIGN);
}
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.isoc_fs_start_pc + i,
sc->sc_hw.isoc_fs_start_pg + i,
sizeof(ehci_sitd_t), EHCI_SITD_ALIGN);
}
}
usb_error_t
ehci_reset(ehci_softc_t *sc)
{
uint32_t hcr;
int i;
EOWRITE4(sc, EHCI_USBCMD, EHCI_CMD_HCRESET);
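/*
 * Poll for HCRESET to self-clear; at 100 iterations of hz/128 ticks
 * this gives the controller roughly 0.8 seconds before the reset is
 * reported as timed out.
 */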
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBCMD) & EHCI_CMD_HCRESET;
if (!hcr) {
if (sc->sc_vendor_post_reset != NULL)
sc->sc_vendor_post_reset(sc);
return (0);
}
}
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (USB_ERR_IOERROR);
}
static usb_error_t
ehci_hcreset(ehci_softc_t *sc)
{
uint32_t hcr;
int i;
EOWRITE4(sc, EHCI_USBCMD, 0); /* Halt controller */
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBSTS) & EHCI_STS_HCH;
if (hcr)
break;
}
if (!hcr)
/*
* Fall through and try reset anyway even though
* Table 2-9 in the EHCI spec says this will result
* in undefined behavior.
*/
device_printf(sc->sc_bus.bdev, "stop timeout\n");
return (ehci_reset(sc));
}
static int
ehci_init_sub(struct ehci_softc *sc)
{
struct usb_page_search buf_res;
uint32_t cparams;
uint32_t hcr;
uint8_t i;
cparams = EREAD4(sc, EHCI_HCCPARAMS);
DPRINTF("cparams=0x%x\n", cparams);
if (EHCI_HCC_64BIT(cparams)) {
DPRINTF("HCC uses 64-bit structures\n");
/* MUST clear segment register if 64 bit capable */
EOWRITE4(sc, EHCI_CTRLDSSEGMENT, 0);
}
usbd_get_page(&sc->sc_hw.pframes_pc, 0, &buf_res);
EOWRITE4(sc, EHCI_PERIODICLISTBASE, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.async_start_pc, 0, &buf_res);
EOWRITE4(sc, EHCI_ASYNCLISTADDR, buf_res.physaddr | EHCI_LINK_QH);
/* enable interrupts */
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
/* turn on controller */
EOWRITE4(sc, EHCI_USBCMD,
EHCI_CMD_ITC_1 | /* 1 microframe interrupt delay */
(EOREAD4(sc, EHCI_USBCMD) & EHCI_CMD_FLS_M) |
EHCI_CMD_ASE |
EHCI_CMD_PSE |
EHCI_CMD_RS);
/* Take over port ownership */
EOWRITE4(sc, EHCI_CONFIGFLAG, EHCI_CONF_CF);
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBSTS) & EHCI_STS_HCH;
if (!hcr) {
break;
}
}
if (hcr) {
device_printf(sc->sc_bus.bdev, "run timeout\n");
return (USB_ERR_IOERROR);
}
return (USB_ERR_NORMAL_COMPLETION);
}
usb_error_t
ehci_init(ehci_softc_t *sc)
{
struct usb_page_search buf_res;
uint32_t version;
uint32_t sparams;
uint16_t i;
uint16_t x;
uint16_t y;
uint16_t bit;
usb_error_t err = 0;
DPRINTF("start\n");
usb_callout_init_mtx(&sc->sc_tmo_pcd, &sc->sc_bus.bus_mtx, 0);
usb_callout_init_mtx(&sc->sc_tmo_poll, &sc->sc_bus.bus_mtx, 0);
sc->sc_offs = EHCI_CAPLENGTH(EREAD4(sc, EHCI_CAPLEN_HCIVERSION));
#ifdef USB_DEBUG
if (ehciiaadbug)
sc->sc_flags |= EHCI_SCFLG_IAADBUG;
if (ehcilostintrbug)
sc->sc_flags |= EHCI_SCFLG_LOSTINTRBUG;
if (ehcidebug > 2) {
ehci_dump_regs(sc);
}
#endif
version = EHCI_HCIVERSION(EREAD4(sc, EHCI_CAPLEN_HCIVERSION));
device_printf(sc->sc_bus.bdev, "EHCI version %x.%x\n",
version >> 8, version & 0xff);
sparams = EREAD4(sc, EHCI_HCSPARAMS);
DPRINTF("sparams=0x%x\n", sparams);
sc->sc_noport = EHCI_HCS_N_PORTS(sparams);
sc->sc_bus.usbrev = USB_REV_2_0;
if (!(sc->sc_flags & EHCI_SCFLG_DONTRESET)) {
/* Reset the controller */
DPRINTF("%s: resetting\n",
device_get_nameunit(sc->sc_bus.bdev));
err = ehci_hcreset(sc);
if (err) {
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (err);
}
}
/*
* use the current frame-list-size selection:
* 0: 1024*4 bytes, 1: 512*4 bytes, 2: 256*4 bytes, 3: unknown
*/
if (EHCI_CMD_FLS(EOREAD4(sc, EHCI_USBCMD)) == 3) {
device_printf(sc->sc_bus.bdev, "invalid frame-list-size\n");
return (USB_ERR_IOERROR);
}
/* set up the bus struct */
sc->sc_bus.methods = &ehci_bus_methods;
sc->sc_eintrs = EHCI_NORMAL_INTRS;
if (1) {
struct ehci_qh_sub *qh;
usbd_get_page(&sc->sc_hw.terminate_pc, 0, &buf_res);
qh = buf_res.buffer;
sc->sc_terminate_self = htohc32(sc, buf_res.physaddr);
/* init terminate TD */
qh->qtd_next =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qtd_altnext =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qtd_status =
htohc32(sc, EHCI_QTD_HALTED);
}
for (i = 0; i < EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
ehci_qh_t *qh;
usbd_get_page(sc->sc_hw.intr_start_pc + i, 0, &buf_res);
qh = buf_res.buffer;
/* initialize page cache pointer */
qh->page_cache = sc->sc_hw.intr_start_pc + i;
/* store a pointer to queue head */
sc->sc_intr_p_last[i] = qh;
qh->qh_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_QH);
qh->qh_endp =
htohc32(sc, EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH));
qh->qh_endphub =
htohc32(sc, EHCI_QH_SET_MULT(1));
qh->qh_curqtd = 0;
qh->qh_qtd.qtd_next =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_altnext =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_status =
htohc32(sc, EHCI_QTD_HALTED);
}
/*
* the QHs are arranged to give poll intervals that are
* powers of 2 times 1ms
*/
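/*
 * Each pass of the loop below links every QH whose index has "bit"
 * set to a QH in the next smaller bucket (y = (x ^ bit) | (bit / 2)),
 * so the interrupt QHs form a binary tree: the longest-interval QHs
 * feed into the half-interval ones, and everything converges on
 * sc_intr_p_last[0], the 1ms QH that terminates the schedule.
 */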
bit = EHCI_VIRTUAL_FRAMELIST_COUNT / 2;
while (bit) {
x = bit;
while (x & bit) {
ehci_qh_t *qh_x;
ehci_qh_t *qh_y;
y = (x ^ bit) | (bit / 2);
qh_x = sc->sc_intr_p_last[x];
qh_y = sc->sc_intr_p_last[y];
/*
* the next QH has half the poll interval
*/
qh_x->qh_link = qh_y->qh_self;
x++;
}
bit >>= 1;
}
if (1) {
ehci_qh_t *qh;
qh = sc->sc_intr_p_last[0];
/* the last (1ms) QH terminates */
qh->qh_link = htohc32(sc, EHCI_LINK_TERMINATE);
}
for (i = 0; i < EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
ehci_sitd_t *sitd;
ehci_itd_t *itd;
usbd_get_page(sc->sc_hw.isoc_fs_start_pc + i, 0, &buf_res);
sitd = buf_res.buffer;
/* initialize page cache pointer */
sitd->page_cache = sc->sc_hw.isoc_fs_start_pc + i;
/* store a pointer to the transfer descriptor */
sc->sc_isoc_fs_p_last[i] = sitd;
/* initialize full speed isochronous */
sitd->sitd_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_SITD);
sitd->sitd_back =
htohc32(sc, EHCI_LINK_TERMINATE);
sitd->sitd_next =
sc->sc_intr_p_last[i | (EHCI_VIRTUAL_FRAMELIST_COUNT / 2)]->qh_self;
usbd_get_page(sc->sc_hw.isoc_hs_start_pc + i, 0, &buf_res);
itd = buf_res.buffer;
/* initialize page cache pointer */
itd->page_cache = sc->sc_hw.isoc_hs_start_pc + i;
/* store a pointer to the transfer descriptor */
sc->sc_isoc_hs_p_last[i] = itd;
/* initialize high speed isochronous */
itd->itd_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_ITD);
itd->itd_next =
sitd->sitd_self;
}
usbd_get_page(&sc->sc_hw.pframes_pc, 0, &buf_res);
if (1) {
uint32_t *pframes;
pframes = buf_res.buffer;
/*
* execution order:
* pframes -> high speed isochronous ->
* full speed isochronous -> interrupt QH's
*/
for (i = 0; i < EHCI_FRAMELIST_COUNT; i++) {
pframes[i] = sc->sc_isoc_hs_p_last
[i & (EHCI_VIRTUAL_FRAMELIST_COUNT - 1)]->itd_self;
}
}
usbd_get_page(&sc->sc_hw.async_start_pc, 0, &buf_res);
if (1) {
ehci_qh_t *qh;
qh = buf_res.buffer;
/* initialize page cache pointer */
qh->page_cache = &sc->sc_hw.async_start_pc;
/* store a pointer to the queue head */
sc->sc_async_p_last = qh;
/* init dummy QH that starts the async list */
qh->qh_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_QH);
/* fill the QH */
qh->qh_endp =
htohc32(sc, EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH) | EHCI_QH_HRECL);
qh->qh_endphub = htohc32(sc, EHCI_QH_SET_MULT(1));
qh->qh_link = qh->qh_self;
qh->qh_curqtd = 0;
/* fill the overlay qTD */
qh->qh_qtd.qtd_next = htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_altnext = htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_status = htohc32(sc, EHCI_QTD_HALTED);
}
/* flush all cache into memory */
usb_bus_mem_flush_all(&sc->sc_bus, &ehci_iterate_hw_softc);
#ifdef USB_DEBUG
if (ehcidebug) {
ehci_dump_sqh(sc, sc->sc_async_p_last);
}
#endif
/* final setup */
err = ehci_init_sub(sc);
if (!err) {
/* catch any lost interrupts */
ehci_do_poll(&sc->sc_bus);
}
return (err);
}
/*
* shut down the controller when the system is going down
*/
void
ehci_detach(ehci_softc_t *sc)
{
USB_BUS_LOCK(&sc->sc_bus);
usb_callout_stop(&sc->sc_tmo_pcd);
usb_callout_stop(&sc->sc_tmo_poll);
EOWRITE4(sc, EHCI_USBINTR, 0);
USB_BUS_UNLOCK(&sc->sc_bus);
if (ehci_hcreset(sc)) {
DPRINTF("reset failed!\n");
}
/* XXX let stray task complete */
usb_pause_mtx(NULL, hz / 20);
usb_callout_drain(&sc->sc_tmo_pcd);
usb_callout_drain(&sc->sc_tmo_poll);
}
static void
ehci_suspend(ehci_softc_t *sc)
{
DPRINTF("stopping the HC\n");
/* reset HC */
ehci_hcreset(sc);
}
static void
ehci_resume(ehci_softc_t *sc)
{
/* reset HC */
ehci_hcreset(sc);
/* setup HC */
ehci_init_sub(sc);
/* catch any lost interrupts */
ehci_do_poll(&sc->sc_bus);
}
#ifdef USB_DEBUG
static void
ehci_dump_regs(ehci_softc_t *sc)
{
uint32_t i;
i = EOREAD4(sc, EHCI_USBCMD);
printf("cmd=0x%08x\n", i);
if (i & EHCI_CMD_ITC_1)
printf(" EHCI_CMD_ITC_1\n");
if (i & EHCI_CMD_ITC_2)
printf(" EHCI_CMD_ITC_2\n");
if (i & EHCI_CMD_ITC_4)
printf(" EHCI_CMD_ITC_4\n");
if (i & EHCI_CMD_ITC_8)
printf(" EHCI_CMD_ITC_8\n");
if (i & EHCI_CMD_ITC_16)
printf(" EHCI_CMD_ITC_16\n");
if (i & EHCI_CMD_ITC_32)
printf(" EHCI_CMD_ITC_32\n");
if (i & EHCI_CMD_ITC_64)
printf(" EHCI_CMD_ITC_64\n");
if (i & EHCI_CMD_ASPME)
printf(" EHCI_CMD_ASPME\n");
if (i & EHCI_CMD_ASPMC)
printf(" EHCI_CMD_ASPMC\n");
if (i & EHCI_CMD_LHCR)
printf(" EHCI_CMD_LHCR\n");
if (i & EHCI_CMD_IAAD)
printf(" EHCI_CMD_IAAD\n");
if (i & EHCI_CMD_ASE)
printf(" EHCI_CMD_ASE\n");
if (i & EHCI_CMD_PSE)
printf(" EHCI_CMD_PSE\n");
if (i & EHCI_CMD_FLS_M)
printf(" EHCI_CMD_FLS_M\n");
if (i & EHCI_CMD_HCRESET)
printf(" EHCI_CMD_HCRESET\n");
if (i & EHCI_CMD_RS)
printf(" EHCI_CMD_RS\n");
i = EOREAD4(sc, EHCI_USBSTS);
printf("sts=0x%08x\n", i);
if (i & EHCI_STS_ASS)
printf(" EHCI_STS_ASS\n");
if (i & EHCI_STS_PSS)
printf(" EHCI_STS_PSS\n");
if (i & EHCI_STS_REC)
printf(" EHCI_STS_REC\n");
if (i & EHCI_STS_HCH)
printf(" EHCI_STS_HCH\n");
if (i & EHCI_STS_IAA)
printf(" EHCI_STS_IAA\n");
if (i & EHCI_STS_HSE)
printf(" EHCI_STS_HSE\n");
if (i & EHCI_STS_FLR)
printf(" EHCI_STS_FLR\n");
if (i & EHCI_STS_PCD)
printf(" EHCI_STS_PCD\n");
if (i & EHCI_STS_ERRINT)
printf(" EHCI_STS_ERRINT\n");
if (i & EHCI_STS_INT)
printf(" EHCI_STS_INT\n");
printf("ien=0x%08x\n",
EOREAD4(sc, EHCI_USBINTR));
printf("frindex=0x%08x ctrdsegm=0x%08x periodic=0x%08x async=0x%08x\n",
EOREAD4(sc, EHCI_FRINDEX),
EOREAD4(sc, EHCI_CTRLDSSEGMENT),
EOREAD4(sc, EHCI_PERIODICLISTBASE),
EOREAD4(sc, EHCI_ASYNCLISTADDR));
for (i = 1; i <= sc->sc_noport; i++) {
printf("port %d status=0x%08x\n", i,
EOREAD4(sc, EHCI_PORTSC(i)));
}
}
static void
ehci_dump_link(ehci_softc_t *sc, uint32_t link, int type)
{
link = hc32toh(sc, link);
printf("0x%08x", link);
if (link & EHCI_LINK_TERMINATE)
printf("<T>");
else {
printf("<");
if (type) {
switch (EHCI_LINK_TYPE(link)) {
case EHCI_LINK_ITD:
printf("ITD");
break;
case EHCI_LINK_QH:
printf("QH");
break;
case EHCI_LINK_SITD:
printf("SITD");
break;
case EHCI_LINK_FSTN:
printf("FSTN");
break;
}
}
printf(">");
}
}
static void
ehci_dump_qtd(ehci_softc_t *sc, ehci_qtd_t *qtd)
{
uint32_t s;
printf(" next=");
ehci_dump_link(sc, qtd->qtd_next, 0);
printf(" altnext=");
ehci_dump_link(sc, qtd->qtd_altnext, 0);
printf("\n");
s = hc32toh(sc, qtd->qtd_status);
printf(" status=0x%08x: toggle=%d bytes=0x%x ioc=%d c_page=0x%x\n",
s, EHCI_QTD_GET_TOGGLE(s), EHCI_QTD_GET_BYTES(s),
EHCI_QTD_GET_IOC(s), EHCI_QTD_GET_C_PAGE(s));
printf(" cerr=%d pid=%d stat=%s%s%s%s%s%s%s%s\n",
EHCI_QTD_GET_CERR(s), EHCI_QTD_GET_PID(s),
(s & EHCI_QTD_ACTIVE) ? "ACTIVE" : "NOT_ACTIVE",
(s & EHCI_QTD_HALTED) ? "-HALTED" : "",
(s & EHCI_QTD_BUFERR) ? "-BUFERR" : "",
(s & EHCI_QTD_BABBLE) ? "-BABBLE" : "",
(s & EHCI_QTD_XACTERR) ? "-XACTERR" : "",
(s & EHCI_QTD_MISSEDMICRO) ? "-MISSED" : "",
(s & EHCI_QTD_SPLITXSTATE) ? "-SPLIT" : "",
(s & EHCI_QTD_PINGSTATE) ? "-PING" : "");
for (s = 0; s < 5; s++) {
printf(" buffer[%d]=0x%08x\n", s,
hc32toh(sc, qtd->qtd_buffer[s]));
}
for (s = 0; s < 5; s++) {
printf(" buffer_hi[%d]=0x%08x\n", s,
hc32toh(sc, qtd->qtd_buffer_hi[s]));
}
}
static uint8_t
ehci_dump_sqtd(ehci_softc_t *sc, ehci_qtd_t *sqtd)
{
uint8_t temp;
usb_pc_cpu_invalidate(sqtd->page_cache);
printf("QTD(%p) at 0x%08x:\n", sqtd, hc32toh(sc, sqtd->qtd_self));
ehci_dump_qtd(sc, sqtd);
temp = (sqtd->qtd_next & htohc32(sc, EHCI_LINK_TERMINATE)) ? 1 : 0;
return (temp);
}
static void
ehci_dump_sqtds(ehci_softc_t *sc, ehci_qtd_t *sqtd)
{
uint16_t i;
uint8_t stop;
stop = 0;
for (i = 0; sqtd && (i < 20) && !stop; sqtd = sqtd->obj_next, i++) {
stop = ehci_dump_sqtd(sc, sqtd);
}
if (sqtd) {
printf("dump aborted, too many TDs\n");
}
}
static void
ehci_dump_sqh(ehci_softc_t *sc, ehci_qh_t *qh)
{
uint32_t endp;
uint32_t endphub;
usb_pc_cpu_invalidate(qh->page_cache);
printf("QH(%p) at 0x%08x:\n", qh, hc32toh(sc, qh->qh_self) & ~0x1F);
printf(" link=");
ehci_dump_link(sc, qh->qh_link, 1);
printf("\n");
endp = hc32toh(sc, qh->qh_endp);
printf(" endp=0x%08x\n", endp);
printf(" addr=0x%02x inact=%d endpt=%d eps=%d dtc=%d hrecl=%d\n",
EHCI_QH_GET_ADDR(endp), EHCI_QH_GET_INACT(endp),
EHCI_QH_GET_ENDPT(endp), EHCI_QH_GET_EPS(endp),
EHCI_QH_GET_DTC(endp), EHCI_QH_GET_HRECL(endp));
printf(" mpl=0x%x ctl=%d nrl=%d\n",
EHCI_QH_GET_MPL(endp), EHCI_QH_GET_CTL(endp),
EHCI_QH_GET_NRL(endp));
endphub = hc32toh(sc, qh->qh_endphub);
printf(" endphub=0x%08x\n", endphub);
printf(" smask=0x%02x cmask=0x%02x huba=0x%02x port=%d mult=%d\n",
EHCI_QH_GET_SMASK(endphub), EHCI_QH_GET_CMASK(endphub),
EHCI_QH_GET_HUBA(endphub), EHCI_QH_GET_PORT(endphub),
EHCI_QH_GET_MULT(endphub));
printf(" curqtd=");
ehci_dump_link(sc, qh->qh_curqtd, 0);
printf("\n");
printf("Overlay qTD:\n");
ehci_dump_qtd(sc, (void *)&qh->qh_qtd);
}
static void
ehci_dump_sitd(ehci_softc_t *sc, ehci_sitd_t *sitd)
{
usb_pc_cpu_invalidate(sitd->page_cache);
printf("SITD(%p) at 0x%08x\n", sitd, hc32toh(sc, sitd->sitd_self) & ~0x1F);
printf(" next=0x%08x\n", hc32toh(sc, sitd->sitd_next));
printf(" portaddr=0x%08x dir=%s addr=%d endpt=0x%x port=0x%x huba=0x%x\n",
hc32toh(sc, sitd->sitd_portaddr),
(sitd->sitd_portaddr & htohc32(sc, EHCI_SITD_SET_DIR_IN))
? "in" : "out",
EHCI_SITD_GET_ADDR(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_ENDPT(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_PORT(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_HUBA(hc32toh(sc, sitd->sitd_portaddr)));
printf(" mask=0x%08x\n", hc32toh(sc, sitd->sitd_mask));
printf(" status=0x%08x <%s> len=0x%x\n", hc32toh(sc, sitd->sitd_status),
(sitd->sitd_status & htohc32(sc, EHCI_SITD_ACTIVE)) ? "ACTIVE" : "",
EHCI_SITD_GET_LEN(hc32toh(sc, sitd->sitd_status)));
printf(" back=0x%08x, bp=0x%08x,0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, sitd->sitd_back),
hc32toh(sc, sitd->sitd_bp[0]),
hc32toh(sc, sitd->sitd_bp[1]),
hc32toh(sc, sitd->sitd_bp_hi[0]),
hc32toh(sc, sitd->sitd_bp_hi[1]));
}
static void
ehci_dump_itd(ehci_softc_t *sc, ehci_itd_t *itd)
{
usb_pc_cpu_invalidate(itd->page_cache);
printf("ITD(%p) at 0x%08x\n", itd, hc32toh(sc, itd->itd_self) & ~0x1F);
printf(" next=0x%08x\n", hc32toh(sc, itd->itd_next));
printf(" status[0]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[0]),
(itd->itd_status[0] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[1]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[1]),
(itd->itd_status[1] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[2]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[2]),
(itd->itd_status[2] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[3]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[3]),
(itd->itd_status[3] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[4]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[4]),
(itd->itd_status[4] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[5]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[5]),
(itd->itd_status[5] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[6]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[6]),
(itd->itd_status[6] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[7]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[7]),
(itd->itd_status[7] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" bp[0]=0x%08x\n", hc32toh(sc, itd->itd_bp[0]));
printf(" addr=0x%02x; endpt=0x%01x\n",
EHCI_ITD_GET_ADDR(hc32toh(sc, itd->itd_bp[0])),
EHCI_ITD_GET_ENDPT(hc32toh(sc, itd->itd_bp[0])));
printf(" bp[1]=0x%08x\n", hc32toh(sc, itd->itd_bp[1]));
printf(" dir=%s; mpl=0x%02x\n",
(hc32toh(sc, itd->itd_bp[1]) & EHCI_ITD_SET_DIR_IN) ? "in" : "out",
EHCI_ITD_GET_MPL(hc32toh(sc, itd->itd_bp[1])));
printf(" bp[2..6]=0x%08x,0x%08x,0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, itd->itd_bp[2]),
hc32toh(sc, itd->itd_bp[3]),
hc32toh(sc, itd->itd_bp[4]),
hc32toh(sc, itd->itd_bp[5]),
hc32toh(sc, itd->itd_bp[6]));
printf(" bp_hi=0x%08x,0x%08x,0x%08x,0x%08x,\n"
" 0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, itd->itd_bp_hi[0]),
hc32toh(sc, itd->itd_bp_hi[1]),
hc32toh(sc, itd->itd_bp_hi[2]),
hc32toh(sc, itd->itd_bp_hi[3]),
hc32toh(sc, itd->itd_bp_hi[4]),
hc32toh(sc, itd->itd_bp_hi[5]),
hc32toh(sc, itd->itd_bp_hi[6]));
}
static void
ehci_dump_isoc(ehci_softc_t *sc)
{
ehci_itd_t *itd;
ehci_sitd_t *sitd;
uint16_t max = 1000;
uint16_t pos;
pos = (EOREAD4(sc, EHCI_FRINDEX) / 8) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
printf("%s: isochronous dump from frame 0x%03x:\n",
__FUNCTION__, pos);
itd = sc->sc_isoc_hs_p_last[pos];
sitd = sc->sc_isoc_fs_p_last[pos];
while (itd && max && max--) {
ehci_dump_itd(sc, itd);
itd = itd->prev;
}
while (sitd && max && max--) {
ehci_dump_sitd(sc, sitd);
sitd = sitd->prev;
}
}
#endif
static void
ehci_transfer_intr_enqueue(struct usb_xfer *xfer)
{
/* check for early completion */
if (ehci_check_transfer(xfer)) {
return;
}
/* put transfer on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0) {
usbd_transfer_timeout_ms(xfer, &ehci_timeout, xfer->timeout);
}
}
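/*
 * The _ehci_append_*() and _ehci_remove_*() helpers below maintain two
 * parallel lists: the software doubly linked list ("next"/"prev") used
 * for traversal, and the physical-address links (sitd_next, itd_next,
 * qh_link) followed by the host controller. A new element is fully
 * written and flushed before the predecessor's hardware link is made
 * to point at it.
 */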
#define EHCI_APPEND_FS_TD(std,last) (last) = _ehci_append_fs_td(std,last)
static ehci_sitd_t *
_ehci_append_fs_td(ehci_sitd_t *std, ehci_sitd_t *last)
{
DPRINTFN(11, "%p to %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->next = last->next;
std->sitd_next = last->sitd_next;
std->prev = last;
usb_pc_cpu_flush(std->page_cache);
/*
* the last->next->prev is never followed: std->next->prev = std;
*/
last->next = std;
last->sitd_next = std->sitd_self;
usb_pc_cpu_flush(last->page_cache);
return (std);
}
#define EHCI_APPEND_HS_TD(std,last) (last) = _ehci_append_hs_td(std,last)
static ehci_itd_t *
_ehci_append_hs_td(ehci_itd_t *std, ehci_itd_t *last)
{
DPRINTFN(11, "%p to %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->next = last->next;
std->itd_next = last->itd_next;
std->prev = last;
usb_pc_cpu_flush(std->page_cache);
/*
* the last->next->prev is never followed: std->next->prev = std;
*/
last->next = std;
last->itd_next = std->itd_self;
usb_pc_cpu_flush(last->page_cache);
return (std);
}
#define EHCI_APPEND_QH(sqh,last) (last) = _ehci_append_qh(sqh,last)
static ehci_qh_t *
_ehci_append_qh(ehci_qh_t *sqh, ehci_qh_t *last)
{
DPRINTFN(11, "%p to %p\n", sqh, last);
if (sqh->prev != NULL) {
/* should not happen */
DPRINTFN(0, "QH already linked!\n");
return (last);
}
/* (sc->sc_bus.mtx) must be locked */
sqh->next = last->next;
sqh->qh_link = last->qh_link;
sqh->prev = last;
usb_pc_cpu_flush(sqh->page_cache);
/*
* the last->next->prev is never followed: sqh->next->prev = sqh;
*/
last->next = sqh;
last->qh_link = sqh->qh_self;
usb_pc_cpu_flush(last->page_cache);
return (sqh);
}
#define EHCI_REMOVE_FS_TD(std,last) (last) = _ehci_remove_fs_td(std,last)
static ehci_sitd_t *
_ehci_remove_fs_td(ehci_sitd_t *std, ehci_sitd_t *last)
{
DPRINTFN(11, "%p from %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->prev->next = std->next;
std->prev->sitd_next = std->sitd_next;
usb_pc_cpu_flush(std->prev->page_cache);
if (std->next) {
std->next->prev = std->prev;
usb_pc_cpu_flush(std->next->page_cache);
}
return ((last == std) ? std->prev : last);
}
#define EHCI_REMOVE_HS_TD(std,last) (last) = _ehci_remove_hs_td(std,last)
static ehci_itd_t *
_ehci_remove_hs_td(ehci_itd_t *std, ehci_itd_t *last)
{
DPRINTFN(11, "%p from %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->prev->next = std->next;
std->prev->itd_next = std->itd_next;
usb_pc_cpu_flush(std->prev->page_cache);
if (std->next) {
std->next->prev = std->prev;
usb_pc_cpu_flush(std->next->page_cache);
}
return ((last == std) ? std->prev : last);
}
#define EHCI_REMOVE_QH(sqh,last) (last) = _ehci_remove_qh(sqh,last)
static ehci_qh_t *
_ehci_remove_qh(ehci_qh_t *sqh, ehci_qh_t *last)
{
DPRINTFN(11, "%p from %p\n", sqh, last);
/* (sc->sc_bus.mtx) must be locked */
/* only remove if not removed from a queue */
if (sqh->prev) {
sqh->prev->next = sqh->next;
sqh->prev->qh_link = sqh->qh_link;
usb_pc_cpu_flush(sqh->prev->page_cache);
if (sqh->next) {
sqh->next->prev = sqh->prev;
usb_pc_cpu_flush(sqh->next->page_cache);
}
last = ((last == sqh) ? sqh->prev : last);
sqh->prev = 0;
usb_pc_cpu_flush(sqh->page_cache);
}
return (last);
}
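/*
 * The endpoint data toggle flips once for every packet on the bus, so
 * the net change is the parity of the packet count: actlen/mps full
 * packets, plus one more for a trailing short packet, a trailing zero
 * length packet, or a zero length transfer. For example, with
 * max_packet_size = 512 and actlen = xlen = 1280, three packets
 * (512 + 512 + 256) were transferred, so the toggle advances by one.
 */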
static void
ehci_data_toggle_update(struct usb_xfer *xfer, uint16_t actlen, uint16_t xlen)
{
uint16_t rem;
uint8_t dt;
/* count number of full packets */
dt = (actlen / xfer->max_packet_size) & 1;
/* compute remainder */
rem = actlen % xfer->max_packet_size;
if (rem > 0)
dt ^= 1; /* short packet at the end */
else if (actlen != xlen)
dt ^= 1; /* zero length packet at the end */
else if (xlen == 0)
dt ^= 1; /* zero length transfer */
xfer->endpoint->toggle_next ^= dt;
}
static usb_error_t
ehci_non_isoc_done_sub(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_qtd_t *td;
ehci_qtd_t *td_alt_next;
uint32_t status;
uint16_t len;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
if (xfer->aframes != xfer->nframes) {
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->qtd_status);
len = EHCI_QTD_GET_BYTES(status);
/*
* Verify the status length and
* add the length to "frlengths[]":
*/
if (len > td->len) {
/* should not happen */
DPRINTF("Invalid status length, "
"0x%04x/0x%04x bytes\n", len, td->len);
status |= EHCI_QTD_HALTED;
} else if (xfer->aframes != xfer->nframes) {
xfer->frlengths[xfer->aframes] += td->len - len;
/* manually update data toggle */
ehci_data_toggle_update(xfer, td->len - len, td->len);
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check for transfer error */
if (status & EHCI_QTD_HALTED) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (len > 0) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
#ifdef USB_DEBUG
if (status & EHCI_QTD_STATERRS) {
DPRINTFN(11, "error, addr=%d, endpt=0x%02x, frame=0x%02x"
"status=%s%s%s%s%s%s%s%s\n",
xfer->address, xfer->endpointno, xfer->aframes,
(status & EHCI_QTD_ACTIVE) ? "[ACTIVE]" : "[NOT_ACTIVE]",
(status & EHCI_QTD_HALTED) ? "[HALTED]" : "",
(status & EHCI_QTD_BUFERR) ? "[BUFERR]" : "",
(status & EHCI_QTD_BABBLE) ? "[BABBLE]" : "",
(status & EHCI_QTD_XACTERR) ? "[XACTERR]" : "",
(status & EHCI_QTD_MISSEDMICRO) ? "[MISSED]" : "",
(status & EHCI_QTD_SPLITXSTATE) ? "[SPLIT]" : "",
(status & EHCI_QTD_PINGSTATE) ? "[PING]" : "");
}
#endif
if (status & EHCI_QTD_HALTED) {
if ((xfer->xroot->udev->parent_hs_hub != NULL) ||
(xfer->xroot->udev->address != 0)) {
/* try to separate I/O errors from STALL */
if (EHCI_QTD_GET_CERR(status) == 0)
return (USB_ERR_IOERROR);
}
return (USB_ERR_STALLED);
}
return (USB_ERR_NORMAL_COMPLETION);
}
static void
ehci_non_isoc_done(struct usb_xfer *xfer)
{
- ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_qh_t *qh;
- uint32_t status;
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
#ifdef USB_DEBUG
if (ehcidebug > 10) {
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_dump_sqtds(sc, xfer->td_transfer_first);
}
#endif
/* extract data toggle directly from the QH's overlay area */
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(qh->page_cache);
-
- status = hc32toh(sc, qh->qh_qtd.qtd_status);
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
err = ehci_non_isoc_done_sub(xfer);
}
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
while (xfer->aframes != xfer->nframes) {
err = ehci_non_isoc_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
err = ehci_non_isoc_done_sub(xfer);
}
done:
ehci_device_done(xfer, err);
}
/*------------------------------------------------------------------------*
* ehci_check_transfer
*
* Return values:
* 0: USB transfer is not finished
* Else: USB transfer is finished
*------------------------------------------------------------------------*/
static uint8_t
ehci_check_transfer(struct usb_xfer *xfer)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
uint32_t status;
DPRINTFN(13, "xfer=%p checking transfer\n", xfer);
if (methods == &ehci_device_isoc_fs_methods) {
ehci_sitd_t *td;
/* isochronous full speed transfer */
td = xfer->td_transfer_last;
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->sitd_status);
/* also check if first is complete */
td = xfer->td_transfer_first;
usb_pc_cpu_invalidate(td->page_cache);
status |= hc32toh(sc, td->sitd_status);
if (!(status & EHCI_SITD_ACTIVE)) {
ehci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
goto transferred;
}
} else if (methods == &ehci_device_isoc_hs_methods) {
ehci_itd_t *td;
/* isochronous high speed transfer */
/* check last transfer */
td = xfer->td_transfer_last;
usb_pc_cpu_invalidate(td->page_cache);
status = td->itd_status[0];
status |= td->itd_status[1];
status |= td->itd_status[2];
status |= td->itd_status[3];
status |= td->itd_status[4];
status |= td->itd_status[5];
status |= td->itd_status[6];
status |= td->itd_status[7];
/* also check first transfer */
td = xfer->td_transfer_first;
usb_pc_cpu_invalidate(td->page_cache);
status |= td->itd_status[0];
status |= td->itd_status[1];
status |= td->itd_status[2];
status |= td->itd_status[3];
status |= td->itd_status[4];
status |= td->itd_status[5];
status |= td->itd_status[6];
status |= td->itd_status[7];
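/*
 * The sixteen status words are accumulated in host-controller byte
 * order and compared against htohc32(sc, EHCI_ITD_ACTIVE) below,
 * which avoids byte-swapping every word individually.
 */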
/* if no transactions are active we continue */
if (!(status & htohc32(sc, EHCI_ITD_ACTIVE))) {
ehci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
goto transferred;
}
} else {
ehci_qtd_t *td;
ehci_qh_t *qh;
/* non-isochronous transfer */
/*
* check whether there is an error somewhere in the middle,
* or whether there was a short packet (SPD and not ACTIVE)
*/
td = xfer->td_transfer_cache;
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(qh->page_cache);
status = hc32toh(sc, qh->qh_qtd.qtd_status);
if (status & EHCI_QTD_ACTIVE) {
/* transfer is pending */
goto done;
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->qtd_status);
/*
* Check if there is an active TD which
* indicates that the transfer isn't done.
*/
if (status & EHCI_QTD_ACTIVE) {
/* update cache */
xfer->td_transfer_cache = td;
goto done;
}
/*
* last transfer descriptor makes the transfer done
*/
if (((void *)td) == xfer->td_transfer_last) {
break;
}
/*
* any kind of error makes the transfer done
*/
if (status & EHCI_QTD_HALTED) {
break;
}
/*
* if there is no alternate next transfer, a short
* packet also makes the transfer done
*/
if (EHCI_QTD_GET_BYTES(status)) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
if (td->alt_next) {
td = td->alt_next;
continue;
}
}
/* transfer is done */
break;
}
td = td->obj_next;
}
ehci_non_isoc_done(xfer);
goto transferred;
}
done:
DPRINTFN(13, "xfer=%p is still active\n", xfer);
return (0);
transferred:
return (1);
}
static void
ehci_pcd_enable(ehci_softc_t *sc)
{
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
sc->sc_eintrs |= EHCI_STS_PCD;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
/* acknowledge any PCD interrupt */
EOWRITE4(sc, EHCI_USBSTS, EHCI_STS_PCD);
ehci_root_intr(sc);
}
static void
ehci_interrupt_poll(ehci_softc_t *sc)
{
struct usb_xfer *xfer;
repeat:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/*
* check if transfer is transferred
*/
if (ehci_check_transfer(xfer)) {
/* queue has been modified */
goto repeat;
}
}
}
/*
* Some EHCI chips from VIA / ATI seem to trigger interrupts before
* writing back the qTD status, or miss signalling occasionally under
* heavy load. If the host machine is too fast, we can miss
* transaction completion - when we scan the active list the
* transaction still seems to be active. This generally exhibits
* itself as a umass stall that never recovers.
*
* We work around this behaviour by setting up this callback after any
* softintr that completes with transactions still pending, giving us
* another chance to check for completion after the writeback has
* taken place.
*/
static void
ehci_poll_timeout(void *arg)
{
ehci_softc_t *sc = arg;
DPRINTFN(3, "\n");
ehci_interrupt_poll(sc);
}
/*------------------------------------------------------------------------*
* ehci_interrupt - EHCI interrupt handler
*
* NOTE: Do not access "sc->sc_bus.bdev" inside the interrupt handler,
* because the interrupt handler may be set up before "sc->sc_bus.bdev"
* is present!
*------------------------------------------------------------------------*/
void
ehci_interrupt(ehci_softc_t *sc)
{
uint32_t status;
USB_BUS_LOCK(&sc->sc_bus);
DPRINTFN(16, "real interrupt\n");
#ifdef USB_DEBUG
if (ehcidebug > 15) {
ehci_dump_regs(sc);
}
#endif
status = EHCI_STS_INTRS(EOREAD4(sc, EHCI_USBSTS));
if (status == 0) {
/* the interrupt was not for us */
goto done;
}
if (!(status & sc->sc_eintrs)) {
goto done;
}
EOWRITE4(sc, EHCI_USBSTS, status); /* acknowledge */
status &= sc->sc_eintrs;
if (status & EHCI_STS_HSE) {
printf("%s: unrecoverable error, "
"controller halted\n", __FUNCTION__);
#ifdef USB_DEBUG
ehci_dump_regs(sc);
ehci_dump_isoc(sc);
#endif
}
if (status & EHCI_STS_PCD) {
/*
* Disable PCD interrupt for now, because it will be
* on until the port has been reset.
*/
sc->sc_eintrs &= ~EHCI_STS_PCD;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
ehci_root_intr(sc);
/* do not allow RHSC interrupts > 1 per second */
usb_callout_reset(&sc->sc_tmo_pcd, hz,
(void *)&ehci_pcd_enable, sc);
}
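/*
 * Mask off the bits that are handled elsewhere (transfer completion,
 * transfer error, port change and doorbell); anything that is still
 * set is disabled below so that a stuck status bit cannot cause an
 * interrupt storm.
 */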
status &= ~(EHCI_STS_INT | EHCI_STS_ERRINT | EHCI_STS_PCD | EHCI_STS_IAA);
if (status != 0) {
/* block unprocessed interrupts */
sc->sc_eintrs &= ~status;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
printf("%s: blocking interrupts 0x%x\n", __FUNCTION__, status);
}
/* poll all the USB transfers */
ehci_interrupt_poll(sc);
if (sc->sc_flags & EHCI_SCFLG_LOSTINTRBUG) {
usb_callout_reset(&sc->sc_tmo_poll, hz / 128,
(void *)&ehci_poll_timeout, sc);
}
done:
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*
* called when a request does not complete
*/
static void
ehci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* transfer is transferred */
ehci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
ehci_do_poll(struct usb_bus *bus)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
ehci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
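/*
 * ehci_setup_standard_chain_sub() walks the frame twice. The first,
 * "precompute" pass only consumes TDs and length in order to find the
 * TD that follows this frame, which is needed to set up the alternate
 * next ("qtd_altnext") pointers; the length is then restored and the
 * second pass fills in the transfer descriptors for real.
 */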
static void
ehci_setup_standard_chain_sub(struct ehci_std_temp *temp)
{
struct usb_page_search buf_res;
ehci_qtd_t *td;
ehci_qtd_t *td_next;
ehci_qtd_t *td_alt_next;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint32_t terminate;
uint32_t qtd_altnext;
uint8_t shortpkt_old;
uint8_t precompute;
terminate = temp->sc->sc_terminate_self;
qtd_altnext = temp->sc->sc_terminate_self;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
precompute = 1;
restart:
td = temp->td;
td_next = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt) {
break;
}
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_frame_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL) {
panic("%s: out of EHCI transfer descriptors!", __FUNCTION__);
}
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->qtd_status =
temp->qtd_status |
htohc32(temp->sc, EHCI_QTD_IOC |
EHCI_QTD_SET_BYTES(average));
if (average == 0) {
if (temp->auto_data_toggle == 0) {
/* update data toggle, ZLP case */
temp->qtd_status ^=
htohc32(temp->sc, EHCI_QTD_TOGGLE_MASK);
}
td->len = 0;
/* properly reset reserved fields */
td->qtd_buffer[0] = 0;
td->qtd_buffer[1] = 0;
td->qtd_buffer[2] = 0;
td->qtd_buffer[3] = 0;
td->qtd_buffer[4] = 0;
td->qtd_buffer_hi[0] = 0;
td->qtd_buffer_hi[1] = 0;
td->qtd_buffer_hi[2] = 0;
td->qtd_buffer_hi[3] = 0;
td->qtd_buffer_hi[4] = 0;
} else {
uint8_t x;
if (temp->auto_data_toggle == 0) {
/* update data toggle */
if (howmany(average, temp->max_frame_size) & 1) {
temp->qtd_status ^=
htohc32(temp->sc, EHCI_QTD_TOGGLE_MASK);
}
}
td->len = average;
/* update remaining length */
temp->len -= average;
/* fill out buffer pointers */
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->qtd_buffer[0] =
htohc32(temp->sc, buf_res.physaddr);
td->qtd_buffer_hi[0] = 0;
x = 1;
while (average > EHCI_PAGE_SIZE) {
average -= EHCI_PAGE_SIZE;
buf_offset += EHCI_PAGE_SIZE;
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->qtd_buffer[x] =
htohc32(temp->sc,
buf_res.physaddr & (~0xFFF));
td->qtd_buffer_hi[x] = 0;
x++;
}
/*
* NOTE: The "average" variable is never zero after
* exiting the loop above !
*
* NOTE: We have to subtract one from the offset to
* ensure that we are computing the physical address
* of a valid page !
*/
buf_offset += average;
usbd_get_page(temp->pc, buf_offset - 1, &buf_res);
td->qtd_buffer[x] =
htohc32(temp->sc,
buf_res.physaddr & (~0xFFF));
td->qtd_buffer_hi[x] = 0;
/* properly reset reserved fields */
while (++x < EHCI_QTD_NBUFFERS) {
td->qtd_buffer[x] = 0;
td->qtd_buffer_hi[x] = 0;
}
}
if (td_next) {
/* link the current TD with the next one */
td->qtd_next = td_next->qtd_self;
}
td->qtd_altnext = qtd_altnext;
td->alt_next = td_alt_next;
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* setup alt next pointer, if any */
if (temp->last_frame) {
td_alt_next = NULL;
qtd_altnext = terminate;
} else {
/* we use this field internally */
td_alt_next = td_next;
if (temp->setup_alt_next) {
qtd_altnext = td_next->qtd_self;
} else {
qtd_altnext = terminate;
}
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
temp->td = td;
temp->td_next = td_next;
}
static void
ehci_setup_standard_chain(struct usb_xfer *xfer, ehci_qh_t **qh_last)
{
struct ehci_std_temp temp;
const struct usb_pipe_methods *methods;
ehci_qh_t *qh;
ehci_qtd_t *td;
uint32_t qh_endp;
uint32_t qh_endphub;
uint32_t x;
DPRINTFN(9, "addr=%d endpt=%d sumlen=%d speed=%d\n",
xfer->address, UE_GET_ADDR(xfer->endpointno),
xfer->sumlen, usbd_get_speed(xfer->xroot->udev));
temp.average = xfer->max_hc_frame_size;
temp.max_frame_size = xfer->max_frame_size;
temp.sc = EHCI_BUS2SC(xfer->xroot->bus);
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
temp.td = NULL;
temp.td_next = td;
temp.qtd_status = 0;
temp.last_frame = 0;
temp.setup_alt_next = xfer->flags_int.short_frames_ok;
if (xfer->flags_int.control_xfr) {
if (xfer->endpoint->toggle_next) {
/* DATA1 is next */
temp.qtd_status |=
htohc32(temp.sc, EHCI_QTD_SET_TOGGLE(1));
}
temp.auto_data_toggle = 0;
} else {
temp.auto_data_toggle = 1;
}
if ((xfer->xroot->udev->parent_hs_hub != NULL) ||
(xfer->xroot->udev->address != 0)) {
/* max 3 retries */
temp.qtd_status |=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3));
}
/* check if we should prepend a setup message */
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
xfer->endpoint->toggle_next = 0;
temp.qtd_status &=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3));
temp.qtd_status |= htohc32(temp.sc,
EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_SETUP) |
EHCI_QTD_SET_TOGGLE(0));
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
ehci_setup_standard_chain_sub(&temp);
}
x = 1;
} else {
x = 0;
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.pc = xfer->frbuffers + x;
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
} else {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
/* keep previous data toggle and error count */
temp.qtd_status &=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3) |
EHCI_QTD_SET_TOGGLE(1));
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
} else {
/* regular data transfer */
temp.shortpkt = (xfer->flags.force_short_xfer) ? 0 : 1;
}
/* set endpoint direction */
temp.qtd_status |=
(UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) ?
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_IN)) :
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_OUT));
ehci_setup_standard_chain_sub(&temp);
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current endpoint
* direction.
*/
temp.qtd_status &= htohc32(temp.sc, EHCI_QTD_SET_CERR(3) |
EHCI_QTD_SET_TOGGLE(1));
temp.qtd_status |=
(UE_GET_DIR(xfer->endpointno) == UE_DIR_OUT) ?
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_IN) |
EHCI_QTD_SET_TOGGLE(1)) :
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_OUT) |
EHCI_QTD_SET_TOGGLE(1));
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.setup_alt_next = 0;
ehci_setup_standard_chain_sub(&temp);
}
td = temp.td;
/* the last TD terminates the transfer: */
td->qtd_next = htohc32(temp.sc, EHCI_LINK_TERMINATE);
td->qtd_altnext = htohc32(temp.sc, EHCI_LINK_TERMINATE);
usb_pc_cpu_flush(td->page_cache);
/* must have at least one frame! */
xfer->td_transfer_last = td;
#ifdef USB_DEBUG
if (ehcidebug > 8) {
DPRINTF("nexttog=%d; data before transfer:\n",
xfer->endpoint->toggle_next);
ehci_dump_sqtds(temp.sc,
xfer->td_transfer_first);
}
#endif
methods = xfer->endpoint->methods;
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
/* the "qh_link" field is filled when the QH is added */
qh_endp =
(EHCI_QH_SET_ADDR(xfer->address) |
EHCI_QH_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)) |
EHCI_QH_SET_MPL(xfer->max_packet_size));
if (usbd_get_speed(xfer->xroot->udev) == USB_SPEED_HIGH) {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH);
if (methods != &ehci_device_intr_methods)
qh_endp |= EHCI_QH_SET_NRL(8);
} else {
if (usbd_get_speed(xfer->xroot->udev) == USB_SPEED_FULL) {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_FULL);
} else {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_LOW);
}
if (methods == &ehci_device_ctrl_methods) {
qh_endp |= EHCI_QH_CTL;
}
if (methods != &ehci_device_intr_methods) {
/* Only try one time per microframe! */
qh_endp |= EHCI_QH_SET_NRL(1);
}
}
if (temp.auto_data_toggle == 0) {
/* software computes the data toggle */
qh_endp |= EHCI_QH_DTC;
}
qh->qh_endp = htohc32(temp.sc, qh_endp);
qh_endphub =
(EHCI_QH_SET_MULT(xfer->max_packet_count & 3) |
EHCI_QH_SET_CMASK(xfer->endpoint->usb_cmask) |
EHCI_QH_SET_SMASK(xfer->endpoint->usb_smask) |
EHCI_QH_SET_HUBA(xfer->xroot->udev->hs_hub_addr) |
EHCI_QH_SET_PORT(xfer->xroot->udev->hs_port_no));
qh->qh_endphub = htohc32(temp.sc, qh_endphub);
qh->qh_curqtd = 0;
/* fill the overlay qTD */
if (temp.auto_data_toggle && xfer->endpoint->toggle_next) {
/* DATA1 is next */
qh->qh_qtd.qtd_status = htohc32(temp.sc, EHCI_QTD_SET_TOGGLE(1));
} else {
qh->qh_qtd.qtd_status = 0;
}
td = xfer->td_transfer_first;
qh->qh_qtd.qtd_next = td->qtd_self;
qh->qh_qtd.qtd_altnext =
htohc32(temp.sc, EHCI_LINK_TERMINATE);
/* properly reset reserved fields */
qh->qh_qtd.qtd_buffer[0] = 0;
qh->qh_qtd.qtd_buffer[1] = 0;
qh->qh_qtd.qtd_buffer[2] = 0;
qh->qh_qtd.qtd_buffer[3] = 0;
qh->qh_qtd.qtd_buffer[4] = 0;
qh->qh_qtd.qtd_buffer_hi[0] = 0;
qh->qh_qtd.qtd_buffer_hi[1] = 0;
qh->qh_qtd.qtd_buffer_hi[2] = 0;
qh->qh_qtd.qtd_buffer_hi[3] = 0;
qh->qh_qtd.qtd_buffer_hi[4] = 0;
usb_pc_cpu_flush(qh->page_cache);
if (xfer->xroot->udev->flags.self_suspended == 0) {
EHCI_APPEND_QH(qh, *qh_last);
}
}
static void
ehci_root_intr(ehci_softc_t *sc)
{
uint16_t i;
uint16_t m;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
/* set bits */
m = (sc->sc_noport + 1);
if (m > (8 * sizeof(sc->sc_hub_idata))) {
m = (8 * sizeof(sc->sc_hub_idata));
}
for (i = 1; i < m; i++) {
/* pick out CHANGE bits from the status register */
if (EOREAD4(sc, EHCI_PORTSC(i)) & EHCI_PS_CLEAR) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
static void
ehci_isoc_fs_done(ehci_softc_t *sc, struct usb_xfer *xfer)
{
uint32_t nframes = xfer->nframes;
uint32_t status;
uint32_t *plen = xfer->frlengths;
uint16_t len = 0;
ehci_sitd_t *td = xfer->td_transfer_first;
ehci_sitd_t **pp_last = &sc->sc_isoc_fs_p_last[xfer->qh_pos];
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_fs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_fs_p_last[0];
}
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("isoc FS-TD\n");
ehci_dump_sitd(sc, td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->sitd_status);
len = EHCI_SITD_GET_LEN(status);
DPRINTFN(2, "status=0x%08x, rem=%u\n", status, len);
if (*plen >= len) {
len = *plen - len;
} else {
len = 0;
}
*plen = len;
/* remove FS-TD from schedule */
EHCI_REMOVE_FS_TD(td, *pp_last);
pp_last++;
plen++;
td = td->obj_next;
}
xfer->aframes = xfer->nframes;
}
static void
ehci_isoc_hs_done(ehci_softc_t *sc, struct usb_xfer *xfer)
{
uint32_t nframes = xfer->nframes;
uint32_t status;
uint32_t *plen = xfer->frlengths;
uint16_t len = 0;
uint8_t td_no = 0;
ehci_itd_t *td = xfer->td_transfer_first;
ehci_itd_t **pp_last = &sc->sc_isoc_hs_p_last[xfer->qh_pos];
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
while (nframes) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_hs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_hs_p_last[0];
}
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("isoc HS-TD\n");
ehci_dump_itd(sc, td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->itd_status[td_no]);
len = EHCI_ITD_GET_LEN(status);
DPRINTFN(2, "status=0x%08x, len=%u\n", status, len);
if (xfer->endpoint->usb_smask & (1 << td_no)) {
if (*plen >= len) {
/*
* The length is valid. NOTE: The
* complete length is written back
* into the status field, and not the
* remainder like with other transfer
* descriptor types.
*/
} else {
/* Invalid length - truncate */
len = 0;
}
*plen = len;
plen++;
nframes--;
}
td_no++;
if ((td_no == 8) || (nframes == 0)) {
/* remove HS-TD from schedule */
EHCI_REMOVE_HS_TD(td, *pp_last);
pp_last++;
td_no = 0;
td = td->obj_next;
}
}
xfer->aframes = xfer->nframes;
}
/* NOTE: "done" can be run two times in a row,
* from close and from interrupt
*/
static void
ehci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
#ifdef USB_DEBUG
if (ehcidebug > 8) {
DPRINTF("nexttog=%d; data after transfer:\n",
xfer->endpoint->toggle_next);
ehci_dump_sqtds(sc,
xfer->td_transfer_first);
}
#endif
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
/*
* Only finish isochronous transfers once which will update
* "xfer->frlengths".
*/
if (xfer->td_transfer_first &&
xfer->td_transfer_last) {
if (methods == &ehci_device_isoc_fs_methods) {
ehci_isoc_fs_done(sc, xfer);
}
if (methods == &ehci_device_isoc_hs_methods) {
ehci_isoc_hs_done(sc, xfer);
}
xfer->td_transfer_first = NULL;
xfer->td_transfer_last = NULL;
}
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* ehci bulk support
*------------------------------------------------------------------------*/
static void
ehci_device_bulk_open(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_bulk_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_bulk_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_doorbell_async(struct ehci_softc *sc)
{
uint32_t temp;
/*
* XXX Performance quirk: Some Host Controllers have too low an
* interrupt rate. Issue an IAAD to stimulate the Host
* Controller after queueing the BULK transfer.
*
* XXX Force the host controller to refresh any QH caches.
*/
temp = EOREAD4(sc, EHCI_USBCMD);
if (!(temp & EHCI_CMD_IAAD))
EOWRITE4(sc, EHCI_USBCMD, temp | EHCI_CMD_IAAD);
}
static void
ehci_device_bulk_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_async_p_last);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
/*
* XXX Certain nVidia chipsets choke when using the IAAD
* feature too frequently.
*/
if (sc->sc_flags & EHCI_SCFLG_IAADBUG)
return;
ehci_doorbell_async(sc);
}
static const struct usb_pipe_methods ehci_device_bulk_methods =
{
.open = ehci_device_bulk_open,
.close = ehci_device_bulk_close,
.enter = ehci_device_bulk_enter,
.start = ehci_device_bulk_start,
};
/*------------------------------------------------------------------------*
* ehci control support
*------------------------------------------------------------------------*/
static void
ehci_device_ctrl_open(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_ctrl_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_ctrl_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_ctrl_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_async_p_last);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_ctrl_methods =
{
.open = ehci_device_ctrl_open,
.close = ehci_device_ctrl_close,
.enter = ehci_device_ctrl_enter,
.start = ehci_device_ctrl_start,
};
/*------------------------------------------------------------------------*
* ehci interrupt support
*------------------------------------------------------------------------*/
static void
ehci_device_intr_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
uint16_t best;
uint16_t bit;
uint16_t x;
usb_hs_bandwidth_alloc(xfer);
/*
* Find the best QH position corresponding to the given interval:
*/
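/*
 * The scan below finds the largest power-of-two interval that does not
 * exceed the requested one and, within that bucket's index range
 * [bit, 2 * bit), picks the least used slot so that periodic transfers
 * are spread evenly across the QH tree built in ehci_init().
 */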
best = 0;
bit = EHCI_VIRTUAL_FRAMELIST_COUNT / 2;
while (bit) {
if (xfer->interval >= bit) {
x = bit;
best = bit;
while (x & bit) {
if (sc->sc_intr_stat[x] <
sc->sc_intr_stat[best]) {
best = x;
}
x++;
}
break;
}
bit >>= 1;
}
sc->sc_intr_stat[best]++;
xfer->qh_pos = best;
DPRINTFN(3, "best=%d interval=%d\n",
best, xfer->interval);
}
static void
ehci_device_intr_close(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
sc->sc_intr_stat[xfer->qh_pos]--;
ehci_device_done(xfer, USB_ERR_CANCELLED);
/* bandwidth must be freed after device done */
usb_hs_bandwidth_free(xfer);
}
static void
ehci_device_intr_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_intr_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_intr_p_last[xfer->qh_pos]);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_intr_methods =
{
.open = ehci_device_intr_open,
.close = ehci_device_intr_close,
.enter = ehci_device_intr_enter,
.start = ehci_device_intr_start,
};
/*------------------------------------------------------------------------*
* ehci full speed isochronous support
*------------------------------------------------------------------------*/
static void
ehci_device_isoc_fs_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_sitd_t *td;
uint32_t sitd_portaddr;
uint8_t ds;
sitd_portaddr =
EHCI_SITD_SET_ADDR(xfer->address) |
EHCI_SITD_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)) |
EHCI_SITD_SET_HUBA(xfer->xroot->udev->hs_hub_addr) |
EHCI_SITD_SET_PORT(xfer->xroot->udev->hs_port_no);
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN)
sitd_portaddr |= EHCI_SITD_SET_DIR_IN;
sitd_portaddr = htohc32(sc, sitd_portaddr);
/* initialize all TD's */
for (ds = 0; ds != 2; ds++) {
for (td = xfer->td_start[ds]; td; td = td->obj_next) {
td->sitd_portaddr = sitd_portaddr;
/*
* TODO: make some kind of automatic
* SMASK/CMASK selection based on micro-frame
* usage
*
* micro-frame usage (8 microframes per 1ms)
*/
td->sitd_back = htohc32(sc, EHCI_LINK_TERMINATE);
usb_pc_cpu_flush(td->page_cache);
}
}
}
static void
ehci_device_isoc_fs_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_isoc_fs_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_sitd_t *td;
ehci_sitd_t *td_last = NULL;
ehci_sitd_t **pp_last;
uint32_t *plen;
uint32_t buf_offset;
uint32_t nframes;
uint32_t temp;
uint32_t sitd_mask;
uint16_t tlen;
uint8_t sa;
uint8_t sb;
#ifdef USB_DEBUG
uint8_t once = 1;
#endif
DPRINTFN(6, "xfer=%p next=%d nframes=%d\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes);
/* get the current frame index */
nframes = EOREAD4(sc, EHCI_FRINDEX) / 8;
/*
* check if the frame index is within the window where the frames
* will be inserted
*/
buf_offset = (nframes - xfer->endpoint->isoc_next) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
if ((xfer->endpoint->is_synced == 0) ||
(buf_offset < xfer->nframes)) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = (xfer->endpoint->isoc_next - nframes) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
usb_isoc_time_expand(&sc->sc_bus, nframes) +
buf_offset + xfer->nframes;
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
pp_last = &sc->sc_isoc_fs_p_last[xfer->endpoint->isoc_next];
/* store starting position */
xfer->qh_pos = xfer->endpoint->isoc_next;
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_fs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT])
pp_last = &sc->sc_isoc_fs_p_last[0];
/* reuse sitd_portaddr and sitd_back from last transfer */
if (*plen > xfer->max_frame_size) {
#ifdef USB_DEBUG
if (once) {
once = 0;
printf("%s: frame length(%d) exceeds %d "
"bytes (frame truncated)\n",
__FUNCTION__, *plen,
xfer->max_frame_size);
}
#endif
*plen = xfer->max_frame_size;
}
/* allocate a slot */
sa = usbd_fs_isoc_schedule_alloc_slot(xfer,
xfer->isoc_time_complete - nframes - 1);
if (sa == 255) {
/*
* Schedule is FULL, set length to zero:
*/
*plen = 0;
sa = USB_FS_ISOC_UFRAME_MAX - 1;
}
if (*plen) {
/*
* only call "usbd_get_page()" when we have a
* non-zero length
*/
usbd_get_page(xfer->frbuffers, buf_offset, &buf_res);
td->sitd_bp[0] = htohc32(sc, buf_res.physaddr);
buf_offset += *plen;
/*
* NOTE: We need to subtract one from the offset so
* that we are on a valid page!
*/
usbd_get_page(xfer->frbuffers, buf_offset - 1,
&buf_res);
temp = buf_res.physaddr & ~0xFFF;
} else {
td->sitd_bp[0] = 0;
temp = 0;
}
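/*
 * Compute the split transaction schedule masks. For OUT transfers
 * one start split is budgeted per 188 bytes of payload and no
 * complete splits are used. For IN transfers a single start split
 * is issued and the complete splits begin two microframes later.
 */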
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_OUT) {
tlen = *plen;
if (tlen <= 188) {
temp |= 1; /* T-count = 1, TP = ALL */
tlen = 1;
} else {
tlen += 187;
tlen /= 188;
temp |= tlen; /* T-count = [1..6] */
temp |= 8; /* TP = Begin */
}
tlen += sa;
if (tlen >= 8) {
sb = 0;
} else {
sb = (1 << tlen);
}
sa = (1 << sa);
sa = (sb - sa) & 0x3F;
sb = 0;
} else {
sb = (-(4 << sa)) & 0xFE;
sa = (1 << sa) & 0x3F;
}
sitd_mask = (EHCI_SITD_SET_SMASK(sa) |
EHCI_SITD_SET_CMASK(sb));
td->sitd_bp[1] = htohc32(sc, temp);
td->sitd_mask = htohc32(sc, sitd_mask);
if (nframes == 0) {
td->sitd_status = htohc32(sc,
EHCI_SITD_IOC |
EHCI_SITD_ACTIVE |
EHCI_SITD_SET_LEN(*plen));
} else {
td->sitd_status = htohc32(sc,
EHCI_SITD_ACTIVE |
EHCI_SITD_SET_LEN(*plen));
}
usb_pc_cpu_flush(td->page_cache);
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("FS-TD %d\n", nframes);
ehci_dump_sitd(sc, td);
}
#endif
/* insert TD into schedule */
EHCI_APPEND_FS_TD(td, *pp_last);
pp_last++;
plen++;
td_last = td;
td = td->obj_next;
}
xfer->td_transfer_last = td_last;
/* update isoc_next */
xfer->endpoint->isoc_next = (pp_last - &sc->sc_isoc_fs_p_last[0]) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* We don't allow cancelling of the SPLIT transaction USB FULL
* speed transfer, because it disturbs the bandwidth
* computation algorithm.
*/
xfer->flags_int.can_cancel_immed = 0;
}
static void
ehci_device_isoc_fs_start(struct usb_xfer *xfer)
{
/*
* We don't allow cancelling of the SPLIT transaction USB FULL
* speed transfer, because it disturbs the bandwidth
* computation algorithm.
*/
xfer->flags_int.can_cancel_immed = 0;
/* set a default timeout */
if (xfer->timeout == 0)
xfer->timeout = 500; /* ms */
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_isoc_fs_methods =
{
.open = ehci_device_isoc_fs_open,
.close = ehci_device_isoc_fs_close,
.enter = ehci_device_isoc_fs_enter,
.start = ehci_device_isoc_fs_start,
};
/*------------------------------------------------------------------------*
* ehci high speed isochronous support
*------------------------------------------------------------------------*/
static void
ehci_device_isoc_hs_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_itd_t *td;
uint32_t temp;
uint8_t ds;
usb_hs_bandwidth_alloc(xfer);
/* initialize all TD's */
for (ds = 0; ds != 2; ds++) {
for (td = xfer->td_start[ds]; td; td = td->obj_next) {
/* set TD inactive */
td->itd_status[0] = 0;
td->itd_status[1] = 0;
td->itd_status[2] = 0;
td->itd_status[3] = 0;
td->itd_status[4] = 0;
td->itd_status[5] = 0;
td->itd_status[6] = 0;
td->itd_status[7] = 0;
/* set endpoint and address */
td->itd_bp[0] = htohc32(sc,
EHCI_ITD_SET_ADDR(xfer->address) |
EHCI_ITD_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)));
temp =
EHCI_ITD_SET_MPL(xfer->max_packet_size & 0x7FF);
/* set direction */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp |= EHCI_ITD_SET_DIR_IN;
}
/* set maximum packet size */
td->itd_bp[1] = htohc32(sc, temp);
/* set transfer multiplier */
td->itd_bp[2] = htohc32(sc, xfer->max_packet_count & 3);
usb_pc_cpu_flush(td->page_cache);
}
}
}
static void
ehci_device_isoc_hs_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
/* bandwidth must be freed after device done */
usb_hs_bandwidth_free(xfer);
}
static void
ehci_device_isoc_hs_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_itd_t *td;
ehci_itd_t *td_last = NULL;
ehci_itd_t **pp_last;
bus_size_t page_addr;
uint32_t *plen;
uint32_t status;
uint32_t buf_offset;
uint32_t nframes;
uint32_t itd_offset[8 + 1];
uint8_t x;
uint8_t td_no;
uint8_t page_no;
uint8_t shift = usbd_xfer_get_fps_shift(xfer);
#ifdef USB_DEBUG
uint8_t once = 1;
#endif
DPRINTFN(6, "xfer=%p next=%d nframes=%d shift=%d\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes, (int)shift);
/* get the current frame index */
nframes = EOREAD4(sc, EHCI_FRINDEX) / 8;
/*
* check if the frame index is within the window where the frames
* will be inserted
*/
buf_offset = (nframes - xfer->endpoint->isoc_next) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
if ((xfer->endpoint->is_synced == 0) ||
(buf_offset < (((xfer->nframes << shift) + 7) / 8))) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = (xfer->endpoint->isoc_next - nframes) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
usb_isoc_time_expand(&sc->sc_bus, nframes) + buf_offset +
(((xfer->nframes << shift) + 7) / 8);
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
td_no = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
pp_last = &sc->sc_isoc_hs_p_last[xfer->endpoint->isoc_next];
/* store starting position */
xfer->qh_pos = xfer->endpoint->isoc_next;
while (nframes) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_hs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_hs_p_last[0];
}
/* range check */
if (*plen > xfer->max_frame_size) {
#ifdef USB_DEBUG
if (once) {
once = 0;
printf("%s: frame length(%d) exceeds %d bytes "
"(frame truncated)\n",
__FUNCTION__, *plen, xfer->max_frame_size);
}
#endif
*plen = xfer->max_frame_size;
}
if (xfer->endpoint->usb_smask & (1 << td_no)) {
status = (EHCI_ITD_SET_LEN(*plen) |
EHCI_ITD_ACTIVE |
EHCI_ITD_SET_PG(0));
td->itd_status[td_no] = htohc32(sc, status);
itd_offset[td_no] = buf_offset;
buf_offset += *plen;
plen++;
nframes--;
} else {
td->itd_status[td_no] = 0; /* not active */
itd_offset[td_no] = buf_offset;
}
td_no++;
if ((td_no == 8) || (nframes == 0)) {
/* mark any remaining transfer slots as not active */
for (x = td_no; x != 8; x++) {
td->itd_status[x] = 0; /* not active */
}
/* check if there is any data to be transferred */
if (itd_offset[0] != buf_offset) {
page_no = 0;
itd_offset[td_no] = buf_offset;
/* get first page offset */
usbd_get_page(xfer->frbuffers, itd_offset[0], &buf_res);
/* get page address */
page_addr = buf_res.physaddr & ~0xFFF;
/* update page address */
td->itd_bp[0] &= htohc32(sc, 0xFFF);
td->itd_bp[0] |= htohc32(sc, page_addr);
for (x = 0; x != td_no; x++) {
/* set page number and page offset */
status = (EHCI_ITD_SET_PG(page_no) |
(buf_res.physaddr & 0xFFF));
td->itd_status[x] |= htohc32(sc, status);
/* get next page offset */
if (itd_offset[x + 1] == buf_offset) {
/*
* We subtract one so that
* we don't go off the last
* page!
*/
usbd_get_page(xfer->frbuffers, buf_offset - 1, &buf_res);
} else {
usbd_get_page(xfer->frbuffers, itd_offset[x + 1], &buf_res);
}
/* check if we need a new page */
if ((buf_res.physaddr ^ page_addr) & ~0xFFF) {
/* new page needed */
page_addr = buf_res.physaddr & ~0xFFF;
if (page_no == 6) {
panic("%s: too many pages\n", __FUNCTION__);
}
page_no++;
/* update page address */
td->itd_bp[page_no] &= htohc32(sc, 0xFFF);
td->itd_bp[page_no] |= htohc32(sc, page_addr);
}
}
}
/* set IOC bit if we are complete */
if (nframes == 0) {
td->itd_status[td_no - 1] |= htohc32(sc, EHCI_ITD_IOC);
}
usb_pc_cpu_flush(td->page_cache);
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("HS-TD %d\n", nframes);
ehci_dump_itd(sc, td);
}
#endif
/* insert TD into schedule */
EHCI_APPEND_HS_TD(td, *pp_last);
pp_last++;
td_no = 0;
td_last = td;
td = td->obj_next;
}
}
xfer->td_transfer_last = td_last;
/* update isoc_next */
xfer->endpoint->isoc_next = (pp_last - &sc->sc_isoc_hs_p_last[0]) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
}
static void
ehci_device_isoc_hs_start(struct usb_xfer *xfer)
{
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_isoc_hs_methods =
{
.open = ehci_device_isoc_hs_open,
.close = ehci_device_isoc_hs_close,
.enter = ehci_device_isoc_hs_enter,
.start = ehci_device_isoc_hs_start,
};
/*------------------------------------------------------------------------*
* ehci root control support
*------------------------------------------------------------------------*
* Simulate a hardware hub by handling all the necessary requests.
*------------------------------------------------------------------------*/
static const
struct usb_device_descriptor ehci_devd =
{
sizeof(struct usb_device_descriptor),
UDESC_DEVICE, /* type */
{0x00, 0x02}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_HSHUBSTT, /* protocol */
64, /* max packet */
{0}, {0}, {0x00, 0x01}, /* device id */
1, 2, 0, /* string indexes */
1 /* # of configurations */
};
static const
struct usb_device_qualifier ehci_odevd =
{
sizeof(struct usb_device_qualifier),
UDESC_DEVICE_QUALIFIER, /* type */
{0x00, 0x02}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_FSHUB, /* protocol */
0, /* max packet */
0, /* # of configurations */
0
};
static const struct ehci_config_desc ehci_confd = {
.confd = {
.bLength = sizeof(struct usb_config_descriptor),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(ehci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0 /* max power */
},
.ifcd = {
.bLength = sizeof(struct usb_interface_descriptor),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(struct usb_endpoint_descriptor),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | EHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 8, /* max packet (63 ports) */
.bInterval = 255,
},
};
static const
struct usb_hub_descriptor ehci_hubd =
{
.bDescLength = 0, /* dynamic length */
.bDescriptorType = UDESC_HUB,
};
uint16_t
ehci_get_port_speed_portsc(struct ehci_softc *sc, uint16_t index)
{
uint32_t v;
v = EOREAD4(sc, EHCI_PORTSC(index));
v = (v >> EHCI_PORTSC_PSPD_SHIFT) & EHCI_PORTSC_PSPD_MASK;
if (v == EHCI_PORT_SPEED_HIGH)
return (UPS_HIGH_SPEED);
if (v == EHCI_PORT_SPEED_LOW)
return (UPS_LOW_SPEED);
return (0);
}
uint16_t
ehci_get_port_speed_hostc(struct ehci_softc *sc, uint16_t index)
{
uint32_t v;
v = EOREAD4(sc, EHCI_HOSTC(index));
v = (v >> EHCI_HOSTC_PSPD_SHIFT) & EHCI_HOSTC_PSPD_MASK;
if (v == EHCI_PORT_SPEED_HIGH)
return (UPS_HIGH_SPEED);
if (v == EHCI_PORT_SPEED_LOW)
return (UPS_LOW_SPEED);
return (0);
}
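/*
 * Hand the port over to the companion controller by setting the Port
 * Owner bit. This is used for low and full speed devices which the
 * EHCI controller cannot serve itself.
 */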
static void
ehci_disown(ehci_softc_t *sc, uint16_t index, uint8_t lowspeed)
{
uint32_t port;
uint32_t v;
DPRINTF("index=%d lowspeed=%d\n", index, lowspeed);
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
EOWRITE4(sc, port, v | EHCI_PS_PO);
}
static usb_error_t
ehci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
const char *str_ptr;
const void *ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t i;
uint16_t value;
uint16_t index;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_devd);
ptr = (const void *)&ehci_devd;
break;
/*
* We can't really operate at another speed,
* but the specification says we need this
* descriptor:
*/
case UDESC_DEVICE_QUALIFIER:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_odevd);
ptr = (const void *)&ehci_odevd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_confd);
ptr = (const void *)&ehci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "EHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= EHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_addr = value;
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if ((value != 0) && (value != 1)) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n");
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_ENABLE:
EOWRITE4(sc, port, v & ~EHCI_PS_PE);
break;
case UHF_PORT_SUSPEND:
if ((v & EHCI_PS_SUSP) && (!(v & EHCI_PS_FPR))) {
/*
* Waking up a suspended high speed device requires the
* force port resume sequence started below.
*/
EOWRITE4(sc, port, v | EHCI_PS_FPR);
}
/* wait 20ms for resume sequence to complete */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50);
EOWRITE4(sc, port, v & ~(EHCI_PS_SUSP |
EHCI_PS_FPR | (3 << 10) /* High Speed */ ));
/* 4ms settle time */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250);
break;
case UHF_PORT_POWER:
EOWRITE4(sc, port, v & ~EHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "clear port test "
"%d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "clear port ind "
"%d\n", index);
EOWRITE4(sc, port, v & ~EHCI_PS_PIC);
break;
case UHF_C_PORT_CONNECTION:
EOWRITE4(sc, port, v | EHCI_PS_CSC);
break;
case UHF_C_PORT_ENABLE:
EOWRITE4(sc, port, v | EHCI_PS_PEC);
break;
case UHF_C_PORT_SUSPEND:
EOWRITE4(sc, port, v | EHCI_PS_SUSP);
break;
case UHF_C_PORT_OVER_CURRENT:
EOWRITE4(sc, port, v | EHCI_PS_OCC);
break;
case UHF_C_PORT_RESET:
sc->sc_isreset = 0;
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = EREAD4(sc, EHCI_HCSPARAMS);
sc->sc_hub_desc.hubd = ehci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
if (EHCI_HCS_PPC(v))
i = UHD_PWR_INDIVIDUAL;
else
i = UHD_PWR_NO_SWITCH;
if (EHCI_HCS_P_INDICATOR(v))
i |= UHD_PORT_IND;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i);
/* XXX can't find out? */
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 200;
/* XXX don't know if ports are removable or not */
sc->sc_hub_desc.hubd.bDescLength =
8 + ((sc->sc_noport + 7) / 8);
len = sc->sc_hub_desc.hubd.bDescLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "get port status i=%d\n",
index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = EOREAD4(sc, EHCI_PORTSC(index));
DPRINTFN(9, "port status=0x%04x\n", v);
if (sc->sc_flags & EHCI_SCFLG_TT) {
if (sc->sc_vendor_get_port_speed != NULL) {
i = sc->sc_vendor_get_port_speed(sc, index);
} else {
device_printf(sc->sc_bus.bdev,
"EHCI_SCFLG_TT quirk is set but "
"sc_vendor_get_hub_speed() is NULL\n");
i = UPS_HIGH_SPEED;
}
} else {
i = UPS_HIGH_SPEED;
}
if (v & EHCI_PS_CS)
i |= UPS_CURRENT_CONNECT_STATUS;
if (v & EHCI_PS_PE)
i |= UPS_PORT_ENABLED;
if ((v & EHCI_PS_SUSP) && !(v & EHCI_PS_FPR))
i |= UPS_SUSPEND;
if (v & EHCI_PS_OCA)
i |= UPS_OVERCURRENT_INDICATOR;
if (v & EHCI_PS_PR)
i |= UPS_RESET;
if (v & EHCI_PS_PP)
i |= UPS_PORT_POWER;
USETW(sc->sc_hub_desc.ps.wPortStatus, i);
i = 0;
if (v & EHCI_PS_CSC)
i |= UPS_C_CONNECT_STATUS;
if (v & EHCI_PS_PEC)
i |= UPS_C_PORT_ENABLED;
if (v & EHCI_PS_OCC)
i |= UPS_C_OVERCURRENT_INDICATOR;
if (v & EHCI_PS_FPR)
i |= UPS_C_SUSPEND;
if (sc->sc_isreset)
i |= UPS_C_PORT_RESET;
USETW(sc->sc_hub_desc.ps.wPortChange, i);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_ENABLE:
EOWRITE4(sc, port, v | EHCI_PS_PE);
break;
case UHF_PORT_SUSPEND:
EOWRITE4(sc, port, v | EHCI_PS_SUSP);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
#ifdef USB_DEBUG
if (ehcinohighspeed) {
/*
* Connect USB device to companion
* controller.
*/
ehci_disown(sc, index, 1);
break;
}
#endif
if (EHCI_PS_IS_LOWSPEED(v) &&
(sc->sc_flags & EHCI_SCFLG_TT) == 0) {
/* Low speed device, give up ownership. */
ehci_disown(sc, index, 1);
break;
}
/* Start reset sequence. */
v &= ~(EHCI_PS_PE | EHCI_PS_PR);
EOWRITE4(sc, port, v | EHCI_PS_PR);
/* Wait for reset to complete. */
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(usb_port_root_reset_delay));
/* Terminate reset sequence. */
if (!(sc->sc_flags & EHCI_SCFLG_NORESTERM))
EOWRITE4(sc, port, v);
/* Wait for HC to complete reset. */
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(EHCI_PORT_RESET_COMPLETE));
v = EOREAD4(sc, port);
DPRINTF("ehci after reset, status=0x%08x\n", v);
if (v & EHCI_PS_PR) {
device_printf(sc->sc_bus.bdev,
"port reset timeout\n");
err = USB_ERR_TIMEOUT;
goto done;
}
if (!(v & EHCI_PS_PE) &&
(sc->sc_flags & EHCI_SCFLG_TT) == 0) {
/* Not a high speed device, give up ownership. */
ehci_disown(sc, index, 0);
break;
}
sc->sc_isreset = 1;
DPRINTF("ehci port %d reset, status = 0x%08x\n",
index, v);
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
EOWRITE4(sc, port, v | EHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "set port test %d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "set port ind %d\n", index);
EOWRITE4(sc, port, v | EHCI_PS_PIC);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER):
case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER):
case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER):
case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER):
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
ehci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
ehci_softc_t *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t nqtd;
uint32_t nqh;
uint32_t nsitd;
uint32_t nitd;
uint32_t n;
sc = EHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
nqtd = 0;
nqh = 0;
nsitd = 0;
nitd = 0;
/*
* compute maximum number of some structures
*/
if (parm->methods == &ehci_device_ctrl_methods) {
/*
* The proof for the "nqtd" formula is illustrated like
* this:
*
* +------------------------------------+
* | |
* | |remainder -> |
* | +-----+---+ |
* | | xxx | x | frm 0 |
* | +-----+---++ |
* | | xxx | xx | frm 1 |
* | +-----+----+ |
* | ... |
* +------------------------------------+
*
* "xxx" means a completely full USB transfer descriptor
*
* "x" and "xx" means a short USB packet
*
* For the remainder of an USB transfer modulo
* "max_data_length" we need two USB transfer descriptors.
* One to transfer the remaining data and one to finalise
* with a zero length packet in case the "force_short_xfer"
* flag is set. We only need two USB transfer descriptors in
* the case where the transfer length of the first one is a
* factor of "max_frame_size". The rest of the needed USB
* transfer descriptors is given by the buffer size divided
* by the maximum data payload.
*/
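/*
 * Illustrative example with hypothetical numbers: a control transfer
 * with two frames and a data buffer four times "max_hc_frame_size"
 * long yields (2 * 2) + 1 + 4 = 9 qTDs with the formula below.
 */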
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_bulk_methods) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_intr_methods) {
if (parm->speed == USB_SPEED_HIGH) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 3;
} else if (parm->speed == USB_SPEED_FULL) {
parm->hc_max_packet_size = USB_FS_BYTES_PER_HS_UFRAME;
parm->hc_max_packet_count = 1;
} else {
parm->hc_max_packet_size = USB_FS_BYTES_PER_HS_UFRAME / 8;
parm->hc_max_packet_count = 1;
}
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_isoc_fs_methods) {
parm->hc_max_packet_size = 0x3FF;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = 0x3FF;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nsitd = xfer->nframes;
} else if (parm->methods == &ehci_device_isoc_hs_methods) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 3;
parm->hc_max_frame_size = 0xC00;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
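/*
 * One iTD describes one frame (8 microframes). When the endpoint is
 * polled less often than every microframe, each iTD carries fewer
 * transfer frames, hence the additional shift by the value returned
 * from "usbd_xfer_get_fps_shift()".
 */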
nitd = ((xfer->nframes + 7) / 8) <<
usbd_xfer_get_fps_shift(xfer);
} else {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = 0x400;
usbd_transfer_setup_sub(parm);
}
alloc_dma_set:
if (parm->err) {
return;
}
/*
* Allocate queue heads and transfer descriptors
*/
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_itd_t),
EHCI_ITD_ALIGN, nitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nitd; n++) {
ehci_itd_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->itd_self = htohc32(sc, page_info.physaddr | EHCI_LINK_ITD);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_sitd_t),
EHCI_SITD_ALIGN, nsitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nsitd; n++) {
ehci_sitd_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->sitd_self = htohc32(sc, page_info.physaddr | EHCI_LINK_SITD);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_qtd_t),
EHCI_QTD_ALIGN, nqtd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqtd; n++) {
ehci_qtd_t *qtd;
usbd_get_page(pc + n, 0, &page_info);
qtd = page_info.buffer;
/* init TD */
qtd->qtd_self = htohc32(sc, page_info.physaddr);
qtd->obj_next = last_obj;
qtd->page_cache = pc + n;
last_obj = qtd;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_qh_t),
EHCI_QH_ALIGN, nqh)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqh; n++) {
ehci_qh_t *qh;
usbd_get_page(pc + n, 0, &page_info);
qh = page_info.buffer;
/* init QH */
qh->qh_self = htohc32(sc, page_info.physaddr | EHCI_LINK_QH);
qh->obj_next = last_obj;
qh->page_cache = pc + n;
last_obj = qh;
usb_pc_cpu_flush(pc + n);
}
}
xfer->qh_start[xfer->flags_int.curr_dma_set] = last_obj;
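/*
 * Each of the two DMA sets gets its own TD and QH chains; loop back
 * once to allocate the second set.
 */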
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static void
ehci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
ehci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d (%d)\n",
ep, udev->address,
edesc->bEndpointAddress, udev->flags.usb_mode,
sc->sc_addr);
if (udev->device_index != sc->sc_addr) {
if ((udev->speed != USB_SPEED_HIGH) &&
((udev->hs_hub_addr == 0) ||
(udev->hs_port_no == 0) ||
(udev->parent_hs_hub == NULL) ||
(udev->parent_hs_hub->hub == NULL))) {
/* We need a transaction translator */
goto done;
}
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
ep->methods = &ehci_device_ctrl_methods;
break;
case UE_INTERRUPT:
ep->methods = &ehci_device_intr_methods;
break;
case UE_ISOCHRONOUS:
if (udev->speed == USB_SPEED_HIGH) {
ep->methods = &ehci_device_isoc_hs_methods;
} else if (udev->speed == USB_SPEED_FULL) {
ep->methods = &ehci_device_isoc_fs_methods;
}
break;
case UE_BULK:
ep->methods = &ehci_device_bulk_methods;
break;
default:
/* do nothing */
break;
}
}
done:
return;
}
static void
ehci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until the hardware has finished any possible use of
* the transfer descriptor(s) and QH
*/
*pus = (1125); /* microseconds */
}
static void
ehci_device_resume(struct usb_device *udev)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
EHCI_APPEND_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_APPEND_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ehci_device_suspend(struct usb_device *udev)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
}
static void
ehci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct ehci_softc *sc = EHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
case USB_HW_POWER_SHUTDOWN:
ehci_suspend(sc);
break;
case USB_HW_POWER_RESUME:
ehci_resume(sc);
break;
default:
break;
}
}
static void
ehci_set_hw_power(struct usb_bus *bus)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
uint32_t temp;
uint32_t flags;
DPRINTF("\n");
USB_BUS_LOCK(bus);
flags = bus->hw_power_state;
temp = EOREAD4(sc, EHCI_USBCMD);
temp &= ~(EHCI_CMD_ASE | EHCI_CMD_PSE);
if (flags & (USB_HW_POWER_CONTROL |
USB_HW_POWER_BULK)) {
DPRINTF("Async is active\n");
temp |= EHCI_CMD_ASE;
}
if (flags & (USB_HW_POWER_INTERRUPT |
USB_HW_POWER_ISOC)) {
DPRINTF("Periodic is active\n");
temp |= EHCI_CMD_PSE;
}
EOWRITE4(sc, EHCI_USBCMD, temp);
USB_BUS_UNLOCK(bus);
return;
}
static void
ehci_start_dma_delay_second(struct usb_xfer *xfer)
{
struct ehci_softc *sc = EHCI_BUS2SC(xfer->xroot->bus);
DPRINTF("\n");
/* trigger doorbell */
ehci_doorbell_async(sc);
/* give the doorbell 4ms */
usbd_transfer_timeout_ms(xfer,
(void (*)(void *))&usb_dma_delay_done_cb, 4);
}
/*
* Ring the doorbell twice before freeing any DMA descriptors. Some host
* controllers apparently cache the QH descriptors and need a message
* that the cache needs to be discarded.
*/
static void
ehci_start_dma_delay(struct usb_xfer *xfer)
{
struct ehci_softc *sc = EHCI_BUS2SC(xfer->xroot->bus);
DPRINTF("\n");
/* trigger doorbell */
ehci_doorbell_async(sc);
/* give the doorbell 4ms */
usbd_transfer_timeout_ms(xfer,
(void (*)(void *))&ehci_start_dma_delay_second, 4);
}
static const struct usb_bus_methods ehci_bus_methods =
{
.endpoint_init = ehci_ep_init,
.xfer_setup = ehci_xfer_setup,
.xfer_unsetup = ehci_xfer_unsetup,
.get_dma_delay = ehci_get_dma_delay,
.device_resume = ehci_device_resume,
.device_suspend = ehci_device_suspend,
.set_hw_power = ehci_set_hw_power,
.set_hw_power_sleep = ehci_set_hw_power_sleep,
.roothub_exec = ehci_roothub_exec,
.xfer_poll = ehci_do_poll,
.start_dma_delay = ehci_start_dma_delay,
};
Index: head/sys/dev/usb/controller/ohci.c
===================================================================
--- head/sys/dev/usb/controller/ohci.c (revision 327172)
+++ head/sys/dev/usb/controller/ohci.c (revision 327173)
@@ -1,2736 +1,2734 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
* Copyright (c) 1998 The NetBSD Foundation, Inc. All rights reserved.
* Copyright (c) 1998 Lennart Augustsson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB Open Host Controller driver.
*
* OHCI spec: http://www.compaq.com/productinfo/development/openhci.html
* USB spec: http://www.usb.org/developers/docs/usbspec.zip
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR ohcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/ohci.h>
#include <dev/usb/controller/ohcireg.h>
#define OHCI_BUS2SC(bus) \
((ohci_softc_t *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((ohci_softc_t *)0)->sc_bus))))
#ifdef USB_DEBUG
static int ohcidebug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, ohci, CTLFLAG_RW, 0, "USB ohci");
SYSCTL_INT(_hw_usb_ohci, OID_AUTO, debug, CTLFLAG_RWTUN,
&ohcidebug, 0, "ohci debug level");
static void ohci_dumpregs(ohci_softc_t *);
static void ohci_dump_tds(ohci_td_t *);
static uint8_t ohci_dump_td(ohci_td_t *);
static void ohci_dump_ed(ohci_ed_t *);
static uint8_t ohci_dump_itd(ohci_itd_t *);
static void ohci_dump_itds(ohci_itd_t *);
#endif
#define OBARR(sc) bus_space_barrier((sc)->sc_io_tag, (sc)->sc_io_hdl, 0, (sc)->sc_io_size, \
BUS_SPACE_BARRIER_READ|BUS_SPACE_BARRIER_WRITE)
#define OWRITE1(sc, r, x) \
do { OBARR(sc); bus_space_write_1((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OWRITE2(sc, r, x) \
do { OBARR(sc); bus_space_write_2((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OWRITE4(sc, r, x) \
do { OBARR(sc); bus_space_write_4((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OREAD1(sc, r) (OBARR(sc), bus_space_read_1((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OREAD2(sc, r) (OBARR(sc), bus_space_read_2((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OREAD4(sc, r) (OBARR(sc), bus_space_read_4((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OHCI_INTR_ENDPT 1
static const struct usb_bus_methods ohci_bus_methods;
static const struct usb_pipe_methods ohci_device_bulk_methods;
static const struct usb_pipe_methods ohci_device_ctrl_methods;
static const struct usb_pipe_methods ohci_device_intr_methods;
static const struct usb_pipe_methods ohci_device_isoc_methods;
static void ohci_do_poll(struct usb_bus *bus);
static void ohci_device_done(struct usb_xfer *xfer, usb_error_t error);
static void ohci_timeout(void *arg);
static uint8_t ohci_check_transfer(struct usb_xfer *xfer);
static void ohci_root_intr(ohci_softc_t *sc);
struct ohci_std_temp {
struct usb_page_cache *pc;
ohci_td_t *td;
ohci_td_t *td_next;
uint32_t average;
uint32_t td_flags;
uint32_t len;
uint16_t max_frame_size;
uint8_t shortpkt;
uint8_t setup_alt_next;
uint8_t last_frame;
};
static struct ohci_hcca *
ohci_get_hcca(ohci_softc_t *sc)
{
usb_pc_cpu_invalidate(&sc->sc_hw.hcca_pc);
return (sc->sc_hcca_p);
}
void
ohci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
uint32_t i;
cb(bus, &sc->sc_hw.hcca_pc, &sc->sc_hw.hcca_pg,
sizeof(ohci_hcca_t), OHCI_HCCA_ALIGN);
cb(bus, &sc->sc_hw.ctrl_start_pc, &sc->sc_hw.ctrl_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
cb(bus, &sc->sc_hw.bulk_start_pc, &sc->sc_hw.bulk_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
cb(bus, &sc->sc_hw.isoc_start_pc, &sc->sc_hw.isoc_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
for (i = 0; i != OHCI_NO_EDS; i++) {
cb(bus, sc->sc_hw.intr_start_pc + i, sc->sc_hw.intr_start_pg + i,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
}
}
static usb_error_t
ohci_controller_init(ohci_softc_t *sc, int do_suspend)
{
struct usb_page_search buf_res;
uint32_t i;
uint32_t ctl;
uint32_t ival;
uint32_t hcr;
uint32_t fm;
uint32_t per;
uint32_t desca;
/* Determine in what context we are running. */
ctl = OREAD4(sc, OHCI_CONTROL);
if (ctl & OHCI_IR) {
/* SMM active, request change */
DPRINTF("SMM active, request owner change\n");
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_OCR);
for (i = 0; (i < 100) && (ctl & OHCI_IR); i++) {
usb_pause_mtx(NULL, hz / 1000);
ctl = OREAD4(sc, OHCI_CONTROL);
}
if (ctl & OHCI_IR) {
device_printf(sc->sc_bus.bdev,
"SMM does not respond, resetting\n");
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
goto reset;
}
} else {
DPRINTF("cold started\n");
reset:
/* controller was cold started */
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(USB_BUS_RESET_DELAY));
}
/*
* This reset should not be necessary according to the OHCI spec, but
* without it some controllers do not start.
*/
DPRINTF("%s: resetting\n", device_get_nameunit(sc->sc_bus.bdev));
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(USB_BUS_RESET_DELAY));
/* we now own the host controller and the bus has been reset */
ival = OHCI_GET_IVAL(OREAD4(sc, OHCI_FM_INTERVAL));
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_HCR); /* Reset HC */
/* nominal time for a reset is 10 us */
for (i = 0; i < 10; i++) {
DELAY(10);
hcr = OREAD4(sc, OHCI_COMMAND_STATUS) & OHCI_HCR;
if (!hcr) {
break;
}
}
if (hcr) {
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (USB_ERR_IOERROR);
}
#ifdef USB_DEBUG
if (ohcidebug > 15) {
ohci_dumpregs(sc);
}
#endif
if (do_suspend) {
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_SUSPEND);
return (USB_ERR_NORMAL_COMPLETION);
}
/* The controller is now in the SUSPEND state; we have 2ms to finish. */
/* set up HC registers */
usbd_get_page(&sc->sc_hw.hcca_pc, 0, &buf_res);
OWRITE4(sc, OHCI_HCCA, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.ctrl_start_pc, 0, &buf_res);
OWRITE4(sc, OHCI_CONTROL_HEAD_ED, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.bulk_start_pc, 0, &buf_res);
OWRITE4(sc, OHCI_BULK_HEAD_ED, buf_res.physaddr);
/* disable all interrupts and then switch on all desired interrupts */
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
OWRITE4(sc, OHCI_INTERRUPT_ENABLE, sc->sc_eintrs | OHCI_MIE);
/* switch on desired functional features */
ctl = OREAD4(sc, OHCI_CONTROL);
ctl &= ~(OHCI_CBSR_MASK | OHCI_LES | OHCI_HCFS_MASK | OHCI_IR);
ctl |= OHCI_PLE | OHCI_IE | OHCI_CLE | OHCI_BLE |
OHCI_RATIO_1_4 | OHCI_HCFS_OPERATIONAL;
/* And finally start it! */
OWRITE4(sc, OHCI_CONTROL, ctl);
/*
* The controller is now OPERATIONAL. Set some final
* registers that should be set earlier, but that the
* controller ignores when in the SUSPEND state.
*/
fm = (OREAD4(sc, OHCI_FM_INTERVAL) & OHCI_FIT) ^ OHCI_FIT;
fm |= OHCI_FSMPS(ival) | ival;
OWRITE4(sc, OHCI_FM_INTERVAL, fm);
per = OHCI_PERIODIC(ival); /* 90% periodic */
OWRITE4(sc, OHCI_PERIODIC_START, per);
/* Fiddle the No OverCurrent Protection bit to avoid a chip bug. */
desca = OREAD4(sc, OHCI_RH_DESCRIPTOR_A);
OWRITE4(sc, OHCI_RH_DESCRIPTOR_A, desca | OHCI_NOCP);
OWRITE4(sc, OHCI_RH_STATUS, OHCI_LPSC); /* Enable port power */
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(OHCI_ENABLE_POWER_DELAY));
OWRITE4(sc, OHCI_RH_DESCRIPTOR_A, desca);
/*
* The AMD756 requires a delay before re-reading the register,
* otherwise it will occasionally report 0 ports.
*/
sc->sc_noport = 0;
for (i = 0; (i < 10) && (sc->sc_noport == 0); i++) {
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(OHCI_READ_DESC_DELAY));
sc->sc_noport = OHCI_GET_NDP(OREAD4(sc, OHCI_RH_DESCRIPTOR_A));
}
#ifdef USB_DEBUG
if (ohcidebug > 5) {
ohci_dumpregs(sc);
}
#endif
return (USB_ERR_NORMAL_COMPLETION);
}
static struct ohci_ed *
ohci_init_ed(struct usb_page_cache *pc)
{
struct usb_page_search buf_res;
struct ohci_ed *ed;
usbd_get_page(pc, 0, &buf_res);
ed = buf_res.buffer;
ed->ed_self = htole32(buf_res.physaddr);
ed->ed_flags = htole32(OHCI_ED_SKIP);
ed->page_cache = pc;
return (ed);
}
usb_error_t
ohci_init(ohci_softc_t *sc)
{
struct usb_page_search buf_res;
uint16_t i;
uint16_t bit;
uint16_t x;
uint16_t y;
DPRINTF("start\n");
sc->sc_eintrs = OHCI_NORMAL_INTRS;
/*
* Setup all ED's
*/
sc->sc_ctrl_p_last =
ohci_init_ed(&sc->sc_hw.ctrl_start_pc);
sc->sc_bulk_p_last =
ohci_init_ed(&sc->sc_hw.bulk_start_pc);
sc->sc_isoc_p_last =
ohci_init_ed(&sc->sc_hw.isoc_start_pc);
for (i = 0; i != OHCI_NO_EDS; i++) {
sc->sc_intr_p_last[i] =
ohci_init_ed(sc->sc_hw.intr_start_pc + i);
}
/*
* the QHs are arranged to give poll intervals that are
* powers of 2 times 1ms
*/
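/*
 * Each interrupt ED at a given level links to an ED with half the
 * poll interval, so the EDs form a binary tree that funnels into the
 * 1ms ED at index 0 below.
 */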
bit = OHCI_NO_EDS / 2;
while (bit) {
x = bit;
while (x & bit) {
ohci_ed_t *ed_x;
ohci_ed_t *ed_y;
y = (x ^ bit) | (bit / 2);
/*
* the next QH has half the poll interval
*/
ed_x = sc->sc_intr_p_last[x];
ed_y = sc->sc_intr_p_last[y];
ed_x->next = NULL;
ed_x->ed_next = ed_y->ed_self;
x++;
}
bit >>= 1;
}
if (1) {
ohci_ed_t *ed_int;
ohci_ed_t *ed_isc;
ed_int = sc->sc_intr_p_last[0];
ed_isc = sc->sc_isoc_p_last;
/* the last (1ms) QH */
ed_int->next = ed_isc;
ed_int->ed_next = ed_isc->ed_self;
}
usbd_get_page(&sc->sc_hw.hcca_pc, 0, &buf_res);
sc->sc_hcca_p = buf_res.buffer;
/*
* Fill HCCA interrupt table. The bit reversal is to get
* the tree set up properly to spread the interrupts.
*/
for (i = 0; i != OHCI_NO_INTRS; i++) {
sc->sc_hcca_p->hcca_interrupt_table[i] =
sc->sc_intr_p_last[i | (OHCI_NO_EDS / 2)]->ed_self;
}
/* flush all cache into memory */
usb_bus_mem_flush_all(&sc->sc_bus, &ohci_iterate_hw_softc);
/* set up the bus struct */
sc->sc_bus.methods = &ohci_bus_methods;
usb_callout_init_mtx(&sc->sc_tmo_rhsc, &sc->sc_bus.bus_mtx, 0);
#ifdef USB_DEBUG
if (ohcidebug > 15) {
for (i = 0; i != OHCI_NO_EDS; i++) {
printf("ed#%d ", i);
ohci_dump_ed(sc->sc_intr_p_last[i]);
}
printf("iso ");
ohci_dump_ed(sc->sc_isoc_p_last);
}
#endif
sc->sc_bus.usbrev = USB_REV_1_0;
if (ohci_controller_init(sc, 0) != 0)
return (USB_ERR_INVAL);
/* catch any lost interrupts */
ohci_do_poll(&sc->sc_bus);
return (USB_ERR_NORMAL_COMPLETION);
}
/*
* shut down the controller when the system is going down
*/
void
ohci_detach(struct ohci_softc *sc)
{
USB_BUS_LOCK(&sc->sc_bus);
usb_callout_stop(&sc->sc_tmo_rhsc);
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
USB_BUS_UNLOCK(&sc->sc_bus);
/* XXX let stray task complete */
usb_pause_mtx(NULL, hz / 20);
usb_callout_drain(&sc->sc_tmo_rhsc);
}
static void
ohci_suspend(ohci_softc_t *sc)
{
DPRINTF("\n");
#ifdef USB_DEBUG
if (ohcidebug > 2)
ohci_dumpregs(sc);
#endif
/* reset HC and leave it suspended */
ohci_controller_init(sc, 1);
}
static void
ohci_resume(ohci_softc_t *sc)
{
DPRINTF("\n");
#ifdef USB_DEBUG
if (ohcidebug > 2)
ohci_dumpregs(sc);
#endif
/* some broken BIOSes never initialize the Controller chip */
ohci_controller_init(sc, 0);
/* catch any lost interrupts */
ohci_do_poll(&sc->sc_bus);
}
#ifdef USB_DEBUG
static void
ohci_dumpregs(ohci_softc_t *sc)
{
struct ohci_hcca *hcca;
DPRINTF("ohci_dumpregs: rev=0x%08x control=0x%08x command=0x%08x\n",
OREAD4(sc, OHCI_REVISION),
OREAD4(sc, OHCI_CONTROL),
OREAD4(sc, OHCI_COMMAND_STATUS));
DPRINTF(" intrstat=0x%08x intre=0x%08x intrd=0x%08x\n",
OREAD4(sc, OHCI_INTERRUPT_STATUS),
OREAD4(sc, OHCI_INTERRUPT_ENABLE),
OREAD4(sc, OHCI_INTERRUPT_DISABLE));
DPRINTF(" hcca=0x%08x percur=0x%08x ctrlhd=0x%08x\n",
OREAD4(sc, OHCI_HCCA),
OREAD4(sc, OHCI_PERIOD_CURRENT_ED),
OREAD4(sc, OHCI_CONTROL_HEAD_ED));
DPRINTF(" ctrlcur=0x%08x bulkhd=0x%08x bulkcur=0x%08x\n",
OREAD4(sc, OHCI_CONTROL_CURRENT_ED),
OREAD4(sc, OHCI_BULK_HEAD_ED),
OREAD4(sc, OHCI_BULK_CURRENT_ED));
DPRINTF(" done=0x%08x fmival=0x%08x fmrem=0x%08x\n",
OREAD4(sc, OHCI_DONE_HEAD),
OREAD4(sc, OHCI_FM_INTERVAL),
OREAD4(sc, OHCI_FM_REMAINING));
DPRINTF(" fmnum=0x%08x perst=0x%08x lsthrs=0x%08x\n",
OREAD4(sc, OHCI_FM_NUMBER),
OREAD4(sc, OHCI_PERIODIC_START),
OREAD4(sc, OHCI_LS_THRESHOLD));
DPRINTF(" desca=0x%08x descb=0x%08x stat=0x%08x\n",
OREAD4(sc, OHCI_RH_DESCRIPTOR_A),
OREAD4(sc, OHCI_RH_DESCRIPTOR_B),
OREAD4(sc, OHCI_RH_STATUS));
DPRINTF(" port1=0x%08x port2=0x%08x\n",
OREAD4(sc, OHCI_RH_PORT_STATUS(1)),
OREAD4(sc, OHCI_RH_PORT_STATUS(2)));
hcca = ohci_get_hcca(sc);
DPRINTF(" HCCA: frame_number=0x%04x done_head=0x%08x\n",
le32toh(hcca->hcca_frame_number),
le32toh(hcca->hcca_done_head));
}
static void
ohci_dump_tds(ohci_td_t *std)
{
for (; std; std = std->obj_next) {
if (ohci_dump_td(std)) {
break;
}
}
}
static uint8_t
ohci_dump_td(ohci_td_t *std)
{
uint32_t td_flags;
uint8_t temp;
usb_pc_cpu_invalidate(std->page_cache);
td_flags = le32toh(std->td_flags);
temp = (std->td_next == 0);
printf("TD(%p) at 0x%08x: %s%s%s%s%s delay=%d ec=%d "
"cc=%d\ncbp=0x%08x next=0x%08x be=0x%08x\n",
std, le32toh(std->td_self),
(td_flags & OHCI_TD_R) ? "-R" : "",
(td_flags & OHCI_TD_OUT) ? "-OUT" : "",
(td_flags & OHCI_TD_IN) ? "-IN" : "",
((td_flags & OHCI_TD_TOGGLE_MASK) == OHCI_TD_TOGGLE_1) ? "-TOG1" : "",
((td_flags & OHCI_TD_TOGGLE_MASK) == OHCI_TD_TOGGLE_0) ? "-TOG0" : "",
OHCI_TD_GET_DI(td_flags),
OHCI_TD_GET_EC(td_flags),
OHCI_TD_GET_CC(td_flags),
le32toh(std->td_cbp),
le32toh(std->td_next),
le32toh(std->td_be));
return (temp);
}
static uint8_t
ohci_dump_itd(ohci_itd_t *sitd)
{
uint32_t itd_flags;
uint16_t i;
uint8_t temp;
usb_pc_cpu_invalidate(sitd->page_cache);
itd_flags = le32toh(sitd->itd_flags);
temp = (sitd->itd_next == 0);
printf("ITD(%p) at 0x%08x: sf=%d di=%d fc=%d cc=%d\n"
"bp0=0x%08x next=0x%08x be=0x%08x\n",
sitd, le32toh(sitd->itd_self),
OHCI_ITD_GET_SF(itd_flags),
OHCI_ITD_GET_DI(itd_flags),
OHCI_ITD_GET_FC(itd_flags),
OHCI_ITD_GET_CC(itd_flags),
le32toh(sitd->itd_bp0),
le32toh(sitd->itd_next),
le32toh(sitd->itd_be));
for (i = 0; i < OHCI_ITD_NOFFSET; i++) {
printf("offs[%d]=0x%04x ", i,
(uint32_t)le16toh(sitd->itd_offset[i]));
}
printf("\n");
return (temp);
}
static void
ohci_dump_itds(ohci_itd_t *sitd)
{
for (; sitd; sitd = sitd->obj_next) {
if (ohci_dump_itd(sitd)) {
break;
}
}
}
static void
ohci_dump_ed(ohci_ed_t *sed)
{
uint32_t ed_flags;
uint32_t ed_headp;
usb_pc_cpu_invalidate(sed->page_cache);
ed_flags = le32toh(sed->ed_flags);
ed_headp = le32toh(sed->ed_headp);
printf("ED(%p) at 0x%08x: addr=%d endpt=%d maxp=%d flags=%s%s%s%s%s\n"
"tailp=0x%08x headflags=%s%s headp=0x%08x nexted=0x%08x\n",
sed, le32toh(sed->ed_self),
OHCI_ED_GET_FA(ed_flags),
OHCI_ED_GET_EN(ed_flags),
OHCI_ED_GET_MAXP(ed_flags),
(ed_flags & OHCI_ED_DIR_OUT) ? "-OUT" : "",
(ed_flags & OHCI_ED_DIR_IN) ? "-IN" : "",
(ed_flags & OHCI_ED_SPEED) ? "-LOWSPEED" : "",
(ed_flags & OHCI_ED_SKIP) ? "-SKIP" : "",
(ed_flags & OHCI_ED_FORMAT_ISO) ? "-ISO" : "",
le32toh(sed->ed_tailp),
(ed_headp & OHCI_HALTED) ? "-HALTED" : "",
(ed_headp & OHCI_TOGGLECARRY) ? "-CARRY" : "",
le32toh(sed->ed_headp),
le32toh(sed->ed_next));
}
#endif
static void
ohci_transfer_intr_enqueue(struct usb_xfer *xfer)
{
/* check for early completion */
if (ohci_check_transfer(xfer)) {
return;
}
/* put transfer on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0) {
usbd_transfer_timeout_ms(xfer, &ohci_timeout, xfer->timeout);
}
}
#define OHCI_APPEND_QH(sed,last) (last) = _ohci_append_qh(sed,last)
static ohci_ed_t *
_ohci_append_qh(ohci_ed_t *sed, ohci_ed_t *last)
{
DPRINTFN(11, "%p to %p\n", sed, last);
if (sed->prev != NULL) {
/* should not happen */
DPRINTFN(0, "ED already linked!\n");
return (last);
}
/* (sc->sc_bus.bus_mtx) must be locked */
sed->next = last->next;
sed->ed_next = last->ed_next;
sed->ed_tailp = 0;
sed->prev = last;
usb_pc_cpu_flush(sed->page_cache);
/*
* the last->next->prev is never followed: sed->next->prev = sed;
*/
last->next = sed;
last->ed_next = sed->ed_self;
usb_pc_cpu_flush(last->page_cache);
return (sed);
}
#define OHCI_REMOVE_QH(sed,last) (last) = _ohci_remove_qh(sed,last)
static ohci_ed_t *
_ohci_remove_qh(ohci_ed_t *sed, ohci_ed_t *last)
{
DPRINTFN(11, "%p from %p\n", sed, last);
/* (sc->sc_bus.bus_mtx) must be locked */
/* only remove if not removed from a queue */
if (sed->prev) {
sed->prev->next = sed->next;
sed->prev->ed_next = sed->ed_next;
usb_pc_cpu_flush(sed->prev->page_cache);
if (sed->next) {
sed->next->prev = sed->prev;
usb_pc_cpu_flush(sed->next->page_cache);
}
last = ((last == sed) ? sed->prev : last);
sed->prev = 0;
usb_pc_cpu_flush(sed->page_cache);
}
return (last);
}
static void
ohci_isoc_done(struct usb_xfer *xfer)
{
uint8_t nframes;
uint32_t *plen = xfer->frlengths;
volatile uint16_t *olen;
uint16_t len = 0;
ohci_itd_t *td = xfer->td_transfer_first;
while (1) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
#ifdef USB_DEBUG
if (ohcidebug > 5) {
DPRINTF("isoc TD\n");
ohci_dump_itd(td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
nframes = td->frames;
olen = &td->itd_offset[0];
if (nframes > 8) {
nframes = 8;
}
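/*
 * Each packet status word holds the completion code in its upper
 * four bits and the transferred length in the lower twelve bits.
 */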
while (nframes--) {
len = le16toh(*olen);
if ((len >> 12) == OHCI_CC_NOT_ACCESSED) {
len = 0;
} else {
len &= ((1 << 12) - 1);
}
if (len > *plen) {
len = 0;/* invalid length */
}
*plen = len;
plen++;
olen++;
}
if (((void *)td) == xfer->td_transfer_last) {
break;
}
td = td->obj_next;
}
xfer->aframes = xfer->nframes;
ohci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
}
#ifdef USB_DEBUG
static const char *const
ohci_cc_strs[] =
{
"NO_ERROR",
"CRC",
"BIT_STUFFING",
"DATA_TOGGLE_MISMATCH",
"STALL",
"DEVICE_NOT_RESPONDING",
"PID_CHECK_FAILURE",
"UNEXPECTED_PID",
"DATA_OVERRUN",
"DATA_UNDERRUN",
"BUFFER_OVERRUN",
"BUFFER_UNDERRUN",
"reserved",
"reserved",
"NOT_ACCESSED",
"NOT_ACCESSED"
};
#endif
static usb_error_t
ohci_non_isoc_done_sub(struct usb_xfer *xfer)
{
ohci_td_t *td;
ohci_td_t *td_alt_next;
uint32_t temp;
uint32_t phy_start;
uint32_t phy_end;
uint32_t td_flags;
uint16_t cc;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
td_flags = 0;
if (xfer->aframes != xfer->nframes) {
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
phy_start = le32toh(td->td_cbp);
td_flags = le32toh(td->td_flags);
cc = OHCI_TD_GET_CC(td_flags);
if (phy_start) {
/*
* short transfer - compute the number of remaining
* bytes in the hardware buffer:
*/
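/*
 * "td_cbp" and "td_be" may point into different 4K pages; in that
 * case the buffer crosses a page boundary, which the extra
 * OHCI_PAGE_SIZE term below accounts for.
 */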
phy_end = le32toh(td->td_be);
temp = (OHCI_PAGE(phy_start ^ phy_end) ?
(OHCI_PAGE_SIZE + 1) : 0x0001);
temp += OHCI_PAGE_OFFSET(phy_end);
temp -= OHCI_PAGE_OFFSET(phy_start);
if (temp > td->len) {
/* guard against corruption */
cc = OHCI_CC_STALL;
} else if (xfer->aframes != xfer->nframes) {
/*
* Sum up total transfer length
* in "frlengths[]":
*/
xfer->frlengths[xfer->aframes] += td->len - temp;
}
} else {
if (xfer->aframes != xfer->nframes) {
/* transfer was complete */
xfer->frlengths[xfer->aframes] += td->len;
}
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check transfer status */
if (cc) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (phy_start) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
DPRINTFN(16, "error cc=%d (%s)\n",
cc, ohci_cc_strs[cc]);
return ((cc == 0) ? USB_ERR_NORMAL_COMPLETION :
(cc == OHCI_CC_STALL) ? USB_ERR_STALLED : USB_ERR_IOERROR);
}
static void
ohci_non_isoc_done(struct usb_xfer *xfer)
{
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
#ifdef USB_DEBUG
if (ohcidebug > 10) {
ohci_dump_tds(xfer->td_transfer_first);
}
#endif
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
err = ohci_non_isoc_done_sub(xfer);
}
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
while (xfer->aframes != xfer->nframes) {
err = ohci_non_isoc_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
err = ohci_non_isoc_done_sub(xfer);
}
done:
ohci_device_done(xfer, err);
}
/*------------------------------------------------------------------------*
* ohci_check_transfer_sub
*------------------------------------------------------------------------*/
static void
ohci_check_transfer_sub(struct usb_xfer *xfer)
{
ohci_td_t *td;
ohci_ed_t *ed;
uint32_t phy_start;
uint32_t td_flags;
uint32_t td_next;
uint16_t cc;
td = xfer->td_transfer_cache;
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
phy_start = le32toh(td->td_cbp);
td_flags = le32toh(td->td_flags);
td_next = le32toh(td->td_next);
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check transfer status */
cc = OHCI_TD_GET_CC(td_flags);
if (cc) {
/* the transfer is finished */
td = NULL;
break;
}
/*
* Check if we reached the last packet
* or if there is a short packet:
*/
if (((td_next & (~0xF)) == OHCI_TD_NEXT_END) || phy_start) {
/* follow alt next */
td = td->alt_next;
break;
}
td = td->obj_next;
}
/* update transfer cache */
xfer->td_transfer_cache = td;
if (td) {
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
ed->ed_headp = td->td_self;
usb_pc_cpu_flush(ed->page_cache);
DPRINTFN(13, "xfer=%p following alt next\n", xfer);
/*
* Make sure that the OHCI re-scans the schedule by
* writing the BLF and CLF bits:
*/
if (xfer->xroot->udev->flags.self_suspended) {
/* nothing to do */
} else if (xfer->endpoint->methods == &ohci_device_bulk_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
} else if (xfer->endpoint->methods == &ohci_device_ctrl_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
}
}
/*------------------------------------------------------------------------*
* ohci_check_transfer
*
* Return values:
* 0: USB transfer is not finished
* Else: USB transfer is finished
*------------------------------------------------------------------------*/
static uint8_t
ohci_check_transfer(struct usb_xfer *xfer)
{
ohci_ed_t *ed;
uint32_t ed_headp;
uint32_t ed_tailp;
DPRINTFN(13, "xfer=%p checking transfer\n", xfer);
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(ed->page_cache);
ed_headp = le32toh(ed->ed_headp);
ed_tailp = le32toh(ed->ed_tailp);
if ((ed_headp & OHCI_HALTED) ||
(((ed_headp ^ ed_tailp) & (~0xF)) == 0)) {
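/*
 * The endpoint is considered done when the HC has halted it or
 * when the TD head pointer has caught up with the tail pointer;
 * the low bits of ed_headp carry the halted and toggle-carry
 * flags, so the low nibble is masked off before comparing.
 */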
if (xfer->endpoint->methods == &ohci_device_isoc_methods) {
/* isochronous transfer */
ohci_isoc_done(xfer);
} else {
if (xfer->flags_int.short_frames_ok) {
ohci_check_transfer_sub(xfer);
if (xfer->td_transfer_cache) {
/* not finished yet */
return (0);
}
}
/* store data-toggle */
if (ed_headp & OHCI_TOGGLECARRY) {
xfer->endpoint->toggle_next = 1;
} else {
xfer->endpoint->toggle_next = 0;
}
/* non-isochronous transfer */
ohci_non_isoc_done(xfer);
}
return (1);
}
DPRINTFN(13, "xfer=%p is still active\n", xfer);
return (0);
}
static void
ohci_rhsc_enable(ohci_softc_t *sc)
{
DPRINTFN(5, "\n");
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
sc->sc_eintrs |= OHCI_RHSC;
OWRITE4(sc, OHCI_INTERRUPT_ENABLE, OHCI_RHSC);
/* acknowledge any RHSC interrupt */
OWRITE4(sc, OHCI_INTERRUPT_STATUS, OHCI_RHSC);
ohci_root_intr(sc);
}
static void
ohci_interrupt_poll(ohci_softc_t *sc)
{
struct usb_xfer *xfer;
repeat:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/*
* check if transfer is transferred
*/
if (ohci_check_transfer(xfer)) {
/* queue has been modified */
goto repeat;
}
}
}
/*------------------------------------------------------------------------*
* ohci_interrupt - OHCI interrupt handler
*
* NOTE: Do not access "sc->sc_bus.bdev" inside the interrupt handler,
* because the interrupt handler is set up before "sc->sc_bus.bdev"
* is present!
*------------------------------------------------------------------------*/
void
ohci_interrupt(ohci_softc_t *sc)
{
struct ohci_hcca *hcca;
uint32_t status;
uint32_t done;
USB_BUS_LOCK(&sc->sc_bus);
hcca = ohci_get_hcca(sc);
DPRINTFN(16, "real interrupt\n");
#ifdef USB_DEBUG
if (ohcidebug > 15) {
ohci_dumpregs(sc);
}
#endif
done = le32toh(hcca->hcca_done_head);
/*
* The LSb of done is used to inform the HC Driver that an interrupt
* condition exists for both the Done list and for another event
* recorded in HcInterruptStatus. On an interrupt from the HC, the
* HC Driver checks the HccaDoneHead Value. If this value is 0, then
* the interrupt was caused by other than the HccaDoneHead update
* and the HcInterruptStatus register needs to be accessed to
determine the exact interrupt cause. If HccaDoneHead is nonzero,
* then a Done list update interrupt is indicated and if the LSb of
* done is nonzero, then an additional interrupt event is indicated
* and HcInterruptStatus should be checked to determine its cause.
*/
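/*
 * Illustration (example values only): done == 0x00ABC001 means
 * the done queue starts at physical address 0x00ABC000 and the
 * set LSb indicates that HcInterruptStatus must be read as well.
 */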
if (done != 0) {
status = 0;
if (done & ~OHCI_DONE_INTRS) {
status |= OHCI_WDH;
}
if (done & OHCI_DONE_INTRS) {
status |= OREAD4(sc, OHCI_INTERRUPT_STATUS);
}
hcca->hcca_done_head = 0;
usb_pc_cpu_flush(&sc->sc_hw.hcca_pc);
} else {
status = OREAD4(sc, OHCI_INTERRUPT_STATUS) & ~OHCI_WDH;
}
status &= ~OHCI_MIE;
if (status == 0) {
/*
* nothing to be done (PCI shared
* interrupt)
*/
goto done;
}
OWRITE4(sc, OHCI_INTERRUPT_STATUS, status); /* Acknowledge */
status &= sc->sc_eintrs;
if (status == 0) {
goto done;
}
if (status & (OHCI_SO | OHCI_RD | OHCI_UE | OHCI_RHSC)) {
#if 0
if (status & OHCI_SO) {
/* XXX do what */
}
#endif
if (status & OHCI_RD) {
printf("%s: resume detect\n", __FUNCTION__);
/* XXX process resume detect */
}
if (status & OHCI_UE) {
printf("%s: unrecoverable error, "
"controller halted\n", __FUNCTION__);
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
/* XXX what else */
}
if (status & OHCI_RHSC) {
/*
* Disable RHSC interrupt for now, because it will be
* on until the port has been reset.
*/
sc->sc_eintrs &= ~OHCI_RHSC;
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_RHSC);
ohci_root_intr(sc);
/* do not allow RHSC interrupts > 1 per second */
usb_callout_reset(&sc->sc_tmo_rhsc, hz,
(void *)&ohci_rhsc_enable, sc);
}
}
status &= ~(OHCI_RHSC | OHCI_WDH | OHCI_SO);
if (status != 0) {
/* Block unprocessed interrupts. XXX */
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, status);
sc->sc_eintrs &= ~status;
printf("%s: blocking intrs 0x%x\n",
__FUNCTION__, status);
}
/* poll all the USB transfers */
ohci_interrupt_poll(sc);
done:
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*
* called when a request does not complete
*/
static void
ohci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* transfer is transferred */
ohci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
ohci_do_poll(struct usb_bus *bus)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
ohci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
static void
ohci_setup_standard_chain_sub(struct ohci_std_temp *temp)
{
struct usb_page_search buf_res;
ohci_td_t *td;
ohci_td_t *td_next;
ohci_td_t *td_alt_next;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint8_t shortpkt_old;
uint8_t precompute;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
precompute = 1;
/* software is used to detect short incoming transfers */
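/*
 * I.e. the bufferRounding (R) bit is set on IN transfers so a
 * short packet is accepted without halting the endpoint; the
 * driver then detects the short transfer from the remaining
 * buffer pointer when the TD completes.
 */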
if ((temp->td_flags & htole32(OHCI_TD_DP_MASK)) == htole32(OHCI_TD_IN)) {
temp->td_flags |= htole32(OHCI_TD_R);
} else {
temp->td_flags &= ~htole32(OHCI_TD_R);
}
restart:
td = temp->td;
td_next = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt) {
break;
}
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_frame_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL) {
panic("%s: out of OHCI transfer descriptors!", __FUNCTION__);
}
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->td_flags = temp->td_flags;
/* the next TD uses TOGGLE_CARRY */
temp->td_flags &= ~htole32(OHCI_TD_TOGGLE_MASK);
if (average == 0) {
/*
* The buffer start and end phys addresses should be
* 0x0 for a zero length packet.
*/
td->td_cbp = 0;
td->td_be = 0;
td->len = 0;
} else {
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->td_cbp = htole32(buf_res.physaddr);
buf_offset += (average - 1);
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->td_be = htole32(buf_res.physaddr);
buf_offset++;
td->len = average;
/* update remaining length */
temp->len -= average;
}
if ((td_next == td_alt_next) && temp->setup_alt_next) {
/* we need to receive these frames one by one! */
td->td_flags &= htole32(~OHCI_TD_INTR_MASK);
td->td_flags |= htole32(OHCI_TD_SET_DI(1));
td->td_next = htole32(OHCI_TD_NEXT_END);
} else {
if (td_next) {
/* link the current TD with the next one */
td->td_next = td_next->td_self;
}
}
td->alt_next = td_alt_next;
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* setup alt next pointer, if any */
if (temp->last_frame) {
/* no alternate next */
td_alt_next = NULL;
} else {
/* we use this field internally */
td_alt_next = td_next;
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
temp->td = td;
temp->td_next = td_next;
}
static void
ohci_setup_standard_chain(struct usb_xfer *xfer, ohci_ed_t **ed_last)
{
struct ohci_std_temp temp;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
ohci_td_t *td;
uint32_t ed_flags;
uint32_t x;
DPRINTFN(9, "addr=%d endpt=%d sumlen=%d speed=%d\n",
xfer->address, UE_GET_ADDR(xfer->endpointno),
xfer->sumlen, usbd_get_speed(xfer->xroot->udev));
temp.average = xfer->max_hc_frame_size;
temp.max_frame_size = xfer->max_frame_size;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
temp.td = NULL;
temp.td_next = td;
temp.last_frame = 0;
temp.setup_alt_next = xfer->flags_int.short_frames_ok;
methods = xfer->endpoint->methods;
/* check if we should prepend a setup message */
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
temp.td_flags = htole32(OHCI_TD_SETUP | OHCI_TD_NOCC |
OHCI_TD_TOGGLE_0 | OHCI_TD_NOINTR);
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
ohci_setup_standard_chain_sub(&temp);
/*
* XXX assume that the setup message is
* contained within one USB packet:
*/
xfer->endpoint->toggle_next = 1;
}
x = 1;
} else {
x = 0;
}
temp.td_flags = htole32(OHCI_TD_NOCC | OHCI_TD_NOINTR);
/* set data toggle */
if (xfer->endpoint->toggle_next) {
temp.td_flags |= htole32(OHCI_TD_TOGGLE_1);
} else {
temp.td_flags |= htole32(OHCI_TD_TOGGLE_0);
}
/* set endpoint direction */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp.td_flags |= htole32(OHCI_TD_IN);
} else {
temp.td_flags |= htole32(OHCI_TD_OUT);
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.pc = xfer->frbuffers + x;
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
} else {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
} else {
/* regular data transfer */
temp.shortpkt = (xfer->flags.force_short_xfer) ? 0 : 1;
}
ohci_setup_standard_chain_sub(&temp);
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current endpoint
* direction.
*/
/* set endpoint direction and data toggle */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp.td_flags = htole32(OHCI_TD_OUT |
OHCI_TD_NOCC | OHCI_TD_TOGGLE_1 | OHCI_TD_SET_DI(1));
} else {
temp.td_flags = htole32(OHCI_TD_IN |
OHCI_TD_NOCC | OHCI_TD_TOGGLE_1 | OHCI_TD_SET_DI(1));
}
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.setup_alt_next = 0;
ohci_setup_standard_chain_sub(&temp);
}
td = temp.td;
/* Ensure that last TD is terminating: */
td->td_next = htole32(OHCI_TD_NEXT_END);
td->td_flags &= ~htole32(OHCI_TD_INTR_MASK);
td->td_flags |= htole32(OHCI_TD_SET_DI(1));
usb_pc_cpu_flush(td->page_cache);
/* must have at least one frame! */
xfer->td_transfer_last = td;
#ifdef USB_DEBUG
if (ohcidebug > 8) {
DPRINTF("nexttog=%d; data before transfer:\n",
xfer->endpoint->toggle_next);
ohci_dump_tds(xfer->td_transfer_first);
}
#endif
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
ed_flags = (OHCI_ED_SET_FA(xfer->address) |
OHCI_ED_SET_EN(UE_GET_ADDR(xfer->endpointno)) |
OHCI_ED_SET_MAXP(xfer->max_frame_size));
ed_flags |= (OHCI_ED_FORMAT_GEN | OHCI_ED_DIR_TD);
if (xfer->xroot->udev->speed == USB_SPEED_LOW) {
ed_flags |= OHCI_ED_SPEED;
}
ed->ed_flags = htole32(ed_flags);
td = xfer->td_transfer_first;
ed->ed_headp = td->td_self;
if (xfer->xroot->udev->flags.self_suspended == 0) {
/* the append function will flush the endpoint descriptor */
OHCI_APPEND_QH(ed, *ed_last);
if (methods == &ohci_device_bulk_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
}
if (methods == &ohci_device_ctrl_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
} else {
usb_pc_cpu_flush(ed->page_cache);
}
}
static void
ohci_root_intr(ohci_softc_t *sc)
{
uint32_t hstatus;
uint16_t i;
uint16_t m;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
hstatus = OREAD4(sc, OHCI_RH_STATUS);
DPRINTF("sc=%p hstatus=0x%08x\n",
sc, hstatus);
/* set bits */
m = (sc->sc_noport + 1);
if (m > (8 * sizeof(sc->sc_hub_idata))) {
m = (8 * sizeof(sc->sc_hub_idata));
}
for (i = 1; i < m; i++) {
/* pick out CHANGE bits from the status register */
if (OREAD4(sc, OHCI_RH_PORT_STATUS(i)) >> 16) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
/* NOTE: "done" can be run two times in a row,
* from close and from interrupt
*/
static void
ohci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
ohci_ed_t *ed;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (ed) {
usb_pc_cpu_invalidate(ed->page_cache);
}
if (methods == &ohci_device_bulk_methods) {
OHCI_REMOVE_QH(ed, sc->sc_bulk_p_last);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_REMOVE_QH(ed, sc->sc_ctrl_p_last);
}
if (methods == &ohci_device_intr_methods) {
OHCI_REMOVE_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
if (methods == &ohci_device_isoc_methods) {
OHCI_REMOVE_QH(ed, sc->sc_isoc_p_last);
}
xfer->td_transfer_first = NULL;
xfer->td_transfer_last = NULL;
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* ohci bulk support
*------------------------------------------------------------------------*/
static void
ohci_device_bulk_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_bulk_close(struct usb_xfer *xfer)
{
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_bulk_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_bulk_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_bulk_p_last);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_bulk_methods =
{
.open = ohci_device_bulk_open,
.close = ohci_device_bulk_close,
.enter = ohci_device_bulk_enter,
.start = ohci_device_bulk_start,
};
/*------------------------------------------------------------------------*
* ohci control support
*------------------------------------------------------------------------*/
static void
ohci_device_ctrl_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_ctrl_close(struct usb_xfer *xfer)
{
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_ctrl_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_ctrl_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_ctrl_p_last);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_ctrl_methods =
{
.open = ohci_device_ctrl_open,
.close = ohci_device_ctrl_close,
.enter = ohci_device_ctrl_enter,
.start = ohci_device_ctrl_start,
};
/*------------------------------------------------------------------------*
* ohci interrupt support
*------------------------------------------------------------------------*/
static void
ohci_device_intr_open(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
uint16_t best;
uint16_t bit;
uint16_t x;
best = 0;
bit = OHCI_NO_EDS / 2;
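/*
 * Select the largest power-of-two polling interval that does not
 * exceed the requested interval, then pick the least loaded ED
 * slot within that interval group (slots [bit, 2*bit) of
 * sc_intr_stat[]).
 */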
while (bit) {
if (xfer->interval >= bit) {
x = bit;
best = bit;
while (x & bit) {
if (sc->sc_intr_stat[x] <
sc->sc_intr_stat[best]) {
best = x;
}
x++;
}
break;
}
bit >>= 1;
}
sc->sc_intr_stat[best]++;
xfer->qh_pos = best;
DPRINTFN(3, "best=%d interval=%d\n",
best, xfer->interval);
}
static void
ohci_device_intr_close(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
sc->sc_intr_stat[xfer->qh_pos]--;
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_intr_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_intr_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_intr_p_last[xfer->qh_pos]);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_intr_methods =
{
.open = ohci_device_intr_open,
.close = ohci_device_intr_close,
.enter = ohci_device_intr_enter,
.start = ohci_device_intr_start,
};
/*------------------------------------------------------------------------*
* ohci isochronous support
*------------------------------------------------------------------------*/
static void
ohci_device_isoc_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_isoc_close(struct usb_xfer *xfer)
{
/**/
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_isoc_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
struct ohci_hcca *hcca;
uint32_t buf_offset;
uint32_t nframes;
uint32_t ed_flags;
uint32_t *plen;
uint16_t itd_offset[OHCI_ITD_NOFFSET];
uint16_t length;
uint8_t ncur;
ohci_itd_t *td;
ohci_itd_t *td_last = NULL;
ohci_ed_t *ed;
hcca = ohci_get_hcca(sc);
nframes = le32toh(hcca->hcca_frame_number);
DPRINTFN(6, "xfer=%p isoc_next=%u nframes=%u hcca_fn=%u\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes, nframes);
if ((xfer->endpoint->is_synced == 0) ||
(((nframes - xfer->endpoint->isoc_next) & 0xFFFF) < xfer->nframes) ||
(((xfer->endpoint->isoc_next - nframes) & 0xFFFF) >= 128)) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) & 0xFFFF;
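/* full-speed frames are 1 ms each, so this schedules roughly 3 ms ahead */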
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = ((xfer->endpoint->isoc_next - nframes) & 0xFFFF);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
(usb_isoc_time_expand(&sc->sc_bus, nframes) + buf_offset +
xfer->nframes);
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
ncur = 0;
length = 0;
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
itd_offset[ncur] = length;
buf_offset += *plen;
length += *plen;
plen++;
ncur++;
if ( /* check if the ITD is full */
(ncur == OHCI_ITD_NOFFSET) ||
/* check if we have put more than 4K into the ITD */
(length & 0xF000) ||
/* check if it is the last frame */
(nframes == 0)) {
/* fill current ITD */
td->itd_flags = htole32(
OHCI_ITD_NOCC |
OHCI_ITD_SET_SF(xfer->endpoint->isoc_next) |
OHCI_ITD_NOINTR |
OHCI_ITD_SET_FC(ncur));
td->frames = ncur;
xfer->endpoint->isoc_next += ncur;
if (length == 0) {
/* all zero */
td->itd_bp0 = 0;
td->itd_be = ~0;
while (ncur--) {
td->itd_offset[ncur] =
htole16(OHCI_ITD_MK_OFFS(0));
}
} else {
usbd_get_page(xfer->frbuffers, buf_offset - length, &buf_res);
length = OHCI_PAGE_MASK(buf_res.physaddr);
buf_res.physaddr =
OHCI_PAGE(buf_res.physaddr);
td->itd_bp0 = htole32(buf_res.physaddr);
usbd_get_page(xfer->frbuffers, buf_offset - 1, &buf_res);
td->itd_be = htole32(buf_res.physaddr);
while (ncur--) {
itd_offset[ncur] += length;
itd_offset[ncur] =
OHCI_ITD_MK_OFFS(itd_offset[ncur]);
td->itd_offset[ncur] =
htole16(itd_offset[ncur]);
}
}
ncur = 0;
length = 0;
td_last = td;
td = td->obj_next;
if (td) {
/* link the last TD with the next one */
td_last->itd_next = td->itd_self;
}
usb_pc_cpu_flush(td_last->page_cache);
}
}
/* update the last TD */
td_last->itd_flags &= ~htole32(OHCI_ITD_NOINTR);
td_last->itd_flags |= htole32(OHCI_ITD_SET_DI(0));
td_last->itd_next = 0;
usb_pc_cpu_flush(td_last->page_cache);
xfer->td_transfer_last = td_last;
#ifdef USB_DEBUG
if (ohcidebug > 8) {
DPRINTF("data before transfer:\n");
ohci_dump_itds(xfer->td_transfer_first);
}
#endif
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN)
ed_flags = (OHCI_ED_DIR_IN | OHCI_ED_FORMAT_ISO);
else
ed_flags = (OHCI_ED_DIR_OUT | OHCI_ED_FORMAT_ISO);
ed_flags |= (OHCI_ED_SET_FA(xfer->address) |
OHCI_ED_SET_EN(UE_GET_ADDR(xfer->endpointno)) |
OHCI_ED_SET_MAXP(xfer->max_frame_size));
if (xfer->xroot->udev->speed == USB_SPEED_LOW) {
ed_flags |= OHCI_ED_SPEED;
}
ed->ed_flags = htole32(ed_flags);
td = xfer->td_transfer_first;
ed->ed_headp = td->itd_self;
/* isochronous transfers are not affected by suspend / resume */
/* the append function will flush the endpoint descriptor */
OHCI_APPEND_QH(ed, sc->sc_isoc_p_last);
}
static void
ohci_device_isoc_start(struct usb_xfer *xfer)
{
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_isoc_methods =
{
.open = ohci_device_isoc_open,
.close = ohci_device_isoc_close,
.enter = ohci_device_isoc_enter,
.start = ohci_device_isoc_start,
};
/*------------------------------------------------------------------------*
* ohci root control support
*------------------------------------------------------------------------*
* Simulate a hardware hub by handling all the necessary requests.
*------------------------------------------------------------------------*/
static const
struct usb_device_descriptor ohci_devd =
{
sizeof(struct usb_device_descriptor),
UDESC_DEVICE, /* type */
{0x00, 0x01}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_FSHUB, /* protocol */
64, /* max packet */
{0}, {0}, {0x00, 0x01}, /* device id */
1, 2, 0, /* string indexes */
1 /* # of configurations */
};
static const
struct ohci_config_desc ohci_confd =
{
.confd = {
.bLength = sizeof(struct usb_config_descriptor),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(ohci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0, /* max power */
},
.ifcd = {
.bLength = sizeof(struct usb_interface_descriptor),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(struct usb_endpoint_descriptor),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | OHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 32,/* max packet (255 ports) */
.bInterval = 255,
},
};
static const
struct usb_hub_descriptor ohci_hubd =
{
.bDescLength = 0, /* dynamic length */
.bDescriptorType = UDESC_HUB,
};
static usb_error_t
ohci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
ohci_softc_t *sc = OHCI_BUS2SC(udev->bus);
const void *ptr;
const char *str_ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t value;
uint16_t index;
uint8_t l;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc.temp;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ohci_devd);
ptr = (const void *)&ohci_devd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ohci_confd);
ptr = (const void *)&ohci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "OHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= OHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_addr = value;
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if ((value != 0) && (value != 1)) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE "
"port=%d feature=%d\n",
index, value);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = OHCI_RH_PORT_STATUS(index);
switch (value) {
case UHF_PORT_ENABLE:
OWRITE4(sc, port, UPS_CURRENT_CONNECT_STATUS);
break;
case UHF_PORT_SUSPEND:
OWRITE4(sc, port, UPS_OVERCURRENT_INDICATOR);
break;
case UHF_PORT_POWER:
/* Yes, writing to the LOW_SPEED bit clears power. */
OWRITE4(sc, port, UPS_LOW_SPEED);
break;
case UHF_C_PORT_CONNECTION:
OWRITE4(sc, port, UPS_C_CONNECT_STATUS << 16);
break;
case UHF_C_PORT_ENABLE:
OWRITE4(sc, port, UPS_C_PORT_ENABLED << 16);
break;
case UHF_C_PORT_SUSPEND:
OWRITE4(sc, port, UPS_C_SUSPEND << 16);
break;
case UHF_C_PORT_OVER_CURRENT:
OWRITE4(sc, port, UPS_C_OVERCURRENT_INDICATOR << 16);
break;
case UHF_C_PORT_RESET:
OWRITE4(sc, port, UPS_C_PORT_RESET << 16);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
switch (value) {
case UHF_C_PORT_CONNECTION:
case UHF_C_PORT_ENABLE:
case UHF_C_PORT_SUSPEND:
case UHF_C_PORT_OVER_CURRENT:
case UHF_C_PORT_RESET:
/* enable RHSC interrupt if condition is cleared. */
if ((OREAD4(sc, port) >> 16) == 0)
ohci_rhsc_enable(sc);
break;
default:
break;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = OREAD4(sc, OHCI_RH_DESCRIPTOR_A);
sc->sc_hub_desc.hubd = ohci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics,
(v & OHCI_NPS ? UHD_PWR_NO_SWITCH :
v & OHCI_PSM ? UHD_PWR_GANGED : UHD_PWR_INDIVIDUAL)
/* XXX overcurrent */
);
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = OHCI_GET_POTPGT(v);
v = OREAD4(sc, OHCI_RH_DESCRIPTOR_B);
for (l = 0; l < sc->sc_noport; l++) {
if (v & 1) {
sc->sc_hub_desc.hubd.DeviceRemovable[l / 8] |= (1 << (l % 8));
}
v >>= 1;
}
sc->sc_hub_desc.hubd.bDescLength =
8 + ((sc->sc_noport + 7) / 8);
len = sc->sc_hub_desc.hubd.bDescLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "get port status i=%d\n",
index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = OREAD4(sc, OHCI_RH_PORT_STATUS(index));
DPRINTFN(9, "port status=0x%04x\n", v);
v &= ~UPS_PORT_MODE_DEVICE; /* force host mode */
USETW(sc->sc_hub_desc.ps.wPortStatus, v);
USETW(sc->sc_hub_desc.ps.wPortChange, v >> 16);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = OHCI_RH_PORT_STATUS(index);
switch (value) {
case UHF_PORT_ENABLE:
OWRITE4(sc, port, UPS_PORT_ENABLED);
break;
case UHF_PORT_SUSPEND:
OWRITE4(sc, port, UPS_SUSPEND);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
OWRITE4(sc, port, UPS_RESET);
for (v = 0;; v++) {
if (v < 12) {
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(usb_port_root_reset_delay));
if ((OREAD4(sc, port) & UPS_RESET) == 0) {
break;
}
} else {
err = USB_ERR_TIMEOUT;
goto done;
}
}
DPRINTFN(9, "ohci port %d reset, status = 0x%04x\n",
index, OREAD4(sc, port));
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
OWRITE4(sc, port, UPS_PORT_POWER);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
ohci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
- ohci_softc_t *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t ntd;
uint32_t nitd;
uint32_t nqh;
uint32_t n;
- sc = OHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
parm->hc_max_packet_size = 0x500;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = OHCI_PAGE_SIZE;
/*
* calculate ntd and nqh
*/
if (parm->methods == &ohci_device_ctrl_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_bulk_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_intr_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_isoc_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = ((xfer->max_data_length / OHCI_PAGE_SIZE) +
howmany(xfer->nframes, OHCI_ITD_NOFFSET) +
1 /* EXTRA */ );
ntd = 0;
nqh = 1;
} else {
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = 0;
nqh = 0;
}
alloc_dma_set:
if (parm->err) {
return;
}
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_td_t),
OHCI_TD_ALIGN, ntd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != ntd; n++) {
ohci_td_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->td_self = htole32(page_info.physaddr);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_itd_t),
OHCI_ITD_ALIGN, nitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nitd; n++) {
ohci_itd_t *itd;
usbd_get_page(pc + n, 0, &page_info);
itd = page_info.buffer;
/* init ITD */
itd->itd_self = htole32(page_info.physaddr);
itd->obj_next = last_obj;
itd->page_cache = pc + n;
last_obj = itd;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_ed_t),
OHCI_ED_ALIGN, nqh)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqh; n++) {
ohci_ed_t *ed;
usbd_get_page(pc + n, 0, &page_info);
ed = page_info.buffer;
/* init QH */
ed->ed_self = htole32(page_info.physaddr);
ed->obj_next = last_obj;
ed->page_cache = pc + n;
last_obj = ed;
usb_pc_cpu_flush(pc + n);
}
}
xfer->qh_start[xfer->flags_int.curr_dma_set] = last_obj;
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static void
ohci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
ohci_softc_t *sc = OHCI_BUS2SC(udev->bus);
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d (%d)\n",
ep, udev->address,
edesc->bEndpointAddress, udev->flags.usb_mode,
sc->sc_addr);
if (udev->device_index != sc->sc_addr) {
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
ep->methods = &ohci_device_ctrl_methods;
break;
case UE_INTERRUPT:
ep->methods = &ohci_device_intr_methods;
break;
case UE_ISOCHRONOUS:
if (udev->speed == USB_SPEED_FULL) {
ep->methods = &ohci_device_isoc_methods;
}
break;
case UE_BULK:
ep->methods = &ohci_device_bulk_methods;
break;
default:
/* do nothing */
break;
}
}
}
static void
ohci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
ohci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until hardware has finished any possible use of the
* transfer descriptor(s) and QH
*/
*pus = (1125); /* microseconds */
}
static void
ohci_device_resume(struct usb_device *udev)
{
struct ohci_softc *sc = OHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (methods == &ohci_device_bulk_methods) {
OHCI_APPEND_QH(ed, sc->sc_bulk_p_last);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_APPEND_QH(ed, sc->sc_ctrl_p_last);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
if (methods == &ohci_device_intr_methods) {
OHCI_APPEND_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ohci_device_suspend(struct usb_device *udev)
{
struct ohci_softc *sc = OHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (methods == &ohci_device_bulk_methods) {
OHCI_REMOVE_QH(ed, sc->sc_bulk_p_last);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_REMOVE_QH(ed, sc->sc_ctrl_p_last);
}
if (methods == &ohci_device_intr_methods) {
OHCI_REMOVE_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ohci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
case USB_HW_POWER_SHUTDOWN:
ohci_suspend(sc);
break;
case USB_HW_POWER_RESUME:
ohci_resume(sc);
break;
default:
break;
}
}
static void
ohci_set_hw_power(struct usb_bus *bus)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
uint32_t temp;
uint32_t flags;
DPRINTF("\n");
USB_BUS_LOCK(bus);
flags = bus->hw_power_state;
temp = OREAD4(sc, OHCI_CONTROL);
temp &= ~(OHCI_PLE | OHCI_IE | OHCI_CLE | OHCI_BLE);
if (flags & USB_HW_POWER_CONTROL)
temp |= OHCI_CLE;
if (flags & USB_HW_POWER_BULK)
temp |= OHCI_BLE;
if (flags & USB_HW_POWER_INTERRUPT)
temp |= OHCI_PLE;
if (flags & USB_HW_POWER_ISOC)
temp |= OHCI_IE | OHCI_PLE;
OWRITE4(sc, OHCI_CONTROL, temp);
USB_BUS_UNLOCK(bus);
return;
}
static const struct usb_bus_methods ohci_bus_methods =
{
.endpoint_init = ohci_ep_init,
.xfer_setup = ohci_xfer_setup,
.xfer_unsetup = ohci_xfer_unsetup,
.get_dma_delay = ohci_get_dma_delay,
.device_resume = ohci_device_resume,
.device_suspend = ohci_device_suspend,
.set_hw_power = ohci_set_hw_power,
.set_hw_power_sleep = ohci_set_hw_power_sleep,
.roothub_exec = ohci_roothub_exec,
.xfer_poll = ohci_do_poll,
};
Index: head/sys/dev/usb/controller/xhci.c
===================================================================
--- head/sys/dev/usb/controller/xhci.c (revision 327172)
+++ head/sys/dev/usb/controller/xhci.c (revision 327173)
@@ -1,4361 +1,4359 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB eXtensible Host Controller Interface, a.k.a. USB 3.0 controller.
*
* The XHCI 1.0 spec can be found at
* http://www.intel.com/technology/usb/download/xHCI_Specification_for_USB.pdf
* and the USB 3.0 spec at
* http://www.usb.org/developers/docs/usb_30_spec_060910.zip
*/
/*
* A few words about the design: this driver emulates the concept of
* TDs found in the EHCI specification. That way the USB controller
* drivers look similar to each other, which makes the code easier to
* understand.
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR xhcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/xhci.h>
#include <dev/usb/controller/xhcireg.h>
#define XHCI_BUS2SC(bus) \
((struct xhci_softc *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((struct xhci_softc *)0)->sc_bus))))
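/*
 * containerof()-style conversion: "bus" points at the sc_bus
 * member embedded in a struct xhci_softc, so subtracting the
 * member offset recovers the softc, e.g.:
 *
 *	struct xhci_softc *sc = XHCI_BUS2SC(bus);
 */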
static SYSCTL_NODE(_hw_usb, OID_AUTO, xhci, CTLFLAG_RW, 0, "USB XHCI");
static int xhcistreams;
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, streams, CTLFLAG_RWTUN,
&xhcistreams, 0, "Set to enable streams mode support");
#ifdef USB_DEBUG
static int xhcidebug;
static int xhciroute;
static int xhcipolling;
static int xhcidma32;
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, debug, CTLFLAG_RWTUN,
&xhcidebug, 0, "Debug level");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, xhci_port_route, CTLFLAG_RWTUN,
&xhciroute, 0, "Routing bitmap for switching EHCI ports to the XHCI controller");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, use_polling, CTLFLAG_RWTUN,
&xhcipolling, 0, "Set to enable software interrupt polling for the XHCI controller");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, dma32, CTLFLAG_RWTUN,
&xhcidma32, 0, "Set to only use 32-bit DMA for the XHCI controller");
#else
#define xhciroute 0
#define xhcidma32 0
#endif
#define XHCI_INTR_ENDPT 1
struct xhci_std_temp {
struct xhci_softc *sc;
struct usb_page_cache *pc;
struct xhci_td *td;
struct xhci_td *td_next;
uint32_t len;
uint32_t offset;
uint32_t max_packet_size;
uint32_t average;
uint16_t isoc_delta;
uint16_t isoc_frame;
uint8_t shortpkt;
uint8_t multishort;
uint8_t last_frame;
uint8_t trb_type;
uint8_t direction;
uint8_t tbc;
uint8_t tlbpc;
uint8_t step_td;
uint8_t do_isoc_sync;
};
static void xhci_do_poll(struct usb_bus *);
static void xhci_device_done(struct usb_xfer *, usb_error_t);
static void xhci_root_intr(struct xhci_softc *);
static void xhci_free_device_ext(struct usb_device *);
static struct xhci_endpoint_ext *xhci_get_endpoint_ext(struct usb_device *,
struct usb_endpoint_descriptor *);
static usb_proc_callback_t xhci_configure_msg;
static usb_error_t xhci_configure_device(struct usb_device *);
static usb_error_t xhci_configure_endpoint(struct usb_device *,
struct usb_endpoint_descriptor *, struct xhci_endpoint_ext *,
uint16_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t,
uint8_t);
static usb_error_t xhci_configure_mask(struct usb_device *,
uint32_t, uint8_t);
static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *,
uint64_t, uint8_t);
static void xhci_endpoint_doorbell(struct usb_xfer *);
static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val);
static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr);
static void xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val);
#ifdef USB_DEBUG
static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr);
#endif
static const struct usb_bus_methods xhci_bus_methods;
#ifdef USB_DEBUG
static void
xhci_dump_trb(struct xhci_trb *trb)
{
DPRINTFN(5, "trb = %p\n", trb);
DPRINTFN(5, "qwTrb0 = 0x%016llx\n", (long long)le64toh(trb->qwTrb0));
DPRINTFN(5, "dwTrb2 = 0x%08x\n", le32toh(trb->dwTrb2));
DPRINTFN(5, "dwTrb3 = 0x%08x\n", le32toh(trb->dwTrb3));
}
static void
xhci_dump_endpoint(struct xhci_softc *sc, struct xhci_endp_ctx *pep)
{
DPRINTFN(5, "pep = %p\n", pep);
DPRINTFN(5, "dwEpCtx0=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx0));
DPRINTFN(5, "dwEpCtx1=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx1));
DPRINTFN(5, "qwEpCtx2=0x%016llx\n", (long long)xhci_ctx_get_le64(sc, &pep->qwEpCtx2));
DPRINTFN(5, "dwEpCtx4=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx4));
DPRINTFN(5, "dwEpCtx5=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx5));
DPRINTFN(5, "dwEpCtx6=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx6));
DPRINTFN(5, "dwEpCtx7=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx7));
}
static void
xhci_dump_device(struct xhci_softc *sc, struct xhci_slot_ctx *psl)
{
DPRINTFN(5, "psl = %p\n", psl);
DPRINTFN(5, "dwSctx0=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx0));
DPRINTFN(5, "dwSctx1=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx1));
DPRINTFN(5, "dwSctx2=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx2));
DPRINTFN(5, "dwSctx3=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx3));
}
#endif
uint8_t
xhci_use_polling(void)
{
#ifdef USB_DEBUG
return (xhcipolling != 0);
#else
return (0);
#endif
}
static void
xhci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
uint16_t i;
cb(bus, &sc->sc_hw.root_pc, &sc->sc_hw.root_pg,
sizeof(struct xhci_hw_root), XHCI_PAGE_SIZE);
cb(bus, &sc->sc_hw.ctx_pc, &sc->sc_hw.ctx_pg,
sizeof(struct xhci_dev_ctx_addr), XHCI_PAGE_SIZE);
for (i = 0; i != sc->sc_noscratch; i++) {
cb(bus, &sc->sc_hw.scratch_pc[i], &sc->sc_hw.scratch_pg[i],
XHCI_PAGE_SIZE, XHCI_PAGE_SIZE);
}
}
static void
xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
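/*
 * With 64-byte contexts each 32-byte software context occupies a
 * 64-byte hardware slot; adding the 32-byte aligned page offset
 * of "ptr" to itself below doubles that offset and so maps the
 * software layout onto the hardware layout.
 */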
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset);
}
*ptr = htole32(val);
}
static uint32_t
xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset);
}
return (le32toh(*ptr));
}
static void
xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset);
}
*ptr = htole64(val);
}
#ifdef USB_DEBUG
static uint64_t
xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset);
}
return (le64toh(*ptr));
}
#endif
static int
xhci_reset_command_queue_locked(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
DPRINTF("\n");
temp = XREAD4(sc, oper, XHCI_CRCR_LO);
if (temp & XHCI_CRCR_LO_CRR) {
DPRINTF("Command ring running\n");
temp &= ~(XHCI_CRCR_LO_CS | XHCI_CRCR_LO_CA);
/*
* Try to abort the last command as per section
* 4.6.1.2 "Aborting a Command" of the XHCI
* specification:
*/
/* stop and cancel */
XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CS);
XWRITE4(sc, oper, XHCI_CRCR_HI, 0);
XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CA);
XWRITE4(sc, oper, XHCI_CRCR_HI, 0);
/* wait 250ms */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 4);
/* check if command ring is still running */
temp = XREAD4(sc, oper, XHCI_CRCR_LO);
if (temp & XHCI_CRCR_LO_CRR) {
DPRINTF("Comand ring still running\n");
return (USB_ERR_IOERROR);
}
}
/* reset command ring */
sc->sc_command_ccs = 1;
sc->sc_command_idx = 0;
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
/* set up command ring control base address */
addr = buf_res.physaddr;
phwr = buf_res.buffer;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0];
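/*
 * offsetof()-style arithmetic: advance the physical address to
 * the hwr_commands[] array inside the hardware root page.
 */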
DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr);
memset(phwr->hwr_commands, 0, sizeof(phwr->hwr_commands));
phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr);
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS);
XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32));
return (0);
}
usb_error_t
xhci_start_controller(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
struct xhci_dev_ctx_addr *pdctxa;
usb_error_t err;
uint64_t addr;
uint32_t temp;
uint16_t i;
DPRINTF("\n");
sc->sc_event_ccs = 1;
sc->sc_event_idx = 0;
sc->sc_command_ccs = 1;
sc->sc_command_idx = 0;
err = xhci_reset_controller(sc);
if (err)
return (err);
/* set up number of device slots */
DPRINTF("CONFIG=0x%08x -> 0x%08x\n",
XREAD4(sc, oper, XHCI_CONFIG), sc->sc_noslot);
XWRITE4(sc, oper, XHCI_CONFIG, sc->sc_noslot);
temp = XREAD4(sc, oper, XHCI_USBSTS);
/* clear interrupts */
XWRITE4(sc, oper, XHCI_USBSTS, temp);
/* disable all device notifications */
XWRITE4(sc, oper, XHCI_DNCTRL, 0);
/* set up device context base address */
usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res);
pdctxa = buf_res.buffer;
memset(pdctxa, 0, sizeof(*pdctxa));
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_dev_ctx_addr *)0)->qwSpBufPtr[0];
/* slot 0 points to the table of scratchpad pointers */
pdctxa->qwBaaDevCtxAddr[0] = htole64(addr);
for (i = 0; i != sc->sc_noscratch; i++) {
struct usb_page_search buf_scp;
usbd_get_page(&sc->sc_hw.scratch_pc[i], 0, &buf_scp);
pdctxa->qwSpBufPtr[i] = htole64((uint64_t)buf_scp.physaddr);
}
addr = buf_res.physaddr;
XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr);
XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32));
XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr);
XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32));
/* set up event table size */
DPRINTF("ERSTSZ=0x%08x -> 0x%08x\n",
XREAD4(sc, runt, XHCI_ERSTSZ(0)), sc->sc_erst_max);
XWRITE4(sc, runt, XHCI_ERSTSZ(0), XHCI_ERSTS_SET(sc->sc_erst_max));
/* set up interrupt rate */
XWRITE4(sc, runt, XHCI_IMOD(0), sc->sc_imod_default);
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[0];
/* reset hardware root structure */
memset(phwr, 0, sizeof(*phwr));
phwr->hwr_ring_seg[0].qwEvrsTablePtr = htole64(addr);
phwr->hwr_ring_seg[0].dwEvrsTableSize = htole32(XHCI_MAX_EVENTS);
DPRINTF("ERDP(0)=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32));
addr = buf_res.physaddr;
DPRINTF("ERSTBA(0)=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, runt, XHCI_ERSTBA_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERSTBA_HI(0), (uint32_t)(addr >> 32));
/* set up interrupter registers */
temp = XREAD4(sc, runt, XHCI_IMAN(0));
temp |= XHCI_IMAN_INTR_ENA;
XWRITE4(sc, runt, XHCI_IMAN(0), temp);
/* set up command ring control base address */
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0];
DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS);
XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32));
phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr);
usb_bus_mem_flush_all(&sc->sc_bus, &xhci_iterate_hw_softc);
/* Go! */
XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_RS |
XHCI_CMD_INTE | XHCI_CMD_HSEE);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH;
if (!temp)
break;
}
if (temp) {
XWRITE4(sc, oper, XHCI_USBCMD, 0);
device_printf(sc->sc_bus.parent, "Run timeout.\n");
return (USB_ERR_IOERROR);
}
/* catch any lost interrupts */
xhci_do_poll(&sc->sc_bus);
if (sc->sc_port_route != NULL) {
/* Route all ports to the XHCI by default */
sc->sc_port_route(sc->sc_bus.parent,
~xhciroute, xhciroute);
}
return (0);
}
usb_error_t
xhci_halt_controller(struct xhci_softc *sc)
{
uint32_t temp;
uint16_t i;
DPRINTF("\n");
sc->sc_capa_off = 0;
sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH);
sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0xF;
sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;
/* Halt controller */
XWRITE4(sc, oper, XHCI_USBCMD, 0);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH;
if (temp)
break;
}
if (!temp) {
device_printf(sc->sc_bus.parent, "Controller halt timeout.\n");
return (USB_ERR_IOERROR);
}
return (0);
}
usb_error_t
xhci_reset_controller(struct xhci_softc *sc)
{
uint32_t temp = 0;
uint16_t i;
DPRINTF("\n");
/* Reset controller */
XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_HCRST);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = (XREAD4(sc, oper, XHCI_USBCMD) & XHCI_CMD_HCRST) |
(XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_CNR);
if (!temp)
break;
}
if (temp) {
device_printf(sc->sc_bus.parent, "Controller "
"reset timeout.\n");
return (USB_ERR_IOERROR);
}
return (0);
}
usb_error_t
xhci_init(struct xhci_softc *sc, device_t self, uint8_t dma32)
{
uint32_t temp;
DPRINTF("\n");
/* initialize some bus fields */
sc->sc_bus.parent = self;
/* set the bus revision */
sc->sc_bus.usbrev = USB_REV_3_0;
/* set up the bus struct */
sc->sc_bus.methods = &xhci_bus_methods;
/* set up devices array */
sc->sc_bus.devices = sc->sc_devices;
sc->sc_bus.devices_max = XHCI_MAX_DEVICES;
/* set default cycle state in case of early interrupts */
sc->sc_event_ccs = 1;
sc->sc_command_ccs = 1;
/* set up bus space offsets */
sc->sc_capa_off = 0;
sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH);
sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0x1F;
sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;
DPRINTF("CAPLENGTH=0x%x\n", sc->sc_oper_off);
DPRINTF("RUNTIMEOFFSET=0x%x\n", sc->sc_runt_off);
DPRINTF("DOOROFFSET=0x%x\n", sc->sc_door_off);
DPRINTF("xHCI version = 0x%04x\n", XREAD2(sc, capa, XHCI_HCIVERSION));
if (!(XREAD4(sc, oper, XHCI_PAGESIZE) & XHCI_PAGESIZE_4K)) {
device_printf(sc->sc_bus.parent, "Controller does "
"not support 4K page size.\n");
return (ENXIO);
}
temp = XREAD4(sc, capa, XHCI_HCSPARAMS0);
DPRINTF("HCS0 = 0x%08x\n", temp);
/* set up context size */
if (XHCI_HCS0_CSZ(temp)) {
sc->sc_ctx_is_64_byte = 1;
} else {
sc->sc_ctx_is_64_byte = 0;
}
/* get DMA bits */
sc->sc_bus.dma_bits = (XHCI_HCS0_AC64(temp) &&
xhcidma32 == 0 && dma32 == 0) ? 64 : 32;
device_printf(self, "%d bytes context size, %d-bit DMA\n",
sc->sc_ctx_is_64_byte ? 64 : 32, (int)sc->sc_bus.dma_bits);
temp = XREAD4(sc, capa, XHCI_HCSPARAMS1);
/* get number of device slots */
sc->sc_noport = XHCI_HCS1_N_PORTS(temp);
if (sc->sc_noport == 0) {
device_printf(sc->sc_bus.parent, "Invalid number "
"of ports: %u\n", sc->sc_noport);
return (ENXIO);
}
sc->sc_noport = sc->sc_noport;
sc->sc_noslot = XHCI_HCS1_DEVSLOT_MAX(temp);
DPRINTF("Max slots: %u\n", sc->sc_noslot);
if (sc->sc_noslot > XHCI_MAX_DEVICES)
sc->sc_noslot = XHCI_MAX_DEVICES;
temp = XREAD4(sc, capa, XHCI_HCSPARAMS2);
DPRINTF("HCS2=0x%08x\n", temp);
/* get number of scratchpads */
sc->sc_noscratch = XHCI_HCS2_SPB_MAX(temp);
if (sc->sc_noscratch > XHCI_MAX_SCRATCHPADS) {
device_printf(sc->sc_bus.parent, "XHCI request "
"too many scratchpads\n");
return (ENOMEM);
}
DPRINTF("Max scratch: %u\n", sc->sc_noscratch);
/* get event table size */
sc->sc_erst_max = 1U << XHCI_HCS2_ERST_MAX(temp);
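/* the ERST Max field is encoded as a power-of-two exponent, hence the shift */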
if (sc->sc_erst_max > XHCI_MAX_RSEG)
sc->sc_erst_max = XHCI_MAX_RSEG;
temp = XREAD4(sc, capa, XHCI_HCSPARAMS3);
/* get maximum exit latency */
sc->sc_exit_lat_max = XHCI_HCS3_U1_DEL(temp) +
XHCI_HCS3_U2_DEL(temp) + 250 /* us */;
/* Check if we should use the default IMOD value. */
if (sc->sc_imod_default == 0)
sc->sc_imod_default = XHCI_IMOD_DEFAULT;
/* get all DMA memory */
if (usb_bus_mem_alloc_all(&sc->sc_bus,
USB_GET_DMA_TAG(self), &xhci_iterate_hw_softc)) {
return (ENOMEM);
}
/* set up command queue mutex and condition variable */
cv_init(&sc->sc_cmd_cv, "CMDQ");
sx_init(&sc->sc_cmd_sx, "CMDQ lock");
sc->sc_config_msg[0].hdr.pm_callback = &xhci_configure_msg;
sc->sc_config_msg[0].bus = &sc->sc_bus;
sc->sc_config_msg[1].hdr.pm_callback = &xhci_configure_msg;
sc->sc_config_msg[1].bus = &sc->sc_bus;
return (0);
}
void
xhci_uninit(struct xhci_softc *sc)
{
/*
* NOTE: At this point the control transfer process is gone
* and "xhci_configure_msg" is no longer called. Consequently
* waiting for the configuration messages to complete is not
* needed.
*/
usb_bus_mem_free_all(&sc->sc_bus, &xhci_iterate_hw_softc);
cv_destroy(&sc->sc_cmd_cv);
sx_destroy(&sc->sc_cmd_sx);
}
static void
xhci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
DPRINTF("Stopping the XHCI\n");
xhci_halt_controller(sc);
xhci_reset_controller(sc);
break;
case USB_HW_POWER_SHUTDOWN:
DPRINTF("Stopping the XHCI\n");
xhci_halt_controller(sc);
xhci_reset_controller(sc);
break;
case USB_HW_POWER_RESUME:
DPRINTF("Starting the XHCI\n");
xhci_start_controller(sc);
break;
default:
break;
}
}
static usb_error_t
xhci_generic_done_sub(struct usb_xfer *xfer)
{
struct xhci_td *td;
struct xhci_td *td_alt_next;
uint32_t len;
uint8_t status;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
if (xfer->aframes != xfer->nframes)
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
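/*
* Walk the chain of TDs belonging to the current USB frame,
* adding the actual transfer length to "frlengths[]". The walk
* stops at the last TD, at the first TD reporting an error, or
* at a short packet; a change of "alt_next" marks the boundary
* between USB frames.
*/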
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = td->status;
len = td->remainder;
DPRINTFN(4, "xfer=%p[%u/%u] rem=%u/%u status=%u\n",
xfer, (unsigned int)xfer->aframes,
(unsigned int)xfer->nframes,
(unsigned int)len, (unsigned int)td->len,
(unsigned int)status);
/*
* Verify the status length and
* add the length to "frlengths[]":
*/
if (len > td->len) {
/* should not happen */
DPRINTF("Invalid status length, "
"0x%04x/0x%04x bytes\n", len, td->len);
status = XHCI_TRB_ERROR_LENGTH;
} else if (xfer->aframes != xfer->nframes) {
xfer->frlengths[xfer->aframes] += td->len - len;
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check for transfer error */
if (status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (len > 0) {
if (xfer->flags_int.short_frames_ok ||
xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
return ((status == XHCI_TRB_ERROR_STALL) ? USB_ERR_STALLED :
(status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS) ? USB_ERR_IOERROR :
USB_ERR_NORMAL_COMPLETION);
}
static void
xhci_generic_done(struct usb_xfer *xfer)
{
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
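/*
* For control transfers the SETUP frame is accounted for first,
* then any DATA frames, and finally the STATUS stage once
* "control_act" has been cleared.
*/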
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr)
err = xhci_generic_done_sub(xfer);
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL)
goto done;
}
while (xfer->aframes != xfer->nframes) {
err = xhci_generic_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL)
goto done;
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act)
err = xhci_generic_done_sub(xfer);
done:
/* transfer is complete */
xhci_device_done(xfer, err);
}
static void
xhci_activate_transfer(struct usb_xfer *xfer)
{
struct xhci_td *td;
td = xfer->td_transfer_cache;
usb_pc_cpu_invalidate(td->page_cache);
if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) {
/* activate the transfer */
td->td_trb[0].dwTrb3 |= htole32(XHCI_TRB_3_CYCLE_BIT);
usb_pc_cpu_flush(td->page_cache);
xhci_endpoint_doorbell(xfer);
}
}
static void
xhci_skip_transfer(struct usb_xfer *xfer)
{
struct xhci_td *td;
struct xhci_td *td_last;
td = xfer->td_transfer_cache;
td_last = xfer->td_transfer_last;
td = td->alt_next;
usb_pc_cpu_invalidate(td->page_cache);
if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) {
usb_pc_cpu_invalidate(td_last->page_cache);
/* copy LINK TRB to current waiting location */
td->td_trb[0].qwTrb0 = td_last->td_trb[td_last->ntrb].qwTrb0;
td->td_trb[0].dwTrb2 = td_last->td_trb[td_last->ntrb].dwTrb2;
usb_pc_cpu_flush(td->page_cache);
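/*
* The dwTrb3 field, which holds the cycle bit, is written after
* a flush of the other fields so the controller never sees a
* half-written LINK TRB.
*/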
td->td_trb[0].dwTrb3 = td_last->td_trb[td_last->ntrb].dwTrb3;
usb_pc_cpu_flush(td->page_cache);
xhci_endpoint_doorbell(xfer);
}
}
/*------------------------------------------------------------------------*
* xhci_check_transfer
*------------------------------------------------------------------------*/
static void
xhci_check_transfer(struct xhci_softc *sc, struct xhci_trb *trb)
{
struct xhci_endpoint_ext *pepext;
int64_t offset;
uint64_t td_event;
uint32_t temp;
uint32_t remainder;
uint16_t stream_id;
uint16_t i;
uint8_t status;
uint8_t halted;
uint8_t epno;
uint8_t index;
/* decode TRB */
td_event = le64toh(trb->qwTrb0);
temp = le32toh(trb->dwTrb2);
remainder = XHCI_TRB_2_REM_GET(temp);
status = XHCI_TRB_2_ERROR_GET(temp);
stream_id = XHCI_TRB_2_STREAM_GET(temp);
temp = le32toh(trb->dwTrb3);
epno = XHCI_TRB_3_EP_GET(temp);
index = XHCI_TRB_3_SLOT_GET(temp);
/* check if error means halted */
halted = (status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS);
DPRINTF("slot=%u epno=%u stream=%u remainder=%u status=%u\n",
index, epno, stream_id, remainder, status);
if (index > sc->sc_noslot) {
DPRINTF("Invalid slot.\n");
return;
}
if ((epno == 0) || (epno >= XHCI_MAX_ENDPOINTS)) {
DPRINTF("Invalid endpoint.\n");
return;
}
pepext = &sc->sc_hw.devs[index].endp[epno];
if (pepext->trb_ep_mode != USB_EP_MODE_STREAMS) {
stream_id = 0;
DPRINTF("stream_id=0\n");
} else if (stream_id >= XHCI_MAX_STREAMS) {
DPRINTF("Invalid stream ID.\n");
return;
}
/* try to find the USB transfer that generated the event */
for (i = 0; i != (XHCI_MAX_TRANSFERS - 1); i++) {
struct usb_xfer *xfer;
struct xhci_td *td;
xfer = pepext->xfer[i + (XHCI_MAX_TRANSFERS * stream_id)];
if (xfer == NULL)
continue;
td = xfer->td_transfer_cache;
DPRINTFN(5, "Checking if 0x%016llx == (0x%016llx .. 0x%016llx)\n",
(long long)td_event,
(long long)td->td_self,
(long long)td->td_self + sizeof(td->td_trb));
/*
* NOTE: Some XHCI implementations might not trigger
* an event on the last LINK TRB so we need to
* consider both the last and second last event
* address as conditions for a successful transfer.
*
* NOTE: We assume that the XHCI will only trigger one
* event per chain of TRBs.
*/
offset = td_event - td->td_self;
if (offset >= 0 &&
offset < (int64_t)sizeof(td->td_trb)) {
usb_pc_cpu_invalidate(td->page_cache);
/* compute rest of remainder, if any */
for (i = (offset / 16) + 1; i < td->ntrb; i++) {
temp = le32toh(td->td_trb[i].dwTrb2);
remainder += XHCI_TRB_2_BYTES_GET(temp);
}
DPRINTFN(5, "New remainder: %u\n", remainder);
/* clear isochronous transfer errors */
if (xfer->flags_int.isochronous_xfr) {
if (halted) {
halted = 0;
status = XHCI_TRB_ERROR_SUCCESS;
remainder = td->len;
}
}
/* "td->remainder" is verified later */
td->remainder = remainder;
td->status = status;
usb_pc_cpu_flush(td->page_cache);
/*
* 1) Last transfer descriptor makes the
* transfer done
*/
if (((void *)td) == xfer->td_transfer_last) {
DPRINTF("TD is last\n");
xhci_generic_done(xfer);
break;
}
/*
* 2) Any kind of error makes the transfer
* done
*/
if (halted) {
DPRINTF("TD has I/O error\n");
xhci_generic_done(xfer);
break;
}
/*
* 3) If there is no alternate next transfer,
* a short packet also makes the transfer done
*/
if (td->remainder > 0) {
if (td->alt_next == NULL) {
DPRINTF(
"short TD has no alternate next\n");
xhci_generic_done(xfer);
break;
}
DPRINTF("TD has short pkt\n");
if (xfer->flags_int.short_frames_ok ||
xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr) {
/* follow the alt next */
xfer->td_transfer_cache = td->alt_next;
xhci_activate_transfer(xfer);
break;
}
xhci_skip_transfer(xfer);
xhci_generic_done(xfer);
break;
}
/*
* 4) Transfer complete - go to next TD
*/
DPRINTF("Following next TD\n");
xfer->td_transfer_cache = td->obj_next;
xhci_activate_transfer(xfer);
break; /* there should only be one match */
}
}
}
static int
xhci_check_command(struct xhci_softc *sc, struct xhci_trb *trb)
{
if (sc->sc_cmd_addr == trb->qwTrb0) {
DPRINTF("Received command event\n");
sc->sc_cmd_result[0] = trb->dwTrb2;
sc->sc_cmd_result[1] = trb->dwTrb3;
cv_signal(&sc->sc_cmd_cv);
return (1); /* command match */
}
return (0);
}
static int
xhci_interrupt_poll(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
int retval = 0;
uint16_t i;
uint8_t event;
uint8_t j;
uint8_t k;
uint8_t t;
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
/* Receive any events */
usb_pc_cpu_invalidate(&sc->sc_hw.root_pc);
i = sc->sc_event_idx;
j = sc->sc_event_ccs;
t = 2;
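/*
* Walk the event ring using the xHCI consumer scheme: an event
* TRB is only valid while its cycle bit matches the consumer
* cycle state "j". When the dequeue index "i" wraps around, "j"
* is toggled. The counter "t" limits the scan to two full
* passes of the ring.
*/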
while (1) {
temp = le32toh(phwr->hwr_events[i].dwTrb3);
k = (temp & XHCI_TRB_3_CYCLE_BIT) ? 1 : 0;
if (j != k)
break;
event = XHCI_TRB_3_TYPE_GET(temp);
DPRINTFN(10, "event[%u] = %u (0x%016llx 0x%08lx 0x%08lx)\n",
i, event, (long long)le64toh(phwr->hwr_events[i].qwTrb0),
(long)le32toh(phwr->hwr_events[i].dwTrb2),
(long)le32toh(phwr->hwr_events[i].dwTrb3));
switch (event) {
case XHCI_TRB_EVENT_TRANSFER:
xhci_check_transfer(sc, &phwr->hwr_events[i]);
break;
case XHCI_TRB_EVENT_CMD_COMPLETE:
retval |= xhci_check_command(sc, &phwr->hwr_events[i]);
break;
default:
DPRINTF("Unhandled event = %u\n", event);
break;
}
i++;
if (i == XHCI_MAX_EVENTS) {
i = 0;
j ^= 1;
/* check for timeout */
if (!--t)
break;
}
}
sc->sc_event_idx = i;
sc->sc_event_ccs = j;
/*
* NOTE: The Event Ring Dequeue Pointer Register is 64-bit
* latched. That means to activate the register we need to
* write both the low and high double word of the 64-bit
* register.
*/
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[i];
/* try to clear busy bit */
addr |= XHCI_ERDP_LO_BUSY;
XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32));
return (retval);
}
static usb_error_t
xhci_do_command(struct xhci_softc *sc, struct xhci_trb *trb,
uint16_t timeout_ms)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
uint8_t i;
uint8_t j;
uint8_t timeout = 0;
int err;
XHCI_CMD_ASSERT_LOCKED(sc);
/* get hardware root structure */
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
/* Queue command */
USB_BUS_LOCK(&sc->sc_bus);
retry:
i = sc->sc_command_idx;
j = sc->sc_command_ccs;
DPRINTFN(10, "command[%u] = %u (0x%016llx, 0x%08lx, 0x%08lx)\n",
i, XHCI_TRB_3_TYPE_GET(le32toh(trb->dwTrb3)),
(long long)le64toh(trb->qwTrb0),
(long)le32toh(trb->dwTrb2),
(long)le32toh(trb->dwTrb3));
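/*
* Copy the command TRB into the ring. The qwTrb0 and dwTrb2
* fields are written and flushed first; dwTrb3, which carries
* the producer cycle bit, is written last so the controller
* never sees a partially written command. The last ring slot is
* reserved for a LINK TRB with the toggle-cycle bit, at which
* point the producer index wraps and the cycle state flips.
*/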
phwr->hwr_commands[i].qwTrb0 = trb->qwTrb0;
phwr->hwr_commands[i].dwTrb2 = trb->dwTrb2;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
temp = trb->dwTrb3;
if (j)
temp |= htole32(XHCI_TRB_3_CYCLE_BIT);
else
temp &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
temp &= ~htole32(XHCI_TRB_3_TC_BIT);
phwr->hwr_commands[i].dwTrb3 = temp;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[i];
sc->sc_cmd_addr = htole64(addr);
i++;
if (i == (XHCI_MAX_COMMANDS - 1)) {
if (j) {
temp = htole32(XHCI_TRB_3_TC_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) |
XHCI_TRB_3_CYCLE_BIT);
} else {
temp = htole32(XHCI_TRB_3_TC_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
}
phwr->hwr_commands[i].dwTrb3 = temp;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
i = 0;
j ^= 1;
}
sc->sc_command_idx = i;
sc->sc_command_ccs = j;
XWRITE4(sc, door, XHCI_DOORBELL(0), 0);
err = cv_timedwait(&sc->sc_cmd_cv, &sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(timeout_ms));
/*
* In some error cases event interrupts are not generated.
* Poll one time to see if the command has completed.
*/
if (err != 0 && xhci_interrupt_poll(sc) != 0) {
DPRINTF("Command was completed when polling\n");
err = 0;
}
if (err != 0) {
DPRINTF("Command timeout!\n");
/*
* After some weeks of continuous operation, it has
* been observed that the ASMedia Technology ASM1042
* SuperSpeed USB Host Controller can suddenly stop
* accepting commands via the command queue. Try to
* first reset the command queue. If that fails do a
* host controller reset.
*/
if (timeout == 0 &&
xhci_reset_command_queue_locked(sc) == 0) {
temp = le32toh(trb->dwTrb3);
/*
* Avoid infinite XHCI reset loops if the set
* address command fails to respond due to a
* non-enumerating device:
*/
if (XHCI_TRB_3_TYPE_GET(temp) == XHCI_TRB_TYPE_ADDRESS_DEVICE &&
(temp & XHCI_TRB_3_BSR_BIT) == 0) {
DPRINTF("Set address timeout\n");
} else {
timeout = 1;
goto retry;
}
} else {
DPRINTF("Controller reset!\n");
usb_bus_reset_async_locked(&sc->sc_bus);
}
err = USB_ERR_TIMEOUT;
trb->dwTrb2 = 0;
trb->dwTrb3 = 0;
} else {
temp = le32toh(sc->sc_cmd_result[0]);
if (XHCI_TRB_2_ERROR_GET(temp) != XHCI_TRB_ERROR_SUCCESS)
err = USB_ERR_IOERROR;
trb->dwTrb2 = sc->sc_cmd_result[0];
trb->dwTrb3 = sc->sc_cmd_result[1];
}
USB_BUS_UNLOCK(&sc->sc_bus);
return (err);
}
#if 0
static usb_error_t
xhci_cmd_nop(struct xhci_softc *sc)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NOOP);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
#endif
static usb_error_t
xhci_cmd_enable_slot(struct xhci_softc *sc, uint8_t *pslot)
{
struct xhci_trb trb;
uint32_t temp;
usb_error_t err;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
trb.dwTrb3 = htole32(XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ENABLE_SLOT));
err = xhci_do_command(sc, &trb, 100 /* ms */);
if (err)
goto done;
temp = le32toh(trb.dwTrb3);
*pslot = XHCI_TRB_3_SLOT_GET(temp);
done:
return (err);
}
static usb_error_t
xhci_cmd_disable_slot(struct xhci_softc *sc, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DISABLE_SLOT) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_set_address(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t bsr, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ADDRESS_DEVICE) |
XHCI_TRB_3_SLOT_SET(slot_id);
if (bsr)
temp |= XHCI_TRB_3_BSR_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 500 /* ms */));
}
static usb_error_t
xhci_set_address(struct usb_device *udev, struct mtx *mtx, uint16_t address)
{
struct usb_page_search buf_inp;
struct usb_page_search buf_dev;
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_hw_dev *hdev;
struct xhci_dev_ctx *pdev;
struct xhci_endpoint_ext *pepext;
uint32_t temp;
uint16_t mps;
usb_error_t err;
uint8_t index;
/* the root HUB case is not handled here */
if (udev->parent_hub == NULL)
return (USB_ERR_INVAL);
index = udev->controller_slot_id;
hdev = &sc->sc_hw.devs[index];
if (mtx != NULL)
mtx_unlock(mtx);
XHCI_CMD_LOCK(sc);
switch (hdev->state) {
case XHCI_ST_DEFAULT:
case XHCI_ST_ENABLED:
hdev->state = XHCI_ST_ENABLED;
/* set configure mask to slot and EP0 */
xhci_configure_mask(udev, 3, 0);
/* configure input slot context structure */
err = xhci_configure_device(udev);
if (err != 0) {
DPRINTF("Could not configure device\n");
break;
}
/* configure input endpoint context structure */
switch (udev->speed) {
case USB_SPEED_LOW:
case USB_SPEED_FULL:
mps = 8;
break;
case USB_SPEED_HIGH:
mps = 64;
break;
default:
mps = 512;
break;
}
pepext = xhci_get_endpoint_ext(udev,
&udev->ctrl_ep_desc);
/* ensure the control endpoint is setup again */
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
err = xhci_configure_endpoint(udev,
&udev->ctrl_ep_desc, pepext,
0, 1, 1, 0, mps, mps, USB_EP_MODE_DEFAULT);
if (err != 0) {
DPRINTF("Could not configure default endpoint\n");
break;
}
/* execute set address command */
usbd_get_page(&hdev->input_pc, 0, &buf_inp);
err = xhci_cmd_set_address(sc, buf_inp.physaddr,
(address == 0), index);
if (err != 0) {
temp = le32toh(sc->sc_cmd_result[0]);
if (address == 0 && sc->sc_port_route != NULL &&
XHCI_TRB_2_ERROR_GET(temp) ==
XHCI_TRB_ERROR_PARAMETER) {
/* LynxPoint XHCI - ports are not switchable */
/* Un-route all ports from the XHCI */
sc->sc_port_route(sc->sc_bus.parent, 0, ~0);
}
DPRINTF("Could not set address "
"for slot %u.\n", index);
if (address != 0)
break;
}
/* update device address to new value */
usbd_get_page(&hdev->device_pc, 0, &buf_dev);
pdev = buf_dev.buffer;
usb_pc_cpu_invalidate(&hdev->device_pc);
temp = xhci_ctx_get_le32(sc, &pdev->ctx_slot.dwSctx3);
udev->address = XHCI_SCTX_3_DEV_ADDR_GET(temp);
/* update device state to new value */
if (address != 0)
hdev->state = XHCI_ST_ADDRESSED;
else
hdev->state = XHCI_ST_DEFAULT;
break;
default:
DPRINTF("Wrong state for set address.\n");
err = USB_ERR_IOERROR;
break;
}
XHCI_CMD_UNLOCK(sc);
if (mtx != NULL)
mtx_lock(mtx);
return (err);
}
static usb_error_t
xhci_cmd_configure_ep(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t deconfigure, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_CONFIGURE_EP) |
XHCI_TRB_3_SLOT_SET(slot_id);
if (deconfigure)
temp |= XHCI_TRB_3_DCEP_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_evaluate_ctx(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_EVALUATE_CTX) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_reset_ep(struct xhci_softc *sc, uint8_t preserve,
uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_EP) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
if (preserve)
temp |= XHCI_TRB_3_PRSV_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_set_tr_dequeue_ptr(struct xhci_softc *sc, uint64_t dequeue_ptr,
uint16_t stream_id, uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(dequeue_ptr);
temp = XHCI_TRB_2_STREAM_SET(stream_id);
trb.dwTrb2 = htole32(temp);
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SET_TR_DEQUEUE) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_stop_ep(struct xhci_softc *sc, uint8_t suspend,
uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STOP_EP) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
if (suspend)
temp |= XHCI_TRB_3_SUSP_EP_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_reset_dev(struct xhci_softc *sc, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_DEVICE) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
/*------------------------------------------------------------------------*
* xhci_interrupt - XHCI interrupt handler
*------------------------------------------------------------------------*/
void
xhci_interrupt(struct xhci_softc *sc)
{
uint32_t status;
uint32_t temp;
USB_BUS_LOCK(&sc->sc_bus);
status = XREAD4(sc, oper, XHCI_USBSTS);
/* acknowledge interrupts, if any */
if (status != 0) {
XWRITE4(sc, oper, XHCI_USBSTS, status);
DPRINTFN(16, "real interrupt (status=0x%08x)\n", status);
}
temp = XREAD4(sc, runt, XHCI_IMAN(0));
/* force clearing of pending interrupts */
if (temp & XHCI_IMAN_INTR_PEND)
XWRITE4(sc, runt, XHCI_IMAN(0), temp);
/* check for event(s) */
xhci_interrupt_poll(sc);
if (status & (XHCI_STS_PCD | XHCI_STS_HCH |
XHCI_STS_HSE | XHCI_STS_HCE)) {
if (status & XHCI_STS_PCD) {
xhci_root_intr(sc);
}
if (status & XHCI_STS_HCH) {
printf("%s: host controller halted\n",
__FUNCTION__);
}
if (status & XHCI_STS_HSE) {
printf("%s: host system error\n",
__FUNCTION__);
}
if (status & XHCI_STS_HCE) {
printf("%s: host controller error\n",
__FUNCTION__);
}
}
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*------------------------------------------------------------------------*
* xhci_timeout - XHCI timeout handler
*------------------------------------------------------------------------*/
static void
xhci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* the transfer has timed out */
xhci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
xhci_do_poll(struct usb_bus *bus)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
xhci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
static void
xhci_setup_generic_chain_sub(struct xhci_std_temp *temp)
{
struct usb_page_search buf_res;
struct xhci_td *td;
struct xhci_td *td_next;
struct xhci_td *td_alt_next;
struct xhci_td *td_first;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint32_t npkt_off;
uint32_t dword;
uint8_t shortpkt_old;
uint8_t precompute;
uint8_t x;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
npkt_off = 0;
precompute = 1;
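/*
* The TD chain is built in two passes. The first pass
* ("precompute") only determines how the transfer length is
* split across the available TDs. The second pass, entered by
* jumping back to "restart" with the original length restored,
* fills out the actual TRBs.
*/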
restart:
td = temp->td;
td_next = td_first = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt)
break;
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_packet_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL)
panic("%s: out of XHCI transfer descriptors!", __FUNCTION__);
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->len = average;
td->remainder = 0;
td->status = 0;
/* update remaining length */
temp->len -= average;
/* reset TRB index */
x = 0;
if (temp->trb_type == XHCI_TRB_TYPE_SETUP_STAGE) {
/* immediate data */
if (average > 8)
average = 8;
td->td_trb[0].qwTrb0 = 0;
usbd_copy_out(temp->pc, temp->offset + buf_offset,
(uint8_t *)(uintptr_t)&td->td_trb[0].qwTrb0,
average);
dword = XHCI_TRB_2_BYTES_SET(8) |
XHCI_TRB_2_TDSZ_SET(0) |
XHCI_TRB_2_IRQ_SET(0);
td->td_trb[0].dwTrb2 = htole32(dword);
dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SETUP_STAGE) |
XHCI_TRB_3_IDT_BIT | XHCI_TRB_3_CYCLE_BIT;
/* check wLength */
if (td->td_trb[0].qwTrb0 &
htole64(XHCI_TRB_0_WLENGTH_MASK)) {
if (td->td_trb[0].qwTrb0 &
htole64(XHCI_TRB_0_DIR_IN_MASK))
dword |= XHCI_TRB_3_TRT_IN;
else
dword |= XHCI_TRB_3_TRT_OUT;
}
td->td_trb[0].dwTrb3 = htole32(dword);
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
x++;
} else do {
uint32_t npkt;
/* fill out buffer pointers */
if (average == 0) {
memset(&buf_res, 0, sizeof(buf_res));
} else {
usbd_get_page(temp->pc, temp->offset +
buf_offset, &buf_res);
/* get length to end of page */
if (buf_res.length > average)
buf_res.length = average;
/* check for maximum length */
if (buf_res.length > XHCI_TD_PAGE_SIZE)
buf_res.length = XHCI_TD_PAGE_SIZE;
npkt_off += buf_res.length;
}
/* set up npkt */
npkt = howmany(len_old - npkt_off,
temp->max_packet_size);
if (npkt == 0)
npkt = 1;
else if (npkt > 31)
npkt = 31;
/* fill out TRBs */
td->td_trb[x].qwTrb0 =
htole64((uint64_t)buf_res.physaddr);
dword =
XHCI_TRB_2_BYTES_SET(buf_res.length) |
XHCI_TRB_2_TDSZ_SET(npkt) |
XHCI_TRB_2_IRQ_SET(0);
td->td_trb[x].dwTrb2 = htole32(dword);
switch (temp->trb_type) {
case XHCI_TRB_TYPE_ISOCH:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TBC_SET(temp->tbc) |
XHCI_TRB_3_TLBPC_SET(temp->tlbpc);
if (td != td_first) {
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL);
} else if (temp->do_isoc_sync != 0) {
temp->do_isoc_sync = 0;
/* wait until "isoc_frame" */
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) |
XHCI_TRB_3_FRID_SET(temp->isoc_frame / 8);
} else {
/* start data transfer at next interval */
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) |
XHCI_TRB_3_ISO_SIA_BIT;
}
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_ISP_BIT;
break;
case XHCI_TRB_TYPE_DATA_STAGE:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DATA_STAGE);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_DIR_IN | XHCI_TRB_3_ISP_BIT;
/*
* Section 3.2.9 in the XHCI
* specification about control
* transfers says that we should use a
* normal-TRB if there are more TRBs
* extending the data-stage
* TRB. Update the "trb_type".
*/
temp->trb_type = XHCI_TRB_TYPE_NORMAL;
break;
case XHCI_TRB_TYPE_STATUS_STAGE:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STATUS_STAGE);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_DIR_IN;
break;
default: /* XHCI_TRB_TYPE_NORMAL */
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_ISP_BIT;
break;
}
td->td_trb[x].dwTrb3 = htole32(dword);
average -= buf_res.length;
buf_offset += buf_res.length;
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
x++;
} while (average != 0);
td->td_trb[x-1].dwTrb3 |= htole32(XHCI_TRB_3_IOC_BIT);
/* store number of data TRBs */
td->ntrb = x;
DPRINTF("NTRB=%u\n", x);
/* fill out link TRB */
if (td_next != NULL) {
/* link the current TD with the next one */
td->td_trb[x].qwTrb0 = htole64((uint64_t)td_next->td_self);
DPRINTF("LINK=0x%08llx\n", (long long)td_next->td_self);
} else {
/* this field will get updated later */
DPRINTF("NOLINK\n");
}
dword = XHCI_TRB_2_IRQ_SET(0);
td->td_trb[x].dwTrb2 = htole32(dword);
dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) |
XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_IOC_BIT |
/*
* CHAIN-BIT: Ensure that a multi-TRB IN-endpoint
* frame only receives a single short packet event
* by setting the CHAIN bit in the LINK field. In
* addition some XHCI controllers have problems
* sending a ZLP unless the CHAIN-BIT is set in
* the LINK TRB.
*/
XHCI_TRB_3_CHAIN_BIT;
td->td_trb[x].dwTrb3 = htole32(dword);
td->alt_next = td_alt_next;
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* set up alt next pointer, if any */
if (temp->last_frame) {
td_alt_next = NULL;
} else {
/* we use this field internally */
td_alt_next = td_next;
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
/*
* Remove the cycle bit from the first TRB if we are stepping
* the TDs, so that each TD must be activated explicitly:
*/
if (temp->step_td != 0) {
td_first->td_trb[0].dwTrb3 &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
usb_pc_cpu_flush(td_first->page_cache);
}
/* clear the TD SIZE field because this is the last data TRB */
/* remove the chain bit because this is the last data TRB in the chain */
td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(15));
td->td_trb[td->ntrb - 1].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
/* remove CHAIN-BIT from last LINK TRB */
td->td_trb[td->ntrb].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
usb_pc_cpu_flush(td->page_cache);
temp->td = td;
temp->td_next = td_next;
}
static void
xhci_setup_generic_chain(struct usb_xfer *xfer)
{
struct xhci_std_temp temp;
struct xhci_td *td;
uint32_t x;
uint32_t y;
uint8_t mult;
temp.do_isoc_sync = 0;
temp.step_td = 0;
temp.tbc = 0;
temp.tlbpc = 0;
temp.average = xfer->max_hc_frame_size;
temp.max_packet_size = xfer->max_packet_size;
temp.sc = XHCI_BUS2SC(xfer->xroot->bus);
temp.pc = NULL;
temp.last_frame = 0;
temp.offset = 0;
temp.multishort = xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr ||
xfer->flags_int.short_frames_ok;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
temp.td = NULL;
temp.td_next = td;
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
if (xfer->flags_int.isochronous_xfr) {
uint8_t shift;
/* compute multiplier for ISOCHRONOUS transfers */
mult = xfer->endpoint->ecomp ?
UE_GET_SS_ISO_MULT(xfer->endpoint->ecomp->bmAttributes)
: 0;
/* check for USB 2.0 multiplier */
if (mult == 0) {
mult = (xfer->endpoint->edesc->
wMaxPacketSize[1] >> 3) & 3;
}
/* range check */
if (mult > 2)
mult = 3;
else
mult++;
x = XREAD4(temp.sc, runt, XHCI_MFINDEX);
DPRINTF("MFINDEX=0x%08x\n", x);
switch (usbd_get_speed(xfer->xroot->udev)) {
case USB_SPEED_FULL:
shift = 3;
temp.isoc_delta = 8; /* 1ms */
x += temp.isoc_delta - 1;
x &= ~(temp.isoc_delta - 1);
break;
default:
shift = usbd_xfer_get_fps_shift(xfer);
temp.isoc_delta = 1U << shift;
x += temp.isoc_delta - 1;
x &= ~(temp.isoc_delta - 1);
/* simple frame load balancing */
x += xfer->endpoint->usb_uframe;
break;
}
y = XHCI_MFINDEX_GET(x - xfer->endpoint->isoc_next);
if ((xfer->endpoint->is_synced == 0) ||
(y < (xfer->nframes << shift)) ||
(XHCI_MFINDEX_GET(-y) >= (128 * 8))) {
/*
* If there is data underflow or the pipe
* queue is empty we schedule the transfer a
* few frames ahead of the current frame
* position. Else two isochronous transfers
* might overlap.
*/
xfer->endpoint->isoc_next = XHCI_MFINDEX_GET(x + (3 * 8));
xfer->endpoint->is_synced = 1;
temp.do_isoc_sync = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/* compute isochronous completion time */
y = XHCI_MFINDEX_GET(xfer->endpoint->isoc_next - (x & ~7));
xfer->isoc_time_complete =
usb_isoc_time_expand(&temp.sc->sc_bus, x / 8) +
(y / 8) + (((xfer->nframes << shift) + 7) / 8);
x = 0;
temp.isoc_frame = xfer->endpoint->isoc_next;
temp.trb_type = XHCI_TRB_TYPE_ISOCH;
xfer->endpoint->isoc_next += xfer->nframes << shift;
} else if (xfer->flags_int.control_xfr) {
/* check if we should prepend a setup message */
if (xfer->flags_int.control_hdr) {
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
temp.trb_type = XHCI_TRB_TYPE_SETUP_STAGE;
temp.direction = 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act)
temp.last_frame = 1;
}
xhci_setup_generic_chain_sub(&temp);
}
x = 1;
mult = 1;
temp.isoc_delta = 0;
temp.isoc_frame = 0;
temp.trb_type = xfer->flags_int.control_did_data ?
XHCI_TRB_TYPE_NORMAL : XHCI_TRB_TYPE_DATA_STAGE;
} else {
x = 0;
mult = 1;
temp.isoc_delta = 0;
temp.isoc_frame = 0;
temp.trb_type = XHCI_TRB_TYPE_NORMAL;
}
if (x != xfer->nframes) {
/* set up page_cache pointer */
temp.pc = xfer->frbuffers + x;
/* set endpoint direction */
temp.direction = UE_GET_DIR(xfer->endpointno);
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.step_td = ((xfer->endpointno & UE_DIR_IN) &&
x != 0 && temp.multishort == 0);
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act)
temp.last_frame = 1;
} else {
temp.last_frame = 1;
}
}
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
temp.tbc = 0;
temp.tlbpc = mult - 1;
} else if (xfer->flags_int.isochronous_xfr) {
uint8_t tdpc;
/*
* Isochronous transfers don't have short
* packet termination:
*/
temp.shortpkt = 1;
/* isochronous transfers have a transfer limit */
if (temp.len > xfer->max_frame_size)
temp.len = xfer->max_frame_size;
/* compute TD packet count */
tdpc = howmany(temp.len, xfer->max_packet_size);
temp.tbc = howmany(tdpc, mult) - 1;
temp.tlbpc = (tdpc % mult);
if (temp.tlbpc == 0)
temp.tlbpc = mult - 1;
else
temp.tlbpc--;
} else {
/* regular data transfer */
temp.shortpkt = xfer->flags.force_short_xfer ? 0 : 1;
}
xhci_setup_generic_chain_sub(&temp);
if (xfer->flags_int.isochronous_xfr) {
temp.offset += xfer->frlengths[x - 1];
temp.isoc_frame += temp.isoc_delta;
} else {
/* get next Page Cache pointer */
temp.pc = xfer->frbuffers + x;
}
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current
* endpoint direction.
*/
#ifdef XHCI_STEP_STATUS_STAGE
temp.step_td = (xfer->nframes != 0);
#else
temp.step_td = 0;
#endif
temp.direction = UE_GET_DIR(xfer->endpointno) ^ UE_DIR_IN;
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.trb_type = XHCI_TRB_TYPE_STATUS_STAGE;
xhci_setup_generic_chain_sub(&temp);
}
td = temp.td;
/* must have at least one frame! */
xfer->td_transfer_last = td;
DPRINTF("first=%p last=%p\n", xfer->td_transfer_first, td);
}
static void
xhci_set_slot_pointer(struct xhci_softc *sc, uint8_t index, uint64_t dev_addr)
{
struct usb_page_search buf_res;
struct xhci_dev_ctx_addr *pdctxa;
usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res);
pdctxa = buf_res.buffer;
DPRINTF("addr[%u]=0x%016llx\n", index, (long long)dev_addr);
pdctxa->qwBaaDevCtxAddr[index] = htole64(dev_addr);
usb_pc_cpu_flush(&sc->sc_hw.ctx_pc);
}
static usb_error_t
xhci_configure_mask(struct usb_device *udev, uint32_t mask, uint8_t drop)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
struct xhci_input_dev_ctx *pinp;
uint32_t temp;
uint8_t index;
uint8_t x;
index = udev->controller_slot_id;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
pinp = buf_inp.buffer;
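/*
* In the xHCI input control context, "dwInCtx0" is the drop
* context mask and "dwInCtx1" is the add context mask. When
* adding endpoints the non-control contexts are also dropped
* first, to force re-initialisation, and the slot context's
* context entries field is updated to cover the highest
* endpoint in use.
*/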
if (drop) {
mask &= XHCI_INCTX_NON_CTRL_MASK;
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask);
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, 0);
} else {
/*
* Some hardware requires that we drop the endpoint
* context before adding it again:
*/
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0,
mask & XHCI_INCTX_NON_CTRL_MASK);
/* Add new endpoint context */
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, mask);
/* find most significant set bit */
for (x = 31; x != 1; x--) {
if (mask & (1 << x))
break;
}
/* adjust */
x--;
/* figure out the maximum number of contexts */
if (x > sc->sc_hw.devs[index].context_num)
sc->sc_hw.devs[index].context_num = x;
else
x = sc->sc_hw.devs[index].context_num;
/* update number of contexts */
temp = xhci_ctx_get_le32(sc, &pinp->ctx_slot.dwSctx0);
temp &= ~XHCI_SCTX_0_CTX_NUM_SET(31);
temp |= XHCI_SCTX_0_CTX_NUM_SET(x + 1);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp);
}
usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc);
return (0);
}
static usb_error_t
xhci_configure_endpoint(struct usb_device *udev,
struct usb_endpoint_descriptor *edesc, struct xhci_endpoint_ext *pepext,
uint16_t interval, uint8_t max_packet_count,
uint8_t mult, uint8_t fps_shift, uint16_t max_packet_size,
uint16_t max_frame_size, uint8_t ep_mode)
{
struct usb_page_search buf_inp;
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_input_dev_ctx *pinp;
uint64_t ring_addr = pepext->physaddr;
uint32_t temp;
uint8_t index;
uint8_t epno;
uint8_t type;
index = udev->controller_slot_id;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
pinp = buf_inp.buffer;
epno = edesc->bEndpointAddress;
type = edesc->bmAttributes & UE_XFERTYPE;
if (type == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
if (epno == 0)
return (USB_ERR_NO_PIPE); /* invalid */
if (max_packet_count == 0)
return (USB_ERR_BAD_BUFSIZE);
max_packet_count--;
if (mult == 0)
return (USB_ERR_BAD_BUFSIZE);
/* store endpoint mode */
pepext->trb_ep_mode = ep_mode;
/* store bMaxPacketSize for control endpoints */
pepext->trb_ep_maxp = edesc->wMaxPacketSize[0];
usb_pc_cpu_flush(pepext->page_cache);
if (ep_mode == USB_EP_MODE_STREAMS) {
temp = XHCI_EPCTX_0_EPSTATE_SET(0) |
XHCI_EPCTX_0_MAXP_STREAMS_SET(XHCI_MAX_STREAMS_LOG - 1) |
XHCI_EPCTX_0_LSA_SET(1);
ring_addr += sizeof(struct xhci_trb) *
XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS;
} else {
temp = XHCI_EPCTX_0_EPSTATE_SET(0) |
XHCI_EPCTX_0_MAXP_STREAMS_SET(0) |
XHCI_EPCTX_0_LSA_SET(0);
ring_addr |= XHCI_EPCTX_2_DCS_SET(1);
}
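/*
* For regular endpoints the TR dequeue pointer carries the
* dequeue cycle state (DCS) in its low bit. For stream
* endpoints the pointer instead refers to the stream context
* array located after the per-stream transfer rings.
*/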
switch (udev->speed) {
case USB_SPEED_FULL:
case USB_SPEED_LOW:
/* 1ms -> 125us */
fps_shift += 3;
break;
default:
break;
}
switch (type) {
case UE_INTERRUPT:
if (fps_shift > 3)
fps_shift--;
temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift);
break;
case UE_ISOCHRONOUS:
temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift);
switch (udev->speed) {
case USB_SPEED_SUPER:
if (mult > 3)
mult = 3;
temp |= XHCI_EPCTX_0_MULT_SET(mult - 1);
max_packet_count /= mult;
break;
default:
break;
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx0, temp);
temp =
XHCI_EPCTX_1_HID_SET(0) |
XHCI_EPCTX_1_MAXB_SET(max_packet_count) |
XHCI_EPCTX_1_MAXP_SIZE_SET(max_packet_size);
/*
* Always enable the "three strikes and you are gone" feature
* except for ISOCHRONOUS endpoints. This is suggested by
* section 4.3.3 in the XHCI specification about device slot
* initialisation.
*/
if (type != UE_ISOCHRONOUS)
temp |= XHCI_EPCTX_1_CERR_SET(3);
switch (type) {
case UE_CONTROL:
temp |= XHCI_EPCTX_1_EPTYPE_SET(4);
break;
case UE_ISOCHRONOUS:
temp |= XHCI_EPCTX_1_EPTYPE_SET(1);
break;
case UE_BULK:
temp |= XHCI_EPCTX_1_EPTYPE_SET(2);
break;
default:
temp |= XHCI_EPCTX_1_EPTYPE_SET(3);
break;
}
/* check for IN direction */
if (epno & 1)
temp |= XHCI_EPCTX_1_EPTYPE_SET(4);
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx1, temp);
xhci_ctx_set_le64(sc, &pinp->ctx_ep[epno - 1].qwEpCtx2, ring_addr);
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_INTERRUPT:
case UE_ISOCHRONOUS:
temp = XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(max_frame_size) |
XHCI_EPCTX_4_AVG_TRB_LEN_SET(MIN(XHCI_PAGE_SIZE,
max_frame_size));
break;
case UE_CONTROL:
temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(8);
break;
default:
temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(XHCI_PAGE_SIZE);
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx4, temp);
#ifdef USB_DEBUG
xhci_dump_endpoint(sc, &pinp->ctx_ep[epno - 1]);
#endif
usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc);
return (0); /* success */
}
static usb_error_t
xhci_configure_endpoint_by_xfer(struct usb_xfer *xfer)
{
struct xhci_endpoint_ext *pepext;
struct usb_endpoint_ss_comp_descriptor *ecomp;
usb_stream_t x;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
ecomp = xfer->endpoint->ecomp;
for (x = 0; x != XHCI_MAX_STREAMS; x++) {
uint64_t temp;
/* halt any transfers */
pepext->trb[x * XHCI_MAX_TRANSFERS].dwTrb3 = 0;
/* compute start of TRB ring for stream "x" */
temp = pepext->physaddr +
(x * XHCI_MAX_TRANSFERS * sizeof(struct xhci_trb)) +
XHCI_SCTX_0_SCT_SEC_TR_RING;
/* make tree structure */
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].qwTrb0 = htole64(temp);
/* reserved fields */
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].dwTrb2 = 0;
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].dwTrb3 = 0;
}
usb_pc_cpu_flush(pepext->page_cache);
return (xhci_configure_endpoint(xfer->xroot->udev,
xfer->endpoint->edesc, pepext,
xfer->interval, xfer->max_packet_count,
(ecomp != NULL) ? UE_GET_SS_ISO_MULT(ecomp->bmAttributes) + 1 : 1,
usbd_xfer_get_fps_shift(xfer), xfer->max_packet_size,
xfer->max_frame_size, xfer->endpoint->ep_mode));
}
static usb_error_t
xhci_configure_device(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
struct usb_page_cache *pcinp;
struct xhci_input_dev_ctx *pinp;
struct usb_device *hubdev;
uint32_t temp;
uint32_t route;
uint32_t rh_port;
uint8_t is_hub;
uint8_t index;
uint8_t depth;
index = udev->controller_slot_id;
DPRINTF("index=%u\n", index);
pcinp = &sc->sc_hw.devs[index].input_pc;
usbd_get_page(pcinp, 0, &buf_inp);
pinp = buf_inp.buffer;
rh_port = 0;
route = 0;
/* figure out route string and root HUB port number */
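/*
* The xHCI route string packs one 4-bit port number per hub
* tier below the root hub (at most five tiers), while "rh_port"
* ends up holding the root HUB port number. Port numbers above
* 15 are clamped before being merged into the route string.
*/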
for (hubdev = udev; hubdev != NULL; hubdev = hubdev->parent_hub) {
if (hubdev->parent_hub == NULL)
break;
depth = hubdev->parent_hub->depth;
/*
* NOTE: HS/FS/LS devices and the SS root HUB can have
* more than 15 ports
*/
rh_port = hubdev->port_no;
if (depth == 0)
break;
if (rh_port > 15)
rh_port = 15;
if (depth < 6)
route |= rh_port << (4 * (depth - 1));
}
DPRINTF("Route=0x%08x\n", route);
temp = XHCI_SCTX_0_ROUTE_SET(route) |
XHCI_SCTX_0_CTX_NUM_SET(
sc->sc_hw.devs[index].context_num + 1);
switch (udev->speed) {
case USB_SPEED_LOW:
temp |= XHCI_SCTX_0_SPEED_SET(2);
if (udev->parent_hs_hub != NULL &&
udev->parent_hs_hub->ddesc.bDeviceProtocol ==
UDPROTO_HSHUBMTT) {
DPRINTF("Device inherits MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
case USB_SPEED_HIGH:
temp |= XHCI_SCTX_0_SPEED_SET(3);
if (sc->sc_hw.devs[index].nports != 0 &&
udev->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) {
DPRINTF("HUB supports MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
case USB_SPEED_FULL:
temp |= XHCI_SCTX_0_SPEED_SET(1);
if (udev->parent_hs_hub != NULL &&
udev->parent_hs_hub->ddesc.bDeviceProtocol ==
UDPROTO_HSHUBMTT) {
DPRINTF("Device inherits MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
default:
temp |= XHCI_SCTX_0_SPEED_SET(4);
break;
}
is_hub = sc->sc_hw.devs[index].nports != 0 &&
(udev->speed == USB_SPEED_SUPER ||
udev->speed == USB_SPEED_HIGH);
if (is_hub)
temp |= XHCI_SCTX_0_HUB_SET(1);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp);
temp = XHCI_SCTX_1_RH_PORT_SET(rh_port);
if (is_hub) {
temp |= XHCI_SCTX_1_NUM_PORTS_SET(
sc->sc_hw.devs[index].nports);
}
switch (udev->speed) {
case USB_SPEED_SUPER:
switch (sc->sc_hw.devs[index].state) {
case XHCI_ST_ADDRESSED:
case XHCI_ST_CONFIGURED:
/* enable power save */
temp |= XHCI_SCTX_1_MAX_EL_SET(sc->sc_exit_lat_max);
break;
default:
/* disable power save */
break;
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx1, temp);
temp = XHCI_SCTX_2_IRQ_TARGET_SET(0);
if (is_hub) {
temp |= XHCI_SCTX_2_TT_THINK_TIME_SET(
sc->sc_hw.devs[index].tt);
}
hubdev = udev->parent_hs_hub;
/* check if we should activate the transaction translator */
switch (udev->speed) {
case USB_SPEED_FULL:
case USB_SPEED_LOW:
if (hubdev != NULL) {
temp |= XHCI_SCTX_2_TT_HUB_SID_SET(
hubdev->controller_slot_id);
temp |= XHCI_SCTX_2_TT_PORT_NUM_SET(
udev->hs_port_no);
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx2, temp);
/*
* These fields should be initialized to zero, according to
* XHCI section 6.2.2 - slot context:
*/
temp = XHCI_SCTX_3_DEV_ADDR_SET(0) |
XHCI_SCTX_3_SLOT_STATE_SET(0);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx3, temp);
#ifdef USB_DEBUG
xhci_dump_device(sc, &pinp->ctx_slot);
#endif
usb_pc_cpu_flush(pcinp);
return (0); /* success */
}
static usb_error_t
xhci_alloc_device_ext(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_dev;
struct usb_page_search buf_ep;
struct xhci_trb *trb;
struct usb_page_cache *pc;
struct usb_page *pg;
uint64_t addr;
uint8_t index;
uint8_t i;
index = udev->controller_slot_id;
pc = &sc->sc_hw.devs[index].device_pc;
pg = &sc->sc_hw.devs[index].device_pg;
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ?
(2 * sizeof(struct xhci_dev_ctx)) :
sizeof(struct xhci_dev_ctx), XHCI_PAGE_SIZE))
goto error;
usbd_get_page(pc, 0, &buf_dev);
pc = &sc->sc_hw.devs[index].input_pc;
pg = &sc->sc_hw.devs[index].input_pg;
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ?
(2 * sizeof(struct xhci_input_dev_ctx)) :
sizeof(struct xhci_input_dev_ctx), XHCI_PAGE_SIZE)) {
goto error;
}
/* initialize all endpoint LINK TRBs */
for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) {
pc = &sc->sc_hw.devs[index].endpoint_pc[i];
pg = &sc->sc_hw.devs[index].endpoint_pg[i];
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg,
sizeof(struct xhci_dev_endpoint_trbs), XHCI_TRB_ALIGN)) {
goto error;
}
/* lookup endpoint TRB ring */
usbd_get_page(pc, 0, &buf_ep);
/* get TRB pointer */
trb = buf_ep.buffer;
trb += XHCI_MAX_TRANSFERS - 1;
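/*
* The last TRB slot of the default transfer ring is initialized
* as a LINK TRB pointing back to the start of the ring, which
* makes the ring circular.
*/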
/* get TRB start address */
addr = buf_ep.physaddr;
/* create LINK TRB */
trb->qwTrb0 = htole64(addr);
trb->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
trb->dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
usb_pc_cpu_flush(pc);
}
xhci_set_slot_pointer(sc, index, buf_dev.physaddr);
return (0);
error:
xhci_free_device_ext(udev);
return (USB_ERR_NOMEM);
}
static void
xhci_free_device_ext(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t i;
index = udev->controller_slot_id;
xhci_set_slot_pointer(sc, index, 0);
usb_pc_free_mem(&sc->sc_hw.devs[index].device_pc);
usb_pc_free_mem(&sc->sc_hw.devs[index].input_pc);
for (i = 0; i != XHCI_MAX_ENDPOINTS; i++)
usb_pc_free_mem(&sc->sc_hw.devs[index].endpoint_pc[i]);
}
static struct xhci_endpoint_ext *
xhci_get_endpoint_ext(struct usb_device *udev, struct usb_endpoint_descriptor *edesc)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_endpoint_ext *pepext;
struct usb_page_cache *pc;
struct usb_page_search buf_ep;
uint8_t epno;
uint8_t index;
epno = edesc->bEndpointAddress;
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
index = udev->controller_slot_id;
pc = &sc->sc_hw.devs[index].endpoint_pc[epno];
usbd_get_page(pc, 0, &buf_ep);
pepext = &sc->sc_hw.devs[index].endp[epno];
pepext->page_cache = pc;
pepext->trb = buf_ep.buffer;
pepext->physaddr = buf_ep.physaddr;
return (pepext);
}
static void
xhci_endpoint_doorbell(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
uint8_t epno;
uint8_t index;
epno = xfer->endpointno;
if (xfer->flags_int.control_xfr)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
index = xfer->xroot->udev->controller_slot_id;
if (xfer->xroot->udev->flags.self_suspended == 0) {
XWRITE4(sc, door, XHCI_DOORBELL(index),
epno | XHCI_DB_SID_SET(xfer->stream_id));
}
}
static void
xhci_transfer_remove(struct usb_xfer *xfer, usb_error_t error)
{
struct xhci_endpoint_ext *pepext;
if (xfer->flags_int.bandwidth_reclaimed) {
xfer->flags_int.bandwidth_reclaimed = 0;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
pepext->trb_used[xfer->stream_id]--;
pepext->xfer[xfer->qh_pos] = NULL;
if (error && pepext->trb_running != 0) {
pepext->trb_halted = 1;
pepext->trb_running = 0;
}
}
}
static usb_error_t
xhci_transfer_insert(struct usb_xfer *xfer)
{
struct xhci_td *td_first;
struct xhci_td *td_last;
struct xhci_trb *trb_link;
struct xhci_endpoint_ext *pepext;
uint64_t addr;
usb_stream_t id;
uint8_t i;
uint8_t inext;
uint8_t trb_limit;
DPRINTFN(8, "\n");
id = xfer->stream_id;
/* check if already inserted */
if (xfer->flags_int.bandwidth_reclaimed) {
DPRINTFN(8, "Already in schedule\n");
return (0);
}
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
td_first = xfer->td_transfer_first;
td_last = xfer->td_transfer_last;
addr = pepext->physaddr;
switch (xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
case UE_INTERRUPT:
/* single buffered */
trb_limit = 1;
break;
default:
/* multi buffered */
trb_limit = (XHCI_MAX_TRANSFERS - 2);
break;
}
if (pepext->trb_used[id] >= trb_limit) {
DPRINTFN(8, "Too many TDs queued.\n");
return (USB_ERR_NOMEM);
}
/* check if bMaxPacketSize changed */
if (xfer->flags_int.control_xfr != 0 &&
pepext->trb_ep_maxp != xfer->endpoint->edesc->wMaxPacketSize[0]) {
DPRINTFN(8, "Reconfigure control endpoint\n");
/* force driver to reconfigure endpoint */
pepext->trb_halted = 1;
pepext->trb_running = 0;
}
/* check for stopped condition, after putting transfer on interrupt queue */
if (pepext->trb_running == 0) {
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
DPRINTFN(8, "Not running\n");
/* start configuration */
(void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus),
&sc->sc_config_msg[0], &sc->sc_config_msg[1]);
return (0);
}
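/*
* Insert the TD chain into the endpoint's transfer ring: the
* ring entry at the current index is rewritten as a LINK TRB
* pointing at the first TD, the chain's trailing LINK TRB is
* pointed at the next ring entry, and the cycle bit of the ring
* entry is written last to hand the chain over to the hardware.
*/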
pepext->trb_used[id]++;
/* get current TRB index */
i = pepext->trb_index[id];
/* get next TRB index */
inext = (i + 1);
/* the last entry of the ring is a hardcoded link TRB */
if (inext >= (XHCI_MAX_TRANSFERS - 1))
inext = 0;
/* store next TRB index, before stream ID offset is added */
pepext->trb_index[id] = inext;
/* offset for stream */
i += id * XHCI_MAX_TRANSFERS;
inext += id * XHCI_MAX_TRANSFERS;
/* compute terminating return address */
addr += (inext * sizeof(struct xhci_trb));
/* compute link TRB pointer */
trb_link = td_last->td_trb + td_last->ntrb;
/* update next pointer of last link TRB */
trb_link->qwTrb0 = htole64(addr);
trb_link->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
trb_link->dwTrb3 = htole32(XHCI_TRB_3_IOC_BIT |
XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
#ifdef USB_DEBUG
xhci_dump_trb(&td_last->td_trb[td_last->ntrb]);
#endif
usb_pc_cpu_flush(td_last->page_cache);
/* write ahead chain end marker */
pepext->trb[inext].qwTrb0 = 0;
pepext->trb[inext].dwTrb2 = 0;
pepext->trb[inext].dwTrb3 = 0;
/* update next pointer of link TRB */
pepext->trb[i].qwTrb0 = htole64((uint64_t)td_first->td_self);
pepext->trb[i].dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
#ifdef USB_DEBUG
xhci_dump_trb(&pepext->trb[i]);
#endif
usb_pc_cpu_flush(pepext->page_cache);
/* toggle cycle bit which activates the transfer chain */
pepext->trb[i].dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
usb_pc_cpu_flush(pepext->page_cache);
DPRINTF("qh_pos = %u\n", i);
pepext->xfer[i] = xfer;
xfer->qh_pos = i;
xfer->flags_int.bandwidth_reclaimed = 1;
xhci_endpoint_doorbell(xfer);
return (0);
}
static void
xhci_root_intr(struct xhci_softc *sc)
{
uint16_t i;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
for (i = 1; i <= sc->sc_noport; i++) {
/* pick out CHANGE bits from the status register */
if (XREAD4(sc, oper, XHCI_PORTSC(i)) & (
XHCI_PS_CSC | XHCI_PS_PEC |
XHCI_PS_OCC | XHCI_PS_WRC |
XHCI_PS_PRC | XHCI_PS_PLC |
XHCI_PS_CEC)) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
/*------------------------------------------------------------------------*
* xhci_device_done - XHCI done handler
*
* NOTE: This function can be called twice in a row on the same
* USB transfer: once from close and once from interrupt.
*------------------------------------------------------------------------*/
static void
xhci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
/* remove transfer from HW queue */
xhci_transfer_remove(xfer, error);
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* XHCI data transfer support (generic type)
*------------------------------------------------------------------------*/
static void
xhci_device_generic_open(struct usb_xfer *xfer)
{
if (xfer->flags_int.isochronous_xfr) {
switch (xfer->xroot->udev->speed) {
case USB_SPEED_FULL:
break;
default:
usb_hs_bandwidth_alloc(xfer);
break;
}
}
}
static void
xhci_device_generic_close(struct usb_xfer *xfer)
{
DPRINTF("\n");
xhci_device_done(xfer, USB_ERR_CANCELLED);
if (xfer->flags_int.isochronous_xfr) {
switch (xfer->xroot->udev->speed) {
case USB_SPEED_FULL:
break;
default:
usb_hs_bandwidth_free(xfer);
break;
}
}
}
static void
xhci_device_generic_multi_enter(struct usb_endpoint *ep,
usb_stream_t stream_id, struct usb_xfer *enter_xfer)
{
struct usb_xfer *xfer;
/* check if there is a current transfer */
xfer = ep->endpoint_q[stream_id].curr;
if (xfer == NULL)
return;
/*
* Check if the current transfer has been started and then pick
* up the next one, if any. Otherwise wait for the next start
* event, due to the block-on-failure feature.
*/
if (!xfer->flags_int.bandwidth_reclaimed)
return;
xfer = TAILQ_FIRST(&ep->endpoint_q[stream_id].head);
if (xfer == NULL) {
/*
* In case of enter we have to consider that the
* transfer is queued by the USB core after the enter
* method is called.
*/
xfer = enter_xfer;
if (xfer == NULL)
return;
}
/* try to multi buffer */
xhci_transfer_insert(xfer);
}
static void
xhci_device_generic_enter(struct usb_xfer *xfer)
{
DPRINTF("\n");
/* set up TD's and QH */
xhci_setup_generic_chain(xfer);
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, xfer);
}
static void
xhci_device_generic_start(struct usb_xfer *xfer)
{
DPRINTF("\n");
/* try to insert xfer on HW queue */
xhci_transfer_insert(xfer);
/* try to multi buffer */
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, NULL);
/* add transfer last on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0)
usbd_transfer_timeout_ms(xfer, &xhci_timeout, xfer->timeout);
}
static const struct usb_pipe_methods xhci_device_generic_methods =
{
.open = xhci_device_generic_open,
.close = xhci_device_generic_close,
.enter = xhci_device_generic_enter,
.start = xhci_device_generic_start,
};
/*------------------------------------------------------------------------*
* xhci root HUB support
*------------------------------------------------------------------------*
* Simulate a hardware HUB by handling all the necessary requests.
*------------------------------------------------------------------------*/
#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) }
static const
struct usb_device_descriptor xhci_devd =
{
.bLength = sizeof(xhci_devd),
.bDescriptorType = UDESC_DEVICE, /* type */
HSETW(.bcdUSB, 0x0300), /* USB version */
.bDeviceClass = UDCLASS_HUB, /* class */
.bDeviceSubClass = UDSUBCLASS_HUB, /* subclass */
.bDeviceProtocol = UDPROTO_SSHUB, /* protocol */
.bMaxPacketSize = 9, /* max packet size */
HSETW(.idVendor, 0x0000), /* vendor */
HSETW(.idProduct, 0x0000), /* product */
HSETW(.bcdDevice, 0x0100), /* device version */
.iManufacturer = 1,
.iProduct = 2,
.iSerialNumber = 0,
.bNumConfigurations = 1, /* # of configurations */
};
static const
struct xhci_bos_desc xhci_bosd = {
.bosd = {
.bLength = sizeof(xhci_bosd.bosd),
.bDescriptorType = UDESC_BOS,
HSETW(.wTotalLength, sizeof(xhci_bosd)),
.bNumDeviceCaps = 3,
},
.usb2extd = {
.bLength = sizeof(xhci_bosd.usb2extd),
.bDescriptorType = 1,
.bDevCapabilityType = 2,
.bmAttributes[0] = 2,
},
.usbdcd = {
.bLength = sizeof(xhci_bosd.usbdcd),
.bDescriptorType = UDESC_DEVICE_CAPABILITY,
.bDevCapabilityType = 3,
.bmAttributes = 0, /* XXX */
HSETW(.wSpeedsSupported, 0x000C),
.bFunctionalitySupport = 8,
.bU1DevExitLat = 255, /* dummy - not used */
.wU2DevExitLat = { 0x00, 0x08 },
},
.cidd = {
.bLength = sizeof(xhci_bosd.cidd),
.bDescriptorType = 1,
.bDevCapabilityType = 4,
.bReserved = 0,
.bContainerID = 0, /* XXX */
},
};
static const
struct xhci_config_desc xhci_confd = {
.confd = {
.bLength = sizeof(xhci_confd.confd),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(xhci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0 /* max power */
},
.ifcd = {
.bLength = sizeof(xhci_confd.ifcd),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(xhci_confd.endpd),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | XHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 2, /* max 15 ports */
.bInterval = 255,
},
.endpcd = {
.bLength = sizeof(xhci_confd.endpcd),
.bDescriptorType = UDESC_ENDPOINT_SS_COMP,
.bMaxBurst = 0,
.bmAttributes = 0,
},
};
static const
struct usb_hub_ss_descriptor xhci_hubd = {
.bLength = sizeof(xhci_hubd),
.bDescriptorType = UDESC_SS_HUB,
};
static usb_error_t
xhci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
const char *str_ptr;
const void *ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t i;
uint16_t value;
uint16_t index;
uint8_t j;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_devd);
ptr = (const void *)&xhci_devd;
break;
case UDESC_BOS:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_bosd);
ptr = (const void *)&xhci_bosd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_confd);
ptr = (const void *)&xhci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "XHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= XHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if (value != 0 && value != 1) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n");
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTSC(index);
v = XREAD4(sc, oper, port);
i = XHCI_PS_PLS_GET(v);
v &= ~XHCI_PS_CLEAR;
switch (value) {
case UHF_C_BH_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_WRC);
break;
case UHF_C_PORT_CONFIG_ERROR:
XWRITE4(sc, oper, port, v | XHCI_PS_CEC);
break;
case UHF_C_PORT_SUSPEND:
case UHF_C_PORT_LINK_STATE:
XWRITE4(sc, oper, port, v | XHCI_PS_PLC);
break;
case UHF_C_PORT_CONNECTION:
XWRITE4(sc, oper, port, v | XHCI_PS_CSC);
break;
case UHF_C_PORT_ENABLE:
XWRITE4(sc, oper, port, v | XHCI_PS_PEC);
break;
case UHF_C_PORT_OVER_CURRENT:
XWRITE4(sc, oper, port, v | XHCI_PS_OCC);
break;
case UHF_C_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_PRC);
break;
case UHF_PORT_ENABLE:
XWRITE4(sc, oper, port, v | XHCI_PS_PED);
break;
case UHF_PORT_POWER:
XWRITE4(sc, oper, port, v & ~XHCI_PS_PP);
break;
case UHF_PORT_INDICATOR:
XWRITE4(sc, oper, port, v & ~XHCI_PS_PIC_SET(3));
break;
case UHF_PORT_SUSPEND:
/* U3 -> U15 */
if (i == 3) {
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(0xF) | XHCI_PS_LWS);
}
/* wait 20ms for resume sequence to complete */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50);
/* U0 */
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(0) | XHCI_PS_LWS);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = XREAD4(sc, capa, XHCI_HCSPARAMS0);
sc->sc_hub_desc.hubd = xhci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
if (XHCI_HCS0_PPC(v))
i = UHD_PWR_INDIVIDUAL;
else
i = UHD_PWR_GANGED;
if (XHCI_HCS0_PIND(v))
i |= UHD_PORT_IND;
i |= UHD_OC_INDIVIDUAL;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i);
/* see XHCI section 5.4.9: */
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 10;
for (j = 1; j <= sc->sc_noport; j++) {
v = XREAD4(sc, oper, XHCI_PORTSC(j));
if (v & XHCI_PS_DR) {
sc->sc_hub_desc.hubd.
DeviceRemovable[j / 8] |= 1U << (j % 8);
}
}
len = sc->sc_hub_desc.hubd.bLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "UR_GET_STATUS i=%d\n", index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = XREAD4(sc, oper, XHCI_PORTSC(index));
DPRINTFN(9, "port status=0x%08x\n", v);
i = UPS_PORT_LINK_STATE_SET(XHCI_PS_PLS_GET(v));
switch (XHCI_PS_SPEED_GET(v)) {
case 3:
i |= UPS_HIGH_SPEED;
break;
case 2:
i |= UPS_LOW_SPEED;
break;
case 1:
/* FULL speed */
break;
default:
i |= UPS_OTHER_SPEED;
break;
}
if (v & XHCI_PS_CCS)
i |= UPS_CURRENT_CONNECT_STATUS;
if (v & XHCI_PS_PED)
i |= UPS_PORT_ENABLED;
if (v & XHCI_PS_OCA)
i |= UPS_OVERCURRENT_INDICATOR;
if (v & XHCI_PS_PR)
i |= UPS_RESET;
if (v & XHCI_PS_PP) {
/*
* The USB 3.0 root hub reuses the
* USB 2.0 port power bit
*/
i |= UPS_PORT_POWER;
}
USETW(sc->sc_hub_desc.ps.wPortStatus, i);
i = 0;
if (v & XHCI_PS_CSC)
i |= UPS_C_CONNECT_STATUS;
if (v & XHCI_PS_PEC)
i |= UPS_C_PORT_ENABLED;
if (v & XHCI_PS_OCC)
i |= UPS_C_OVERCURRENT_INDICATOR;
if (v & XHCI_PS_WRC)
i |= UPS_C_BH_PORT_RESET;
if (v & XHCI_PS_PRC)
i |= UPS_C_PORT_RESET;
if (v & XHCI_PS_PLC)
i |= UPS_C_PORT_LINK_STATE;
if (v & XHCI_PS_CEC)
i |= UPS_C_PORT_CONFIG_ERROR;
USETW(sc->sc_hub_desc.ps.wPortChange, i);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
i = index >> 8;
index &= 0x00FF;
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTSC(index);
v = XREAD4(sc, oper, port) & ~XHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_U1_TIMEOUT:
if (XHCI_PS_SPEED_GET(v) != 4) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTPMSC(index);
v = XREAD4(sc, oper, port);
v &= ~XHCI_PM3_U1TO_SET(0xFF);
v |= XHCI_PM3_U1TO_SET(i);
XWRITE4(sc, oper, port, v);
break;
case UHF_PORT_U2_TIMEOUT:
if (XHCI_PS_SPEED_GET(v) != 4) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTPMSC(index);
v = XREAD4(sc, oper, port);
v &= ~XHCI_PM3_U2TO_SET(0xFF);
v |= XHCI_PM3_U2TO_SET(i);
XWRITE4(sc, oper, port, v);
break;
case UHF_BH_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_WPR);
break;
case UHF_PORT_LINK_STATE:
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(i) | XHCI_PS_LWS);
/* 4ms settle time */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250);
break;
case UHF_PORT_ENABLE:
DPRINTFN(3, "set port enable %d\n", index);
break;
case UHF_PORT_SUSPEND:
DPRINTFN(6, "suspend port %u (LPM=%u)\n", index, i);
j = XHCI_PS_SPEED_GET(v);
if ((j < 1) || (j > 3)) {
/* non-supported speed */
err = USB_ERR_IOERROR;
goto done;
}
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(i ? 2 /* LPM */ : 3) | XHCI_PS_LWS);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
XWRITE4(sc, oper, port, v | XHCI_PS_PR);
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
XWRITE4(sc, oper, port, v | XHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "set port test %d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "set port indicator %d\n", index);
v &= ~XHCI_PS_PIC_SET(3);
v |= XHCI_PS_PIC_SET(1);
XWRITE4(sc, oper, port, v);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER):
case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER):
case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER):
case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER):
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
xhci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
- struct xhci_softc *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t ntd;
uint32_t n;
- sc = XHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
/*
* The proof for the "ntd" formula is illustrated like this:
*
* +------------------------------------+
* | |
* | |remainder -> |
* | +-----+---+ |
* | | xxx | x | frm 0 |
* | +-----+---++ |
* | | xxx | xx | frm 1 |
* | +-----+----+ |
* | ... |
* +------------------------------------+
*
* "xxx" means a completely full USB transfer descriptor
*
* "x" and "xx" means a short USB packet
*
* For the remainder of a USB transfer modulo
* "max_data_length" we need two USB transfer descriptors.
* One to transfer the remaining data and one to finalise with
* a zero length packet in case the "force_short_xfer" flag is
* set. We only need two USB transfer descriptors in the case
* where the transfer length of the first one is a factor of
* "max_frame_size". The rest of the needed USB transfer
* descriptors is given by the buffer size divided by the
* maximum data payload.
*/
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 16 * 3;
parm->hc_max_frame_size = XHCI_TD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
if (xfer->flags_int.isochronous_xfr) {
ntd = ((1 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (xfer->flags_int.control_xfr) {
ntd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else {
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
}
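/*
 * Illustrative example (added note, not part of the original source):
 * for a plain bulk transfer with nframes = 1 the formula above
 * reserves two TDs for the last frame (the remainder plus an optional
 * zero-length packet) and one TD per full max_hc_frame_size chunk of
 * the buffer, i.e. ntd = 2 * 1 + max_data_length / max_hc_frame_size.
 */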
alloc_dma_set:
if (parm->err)
return;
/*
* Allocate queue heads and transfer descriptors
*/
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(struct xhci_td),
XHCI_TD_ALIGN, ntd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != ntd; n++) {
struct xhci_td *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->td_self = page_info.physaddr;
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static usb_error_t
xhci_configure_reset_endpoint(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
struct usb_page_search buf_inp;
struct usb_device *udev;
struct xhci_endpoint_ext *pepext;
struct usb_endpoint_descriptor *edesc;
struct usb_page_cache *pcinp;
usb_error_t err;
usb_stream_t stream_id;
uint8_t index;
uint8_t epno;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
udev = xfer->xroot->udev;
index = udev->controller_slot_id;
pcinp = &sc->sc_hw.devs[index].input_pc;
usbd_get_page(pcinp, 0, &buf_inp);
edesc = xfer->endpoint->edesc;
epno = edesc->bEndpointAddress;
stream_id = xfer->stream_id;
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
if (epno == 0)
return (USB_ERR_NO_PIPE); /* invalid */
XHCI_CMD_LOCK(sc);
/* configure endpoint */
err = xhci_configure_endpoint_by_xfer(xfer);
if (err != 0) {
XHCI_CMD_UNLOCK(sc);
return (err);
}
/*
* Get the endpoint into the stopped state according to the
* endpoint context state diagram in the XHCI specification:
*/
err = xhci_cmd_stop_ep(sc, 0, epno, index);
if (err != 0)
DPRINTF("Could not stop endpoint %u\n", epno);
err = xhci_cmd_reset_ep(sc, 0, epno, index);
if (err != 0)
DPRINTF("Could not reset endpoint %u\n", epno);
err = xhci_cmd_set_tr_dequeue_ptr(sc,
(pepext->physaddr + (stream_id * sizeof(struct xhci_trb) *
XHCI_MAX_TRANSFERS)) | XHCI_EPCTX_2_DCS_SET(1),
stream_id, epno, index);
if (err != 0)
DPRINTF("Could not set dequeue ptr for endpoint %u\n", epno);
/*
* Get the endpoint into the running state according to the
* endpoint context state diagram in the XHCI specification:
*/
xhci_configure_mask(udev, (1U << epno) | 1U, 0);
if (epno > 1)
err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index);
else
err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
if (err != 0)
DPRINTF("Could not configure endpoint %u\n", epno);
XHCI_CMD_UNLOCK(sc);
return (0);
}
static void
xhci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
xhci_start_dma_delay(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
/* put transfer on interrupt queue (again) */
usbd_transfer_enqueue(&sc->sc_bus.intr_q, xfer);
(void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus),
&sc->sc_config_msg[0], &sc->sc_config_msg[1]);
}
static void
xhci_configure_msg(struct usb_proc_msg *pm)
{
struct xhci_softc *sc;
struct xhci_endpoint_ext *pepext;
struct usb_xfer *xfer;
sc = XHCI_BUS2SC(((struct usb_bus_msg *)pm)->bus);
restart:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
if ((pepext->trb_halted != 0) ||
(pepext->trb_running == 0)) {
uint16_t i;
/* clear halted and running */
pepext->trb_halted = 0;
pepext->trb_running = 0;
/* nuke remaining buffered transfers */
for (i = 0; i != (XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS); i++) {
/*
* NOTE: We need to use the timeout
* error code here else existing
* isochronous clients can get
* confused:
*/
if (pepext->xfer[i] != NULL) {
xhci_device_done(pepext->xfer[i],
USB_ERR_TIMEOUT);
}
}
/*
* NOTE: The USB transfer cannot vanish in
* this state!
*/
USB_BUS_UNLOCK(&sc->sc_bus);
xhci_configure_reset_endpoint(xfer);
USB_BUS_LOCK(&sc->sc_bus);
/* check if halted is still cleared */
if (pepext->trb_halted == 0) {
pepext->trb_running = 1;
memset(pepext->trb_index, 0,
sizeof(pepext->trb_index));
}
goto restart;
}
if (xfer->flags_int.did_dma_delay) {
/* remove transfer from interrupt queue (again) */
usbd_transfer_dequeue(xfer);
/* we are finally done */
usb_dma_delay_done_cb(xfer);
/* queue changed - restart */
goto restart;
}
}
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/* try to insert xfer on HW queue */
xhci_transfer_insert(xfer);
/* try to multi buffer */
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, NULL);
}
}
static void
xhci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
struct xhci_endpoint_ext *pepext;
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d\n",
ep, udev->address, edesc->bEndpointAddress, udev->flags.usb_mode);
if (udev->parent_hub == NULL) {
/* root HUB has special endpoint handling */
return;
}
ep->methods = &xhci_device_generic_methods;
pepext = xhci_get_endpoint_ext(udev, edesc);
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
}
static void
xhci_ep_uninit(struct usb_device *udev, struct usb_endpoint *ep)
{
}
static void
xhci_ep_clear_stall(struct usb_device *udev, struct usb_endpoint *ep)
{
struct xhci_endpoint_ext *pepext;
DPRINTF("\n");
if (udev->flags.usb_mode != USB_MODE_HOST) {
/* not supported */
return;
}
if (udev->parent_hub == NULL) {
/* root HUB has special endpoint handling */
return;
}
pepext = xhci_get_endpoint_ext(udev, ep->edesc);
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
}
static usb_error_t
xhci_device_init(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
usb_error_t err;
uint8_t temp;
/* no init for root HUB */
if (udev->parent_hub == NULL)
return (0);
XHCI_CMD_LOCK(sc);
/* set invalid default */
udev->controller_slot_id = sc->sc_noslot + 1;
/* try to get a new slot ID from the XHCI */
err = xhci_cmd_enable_slot(sc, &temp);
if (err) {
XHCI_CMD_UNLOCK(sc);
return (err);
}
if (temp > sc->sc_noslot) {
XHCI_CMD_UNLOCK(sc);
return (USB_ERR_BAD_ADDRESS);
}
if (sc->sc_hw.devs[temp].state != XHCI_ST_DISABLED) {
DPRINTF("slot %u already allocated.\n", temp);
XHCI_CMD_UNLOCK(sc);
return (USB_ERR_BAD_ADDRESS);
}
/* store slot ID for later reference */
udev->controller_slot_id = temp;
/* reset data structure */
memset(&sc->sc_hw.devs[temp], 0, sizeof(sc->sc_hw.devs[0]));
/* mark the slot as allocated */
sc->sc_hw.devs[temp].state = XHCI_ST_ENABLED;
err = xhci_alloc_device_ext(udev);
XHCI_CMD_UNLOCK(sc);
/* get device into default state */
if (err == 0)
err = xhci_set_address(udev, NULL, 0);
return (err);
}
static void
xhci_device_uninit(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
/* no init for root HUB */
if (udev->parent_hub == NULL)
return;
XHCI_CMD_LOCK(sc);
index = udev->controller_slot_id;
if (index <= sc->sc_noslot) {
xhci_cmd_disable_slot(sc, index);
sc->sc_hw.devs[index].state = XHCI_ST_DISABLED;
/* free device extension */
xhci_free_device_ext(udev);
}
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until the hardware has finished any possible use of
* the transfer descriptor(s)
*/
*pus = 2048; /* microseconds */
}
static void
xhci_device_resume(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t n;
uint8_t p;
DPRINTF("\n");
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
XHCI_CMD_LOCK(sc);
/* blindly resume all endpoints */
USB_BUS_LOCK(udev->bus);
for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) {
for (p = 0; p != XHCI_MAX_STREAMS; p++) {
XWRITE4(sc, door, XHCI_DOORBELL(index),
n | XHCI_DB_SID_SET(p));
}
}
USB_BUS_UNLOCK(udev->bus);
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_device_suspend(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t n;
usb_error_t err;
DPRINTF("\n");
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
XHCI_CMD_LOCK(sc);
/* blindly suspend all endpoints */
for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) {
err = xhci_cmd_stop_ep(sc, 1, n, index);
if (err != 0) {
DPRINTF("Failed to suspend endpoint "
"%u on slot %u (ignored).\n", n, index);
}
}
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_set_hw_power(struct usb_bus *bus)
{
DPRINTF("\n");
}
static void
xhci_device_state_change(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
usb_error_t err;
uint8_t index;
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
DPRINTF("\n");
if (usb_get_device_state(udev) == USB_STATE_CONFIGURED) {
err = uhub_query_info(udev, &sc->sc_hw.devs[index].nports,
&sc->sc_hw.devs[index].tt);
if (err != 0)
sc->sc_hw.devs[index].nports = 0;
}
XHCI_CMD_LOCK(sc);
switch (usb_get_device_state(udev)) {
case USB_STATE_POWERED:
if (sc->sc_hw.devs[index].state == XHCI_ST_DEFAULT)
break;
/* set default state */
sc->sc_hw.devs[index].state = XHCI_ST_DEFAULT;
/* reset number of contexts */
sc->sc_hw.devs[index].context_num = 0;
err = xhci_cmd_reset_dev(sc, index);
if (err != 0) {
DPRINTF("Device reset failed "
"for slot %u.\n", index);
}
break;
case USB_STATE_ADDRESSED:
if (sc->sc_hw.devs[index].state == XHCI_ST_ADDRESSED)
break;
sc->sc_hw.devs[index].state = XHCI_ST_ADDRESSED;
/* set configure mask to slot only */
xhci_configure_mask(udev, 1, 0);
/* deconfigure all endpoints, except EP0 */
err = xhci_cmd_configure_ep(sc, 0, 1, index);
if (err) {
DPRINTF("Failed to deconfigure "
"slot %u.\n", index);
}
break;
case USB_STATE_CONFIGURED:
if (sc->sc_hw.devs[index].state == XHCI_ST_CONFIGURED)
break;
/* set configured state */
sc->sc_hw.devs[index].state = XHCI_ST_CONFIGURED;
/* reset number of contexts */
sc->sc_hw.devs[index].context_num = 0;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
xhci_configure_mask(udev, 3, 0);
err = xhci_configure_device(udev);
if (err != 0) {
DPRINTF("Could not configure device "
"at slot %u.\n", index);
}
err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
if (err != 0) {
DPRINTF("Could not evaluate device "
"context at slot %u.\n", index);
}
break;
default:
break;
}
XHCI_CMD_UNLOCK(sc);
}
static usb_error_t
xhci_set_endpoint_mode(struct usb_device *udev, struct usb_endpoint *ep,
uint8_t ep_mode)
{
switch (ep_mode) {
case USB_EP_MODE_DEFAULT:
return (0);
case USB_EP_MODE_STREAMS:
if (xhcistreams == 0 ||
(ep->edesc->bmAttributes & UE_XFERTYPE) != UE_BULK ||
udev->speed != USB_SPEED_SUPER)
return (USB_ERR_INVAL);
return (0);
default:
return (USB_ERR_INVAL);
}
}
static const struct usb_bus_methods xhci_bus_methods = {
.endpoint_init = xhci_ep_init,
.endpoint_uninit = xhci_ep_uninit,
.xfer_setup = xhci_xfer_setup,
.xfer_unsetup = xhci_xfer_unsetup,
.get_dma_delay = xhci_get_dma_delay,
.device_init = xhci_device_init,
.device_uninit = xhci_device_uninit,
.device_resume = xhci_device_resume,
.device_suspend = xhci_device_suspend,
.set_hw_power = xhci_set_hw_power,
.roothub_exec = xhci_roothub_exec,
.xfer_poll = xhci_do_poll,
.start_dma_delay = xhci_start_dma_delay,
.set_address = xhci_set_address,
.clear_stall = xhci_ep_clear_stall,
.device_state_change = xhci_device_state_change,
.set_hw_power_sleep = xhci_set_hw_power_sleep,
.set_endpoint_mode = xhci_set_endpoint_mode,
};
Index: head/sys/dev/usb/storage/umass.c
===================================================================
--- head/sys/dev/usb/storage/umass.c (revision 327172)
+++ head/sys/dev/usb/storage/umass.c (revision 327173)
@@ -1,3020 +1,3019 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1999 MAEKAWA Masahide <bishop@rr.iij4u.or.jp>,
* Nick Hibma <n_hibma@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
* $NetBSD: umass.c,v 1.28 2000/04/02 23:46:53 augustss Exp $
*/
/* Also already merged from NetBSD:
* $NetBSD: umass.c,v 1.67 2001/11/25 19:05:22 augustss Exp $
* $NetBSD: umass.c,v 1.90 2002/11/04 19:17:33 pooka Exp $
* $NetBSD: umass.c,v 1.108 2003/11/07 17:03:25 wiz Exp $
* $NetBSD: umass.c,v 1.109 2003/12/04 13:57:31 keihan Exp $
*/
/*
* Universal Serial Bus Mass Storage Class specs:
* http://www.usb.org/developers/devclass_docs/usb_msc_overview_1.2.pdf
* http://www.usb.org/developers/devclass_docs/usbmassbulk_10.pdf
* http://www.usb.org/developers/devclass_docs/usb_msc_cbi_1.1.pdf
* http://www.usb.org/developers/devclass_docs/usbmass-ufi10.pdf
*/
/*
* Ported to NetBSD by Lennart Augustsson <augustss@NetBSD.org>.
* Parts of the code written by Jason R. Thorpe <thorpej@shagadelic.org>.
*/
/*
* The driver handles 3 Wire Protocols
* - Command/Bulk/Interrupt (CBI)
* - Command/Bulk/Interrupt with Command Completion Interrupt (CBI with CCI)
* - Mass Storage Bulk-Only (BBB)
* (BBB refers to Bulk/Bulk/Bulk for Command/Data/Status phases)
*
* Over these wire protocols it handles the following command protocols
* - SCSI
* - UFI (floppy command set)
* - 8070i (ATAPI)
*
* UFI and 8070i (ATAPI) are transformed versions of the SCSI command set. The
* sc->sc_transform method is used to convert the commands into the appropriate
* format (if at all necessary). For example, UFI requires all commands to be
* 12 bytes in length amongst other things.
*
* The source code below is marked and can be split into a number of pieces
* (in this order):
*
* - probe/attach/detach
* - generic transfer routines
* - BBB
* - CBI
* - CBI_I (in addition to functions from CBI)
* - CAM (Common Access Method)
* - SCSI
* - UFI
* - 8070i (ATAPI)
*
* The protocols are implemented using a state machine, for the transfers as
* well as for the resets. The state machine is contained in umass_t_*_callback.
* The state machine is started through either umass_command_start() or
* umass_reset().
*
* The reason for doing this is a) CAM performs a lot better this way and b) it
* avoids using tsleep from interrupt context (for example after a failed
* transfer).
*/
/*
* The SCSI related part of this driver has been derived from the
* dev/ppbus/vpo.c driver, by Nicolas Souchu (nsouch@FreeBSD.org).
*
* The CAM layer uses so called actions which are messages sent to the host
* adapter for completion. The actions come in through umass_cam_action. The
* appropriate block of routines is called depending on the transport protocol
* in use. When the transfer has finished, these routines call
* umass_cam_cb again to complete the CAM command.
*/
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#include "usbdevs.h"
#include <dev/usb/quirk/usb_quirk.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_periph.h>
#ifdef USB_DEBUG
#define DIF(m, x) \
do { \
if (umass_debug & (m)) { x ; } \
} while (0)
#define DPRINTF(sc, m, fmt, ...) \
do { \
if (umass_debug & (m)) { \
printf("%s:%s: " fmt, \
(sc) ? (const char *)(sc)->sc_name : \
(const char *)"umassX", \
__FUNCTION__ ,## __VA_ARGS__); \
} \
} while (0)
#define UDMASS_GEN 0x00010000 /* general */
#define UDMASS_SCSI 0x00020000 /* scsi */
#define UDMASS_UFI 0x00040000 /* ufi command set */
#define UDMASS_ATAPI 0x00080000 /* 8070i command set */
#define UDMASS_CMD (UDMASS_SCSI|UDMASS_UFI|UDMASS_ATAPI)
#define UDMASS_USB 0x00100000 /* USB general */
#define UDMASS_BBB 0x00200000 /* Bulk-Only transfers */
#define UDMASS_CBI 0x00400000 /* CBI transfers */
#define UDMASS_WIRE (UDMASS_BBB|UDMASS_CBI)
#define UDMASS_ALL 0xffff0000 /* all of the above */
static int umass_debug;
static int umass_throttle;
static SYSCTL_NODE(_hw_usb, OID_AUTO, umass, CTLFLAG_RW, 0, "USB umass");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, debug, CTLFLAG_RWTUN,
&umass_debug, 0, "umass debug level");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, throttle, CTLFLAG_RWTUN,
&umass_throttle, 0, "Forced delay between commands in milliseconds");
#else
#define DIF(...) do { } while (0)
#define DPRINTF(...) do { } while (0)
#endif
#define UMASS_BULK_SIZE (1 << 17)
#define UMASS_CBI_DIAGNOSTIC_CMDLEN 12 /* bytes */
#define UMASS_MAX_CMDLEN MAX(12, CAM_MAX_CDBLEN) /* bytes */
/* USB transfer definitions */
#define UMASS_T_BBB_RESET1 0 /* Bulk-Only */
#define UMASS_T_BBB_RESET2 1
#define UMASS_T_BBB_RESET3 2
#define UMASS_T_BBB_COMMAND 3
#define UMASS_T_BBB_DATA_READ 4
#define UMASS_T_BBB_DATA_RD_CS 5
#define UMASS_T_BBB_DATA_WRITE 6
#define UMASS_T_BBB_DATA_WR_CS 7
#define UMASS_T_BBB_STATUS 8
#define UMASS_T_BBB_MAX 9
#define UMASS_T_CBI_RESET1 0 /* CBI */
#define UMASS_T_CBI_RESET2 1
#define UMASS_T_CBI_RESET3 2
#define UMASS_T_CBI_COMMAND 3
#define UMASS_T_CBI_DATA_READ 4
#define UMASS_T_CBI_DATA_RD_CS 5
#define UMASS_T_CBI_DATA_WRITE 6
#define UMASS_T_CBI_DATA_WR_CS 7
#define UMASS_T_CBI_STATUS 8
#define UMASS_T_CBI_RESET4 9
#define UMASS_T_CBI_MAX 10
#define UMASS_T_MAX MAX(UMASS_T_CBI_MAX, UMASS_T_BBB_MAX)
/* Generic definitions */
/* Direction for transfer */
#define DIR_NONE 0
#define DIR_IN 1
#define DIR_OUT 2
/* device name */
#define DEVNAME "umass"
#define DEVNAME_SIM "umass-sim"
/* Approximate maximum transfer speeds (assumes 33% overhead). */
#define UMASS_FULL_TRANSFER_SPEED 1000
#define UMASS_HIGH_TRANSFER_SPEED 40000
#define UMASS_SUPER_TRANSFER_SPEED 400000
#define UMASS_FLOPPY_TRANSFER_SPEED 20
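/*
 * Illustrative arithmetic (added note, not part of the original
 * source): full speed signals at 12 Mbit/s = 1500 kByte/s and high
 * speed at 480 Mbit/s = 60000 kByte/s; discounting roughly 33%
 * protocol overhead yields the 1000 and 40000 kByte/s figures above.
 */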
#define UMASS_TIMEOUT 5000 /* ms */
/* CAM specific definitions */
#define UMASS_SCSIID_MAX 1 /* maximum number of drives expected */
#define UMASS_SCSIID_HOST UMASS_SCSIID_MAX
/* Bulk-Only features */
#define UR_BBB_RESET 0xff /* Bulk-Only reset */
#define UR_BBB_GET_MAX_LUN 0xfe /* Get maximum lun */
/* Command Block Wrapper */
typedef struct {
uDWord dCBWSignature;
#define CBWSIGNATURE 0x43425355
uDWord dCBWTag;
uDWord dCBWDataTransferLength;
uByte bCBWFlags;
#define CBWFLAGS_OUT 0x00
#define CBWFLAGS_IN 0x80
uByte bCBWLUN;
uByte bCDBLength;
#define CBWCDBLENGTH 16
uByte CBWCDB[CBWCDBLENGTH];
} __packed umass_bbb_cbw_t;
#define UMASS_BBB_CBW_SIZE 31
/* Command Status Wrapper */
typedef struct {
uDWord dCSWSignature;
#define CSWSIGNATURE 0x53425355
#define CSWSIGNATURE_IMAGINATION_DBX1 0x43425355
#define CSWSIGNATURE_OLYMPUS_C1 0x55425355
uDWord dCSWTag;
uDWord dCSWDataResidue;
uByte bCSWStatus;
#define CSWSTATUS_GOOD 0x0
#define CSWSTATUS_FAILED 0x1
#define CSWSTATUS_PHASE 0x2
} __packed umass_bbb_csw_t;
#define UMASS_BBB_CSW_SIZE 13
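/*
 * Hedged sketch (illustration only, not part of the original source):
 * the wire sizes implied by the packed structures above are
 * 3*4 + 3 + 16 = 31 bytes for the CBW and 3*4 + 1 = 13 bytes for the
 * CSW, which can be pinned down at compile time:
 */
CTASSERT(sizeof(umass_bbb_cbw_t) == UMASS_BBB_CBW_SIZE);
CTASSERT(sizeof(umass_bbb_csw_t) == UMASS_BBB_CSW_SIZE);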
/* CBI features */
#define UR_CBI_ADSC 0x00
typedef union {
struct {
uint8_t type;
#define IDB_TYPE_CCI 0x00
uint8_t value;
#define IDB_VALUE_PASS 0x00
#define IDB_VALUE_FAIL 0x01
#define IDB_VALUE_PHASE 0x02
#define IDB_VALUE_PERSISTENT 0x03
#define IDB_VALUE_STATUS_MASK 0x03
} __packed common;
struct {
uint8_t asc;
uint8_t ascq;
} __packed ufi;
} __packed umass_cbi_sbl_t;
struct umass_softc; /* see below */
typedef void (umass_callback_t)(struct umass_softc *sc, union ccb *ccb,
uint32_t residue, uint8_t status);
#define STATUS_CMD_OK 0 /* everything ok */
#define STATUS_CMD_UNKNOWN 1 /* will have to fetch sense */
#define STATUS_CMD_FAILED 2 /* transfer was ok, command failed */
#define STATUS_WIRE_FAILED 3 /* couldn't even get command across */
typedef uint8_t (umass_transform_t)(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len);
/* Wire and command protocol */
#define UMASS_PROTO_BBB 0x0001 /* USB wire protocol */
#define UMASS_PROTO_CBI 0x0002
#define UMASS_PROTO_CBI_I 0x0004
#define UMASS_PROTO_WIRE 0x00ff /* USB wire protocol mask */
#define UMASS_PROTO_SCSI 0x0100 /* command protocol */
#define UMASS_PROTO_ATAPI 0x0200
#define UMASS_PROTO_UFI 0x0400
#define UMASS_PROTO_RBC 0x0800
#define UMASS_PROTO_COMMAND 0xff00 /* command protocol mask */
/* Device specific quirks */
#define NO_QUIRKS 0x0000
/*
* The drive does not support Test Unit Ready. Convert to Start Unit
*/
#define NO_TEST_UNIT_READY 0x0001
/*
* The drive does not reset the Unit Attention state after REQUEST
* SENSE has been sent. The INQUIRY command does not reset the UA
* either, and so CAM runs in circles trying to retrieve the initial
* INQUIRY data.
*/
#define RS_NO_CLEAR_UA 0x0002
/* The drive does not support START STOP. */
#define NO_START_STOP 0x0004
/* Don't ask for full inquiry data (255b). */
#define FORCE_SHORT_INQUIRY 0x0008
/* Needs to be initialised the Shuttle way */
#define SHUTTLE_INIT 0x0010
/* Drive needs to be switched to alternate iface 1 */
#define ALT_IFACE_1 0x0020
/* Drive does not do 1Mb/s, but just floppy speeds (20kb/s) */
#define FLOPPY_SPEED 0x0040
/* The device can't count and gets the residue of transfers wrong */
#define IGNORE_RESIDUE 0x0080
/* No GetMaxLun call */
#define NO_GETMAXLUN 0x0100
/* The device uses a weird CSWSIGNATURE. */
#define WRONG_CSWSIG 0x0200
/* Device cannot handle INQUIRY so fake a generic response */
#define NO_INQUIRY 0x0400
/* Device cannot handle INQUIRY EVPD, return CHECK CONDITION */
#define NO_INQUIRY_EVPD 0x0800
/* Pad all RBC requests to 12 bytes. */
#define RBC_PAD_TO_12 0x1000
/*
* Device reports number of sectors from READ_CAPACITY, not max
* sector number.
*/
#define READ_CAPACITY_OFFBY1 0x2000
/*
* Device cannot handle a SCSI synchronize cache command. Normally
* this quirk would be handled in the cam layer, but for IDE bridges
* we need to associate the quirk with the bridge and not the
* underlying disk device. This is handled by faking a success
* result.
*/
#define NO_SYNCHRONIZE_CACHE 0x4000
/* Device does not support 'PREVENT/ALLOW MEDIUM REMOVAL'. */
#define NO_PREVENT_ALLOW 0x8000
struct umass_softc {
struct scsi_sense cam_scsi_sense;
struct scsi_test_unit_ready cam_scsi_test_unit_ready;
struct mtx sc_mtx;
struct {
uint8_t *data_ptr;
union ccb *ccb;
umass_callback_t *callback;
uint32_t data_len; /* bytes */
uint32_t data_rem; /* bytes */
uint32_t data_timeout; /* ms */
uint32_t actlen; /* bytes */
uint8_t cmd_data[UMASS_MAX_CMDLEN];
uint8_t cmd_len; /* bytes */
uint8_t dir;
uint8_t lun;
} sc_transfer;
/* Bulk specific variables for transfers in progress */
umass_bbb_cbw_t cbw; /* command block wrapper */
umass_bbb_csw_t csw; /* command status wrapper */
/* CBI specific variables for transfers in progress */
umass_cbi_sbl_t sbl; /* status block */
device_t sc_dev;
struct usb_device *sc_udev;
struct cam_sim *sc_sim; /* SCSI Interface Module */
struct usb_xfer *sc_xfer[UMASS_T_MAX];
/*
* The command transform function is used to convert the SCSI
* commands into their derivatives, like UFI, ATAPI, and friends.
*/
umass_transform_t *sc_transform;
uint32_t sc_unit;
uint32_t sc_quirks; /* they got it almost right */
uint32_t sc_proto; /* wire and cmd protocol */
uint8_t sc_name[16];
uint8_t sc_iface_no; /* interface number */
uint8_t sc_maxlun; /* maximum LUN number, inclusive */
uint8_t sc_last_xfer_index;
uint8_t sc_status_try;
};
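/*
 * Hedged sketch (illustration only, not part of the original source):
 * the kind of conversion sc_transform performs for the UFI and ATAPI
 * command sets, which require fixed 12-byte command blocks -- copy the
 * SCSI CDB and zero-pad the remainder.  The helper name is
 * hypothetical; the real code lives in umass_ufi_transform() and
 * umass_atapi_transform() below.
 */
static uint8_t
example_pad_cmd_to_12(uint8_t *dst, const uint8_t *cmd, uint8_t cmd_len)
{
	if (cmd_len == 0 || cmd_len > 12)
		return (0);		/* cannot be represented */
	memcpy(dst, cmd, cmd_len);
	memset(dst + cmd_len, 0, 12 - cmd_len);
	return (1);
}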
struct umass_probe_proto {
uint32_t quirks;
uint32_t proto;
int error;
};
/* prototypes */
static device_probe_t umass_probe;
static device_attach_t umass_attach;
static device_detach_t umass_detach;
static usb_callback_t umass_tr_error;
static usb_callback_t umass_t_bbb_reset1_callback;
static usb_callback_t umass_t_bbb_reset2_callback;
static usb_callback_t umass_t_bbb_reset3_callback;
static usb_callback_t umass_t_bbb_command_callback;
static usb_callback_t umass_t_bbb_data_read_callback;
static usb_callback_t umass_t_bbb_data_rd_cs_callback;
static usb_callback_t umass_t_bbb_data_write_callback;
static usb_callback_t umass_t_bbb_data_wr_cs_callback;
static usb_callback_t umass_t_bbb_status_callback;
static usb_callback_t umass_t_cbi_reset1_callback;
static usb_callback_t umass_t_cbi_reset2_callback;
static usb_callback_t umass_t_cbi_reset3_callback;
static usb_callback_t umass_t_cbi_reset4_callback;
static usb_callback_t umass_t_cbi_command_callback;
static usb_callback_t umass_t_cbi_data_read_callback;
static usb_callback_t umass_t_cbi_data_rd_cs_callback;
static usb_callback_t umass_t_cbi_data_write_callback;
static usb_callback_t umass_t_cbi_data_wr_cs_callback;
static usb_callback_t umass_t_cbi_status_callback;
static void umass_cancel_ccb(struct umass_softc *);
static void umass_init_shuttle(struct umass_softc *);
static void umass_reset(struct umass_softc *);
static void umass_t_bbb_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static void umass_command_start(struct umass_softc *, uint8_t, void *,
uint32_t, uint32_t, umass_callback_t *, union ccb *);
static uint8_t umass_bbb_get_max_lun(struct umass_softc *);
static void umass_cbi_start_status(struct umass_softc *);
static void umass_t_cbi_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static int umass_cam_attach_sim(struct umass_softc *);
static void umass_cam_attach(struct umass_softc *);
static void umass_cam_detach_sim(struct umass_softc *);
static void umass_cam_action(struct cam_sim *, union ccb *);
static void umass_cam_poll(struct cam_sim *);
static void umass_cam_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_sense_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_quirk_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static uint8_t umass_scsi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_rbc_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_ufi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_atapi_transform(struct umass_softc *, uint8_t *,
uint8_t);
static uint8_t umass_no_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_std_transform(struct umass_softc *, union ccb *, uint8_t
*, uint8_t);
#ifdef USB_DEBUG
static void umass_bbb_dump_cbw(struct umass_softc *, umass_bbb_cbw_t *);
static void umass_bbb_dump_csw(struct umass_softc *, umass_bbb_csw_t *);
static void umass_cbi_dump_cmd(struct umass_softc *, void *, uint8_t);
static void umass_dump_buffer(struct umass_softc *, uint8_t *, uint32_t,
uint32_t);
#endif
static struct usb_config umass_bbb_config[UMASS_T_BBB_MAX] = {
[UMASS_T_BBB_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_BBB_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_COMMAND] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = sizeof(umass_bbb_cbw_t),
.callback = &umass_t_bbb_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_STATUS] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = sizeof(umass_bbb_csw_t),
.flags = {.short_xfer_ok = 1,},
.callback = &umass_t_bbb_status_callback,
.timeout = 5000, /* ms */
},
};
static struct usb_config umass_cbi_config[UMASS_T_CBI_MAX] = {
[UMASS_T_CBI_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_CBI_DIAGNOSTIC_CMDLEN),
.callback = &umass_t_cbi_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_CBI_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_COMMAND] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_MAX_CMDLEN),
.callback = &umass_t_cbi_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_STATUS] = {
.type = UE_INTERRUPT,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.flags = {.short_xfer_ok = 1,.no_pipe_ok = 1,},
.bufsize = sizeof(umass_cbi_sbl_t),
.callback = &umass_t_cbi_status_callback,
.timeout = 5000, /* ms */
},
[UMASS_T_CBI_RESET4] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset4_callback,
.timeout = 5000, /* ms */
},
};
/* If device cannot return valid inquiry data, fake it */
static const uint8_t fake_inq_data[SHORT_INQUIRY_LENGTH] = {
0, /* removable */ 0x80, SCSI_REV_2, SCSI_REV_2,
/* additional_length */ 31, 0, 0, 0
};
#define UFI_COMMAND_LENGTH 12 /* UFI commands are always 12 bytes */
#define ATAPI_COMMAND_LENGTH 12 /* ATAPI commands are always 12 bytes */
static devclass_t umass_devclass;
static device_method_t umass_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, umass_probe),
DEVMETHOD(device_attach, umass_attach),
DEVMETHOD(device_detach, umass_detach),
DEVMETHOD_END
};
static driver_t umass_driver = {
.name = "umass",
.methods = umass_methods,
.size = sizeof(struct umass_softc),
};
static const STRUCT_USB_HOST_ID __used umass_devs[] = {
/* generic mass storage class */
{USB_IFACE_CLASS(UICLASS_MASS),},
};
DRIVER_MODULE(umass, uhub, umass_driver, umass_devclass, NULL, 0);
MODULE_DEPEND(umass, usb, 1, 1, 1);
MODULE_DEPEND(umass, cam, 1, 1, 1);
MODULE_VERSION(umass, 1);
USB_PNP_HOST_INFO(umass_devs);
/*
* USB device probe/attach/detach
*/
static uint16_t
umass_get_proto(struct usb_interface *iface)
{
struct usb_interface_descriptor *id;
uint16_t retval;
retval = 0;
/* Check for a standards compliant device */
id = usbd_get_interface_descriptor(iface);
if ((id == NULL) ||
(id->bInterfaceClass != UICLASS_MASS)) {
goto done;
}
switch (id->bInterfaceSubClass) {
case UISUBCLASS_SCSI:
retval |= UMASS_PROTO_SCSI;
break;
case UISUBCLASS_UFI:
retval |= UMASS_PROTO_UFI;
break;
case UISUBCLASS_RBC:
retval |= UMASS_PROTO_RBC;
break;
case UISUBCLASS_SFF8020I:
case UISUBCLASS_SFF8070I:
retval |= UMASS_PROTO_ATAPI;
break;
default:
goto done;
}
switch (id->bInterfaceProtocol) {
case UIPROTO_MASS_CBI:
retval |= UMASS_PROTO_CBI;
break;
case UIPROTO_MASS_CBI_I:
retval |= UMASS_PROTO_CBI_I;
break;
case UIPROTO_MASS_BBB_OLD:
case UIPROTO_MASS_BBB:
retval |= UMASS_PROTO_BBB;
break;
default:
goto done;
}
done:
return (retval);
}
/*
* Match the device we are seeing with the devices supported.
*/
static struct umass_probe_proto
umass_probe_proto(device_t dev, struct usb_attach_arg *uaa)
{
struct umass_probe_proto ret;
uint32_t quirks = NO_QUIRKS;
uint32_t proto = umass_get_proto(uaa->iface);
memset(&ret, 0, sizeof(ret));
ret.error = BUS_PROBE_GENERIC;
/* Search for protocol enforcement */
if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_BBB)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_BBB;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI_I)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI_I;
}
if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_SCSI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_SCSI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_ATAPI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_ATAPI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_UFI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_UFI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_RBC)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_RBC;
}
/* Check if the protocol is invalid */
if ((proto & UMASS_PROTO_COMMAND) == 0) {
ret.error = ENXIO;
goto done;
}
if ((proto & UMASS_PROTO_WIRE) == 0) {
ret.error = ENXIO;
goto done;
}
/* Search for quirks */
if (usb_test_quirk(uaa, UQ_MSC_NO_TEST_UNIT_READY))
quirks |= NO_TEST_UNIT_READY;
if (usb_test_quirk(uaa, UQ_MSC_NO_RS_CLEAR_UA))
quirks |= RS_NO_CLEAR_UA;
if (usb_test_quirk(uaa, UQ_MSC_NO_START_STOP))
quirks |= NO_START_STOP;
if (usb_test_quirk(uaa, UQ_MSC_NO_GETMAXLUN))
quirks |= NO_GETMAXLUN;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY))
quirks |= NO_INQUIRY;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY_EVPD))
quirks |= NO_INQUIRY_EVPD;
if (usb_test_quirk(uaa, UQ_MSC_NO_PREVENT_ALLOW))
quirks |= NO_PREVENT_ALLOW;
if (usb_test_quirk(uaa, UQ_MSC_NO_SYNC_CACHE))
quirks |= NO_SYNCHRONIZE_CACHE;
if (usb_test_quirk(uaa, UQ_MSC_SHUTTLE_INIT))
quirks |= SHUTTLE_INIT;
if (usb_test_quirk(uaa, UQ_MSC_ALT_IFACE_1))
quirks |= ALT_IFACE_1;
if (usb_test_quirk(uaa, UQ_MSC_FLOPPY_SPEED))
quirks |= FLOPPY_SPEED;
if (usb_test_quirk(uaa, UQ_MSC_IGNORE_RESIDUE))
quirks |= IGNORE_RESIDUE;
if (usb_test_quirk(uaa, UQ_MSC_WRONG_CSWSIG))
quirks |= WRONG_CSWSIG;
if (usb_test_quirk(uaa, UQ_MSC_RBC_PAD_TO_12))
quirks |= RBC_PAD_TO_12;
if (usb_test_quirk(uaa, UQ_MSC_READ_CAP_OFFBY1))
quirks |= READ_CAPACITY_OFFBY1;
if (usb_test_quirk(uaa, UQ_MSC_FORCE_SHORT_INQ))
quirks |= FORCE_SHORT_INQUIRY;
done:
ret.quirks = quirks;
ret.proto = proto;
return (ret);
}
static int
umass_probe(device_t dev)
{
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp;
if (uaa->usb_mode != USB_MODE_HOST) {
return (ENXIO);
}
temp = umass_probe_proto(dev, uaa);
return (temp.error);
}
static int
umass_attach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp = umass_probe_proto(dev, uaa);
struct usb_interface_descriptor *id;
int err;
/*
* NOTE: the softc struct is cleared in device_set_driver.
* We can safely call umass_detach without specifically
* initializing the struct.
*/
sc->sc_dev = dev;
sc->sc_udev = uaa->device;
sc->sc_proto = temp.proto;
sc->sc_quirks = temp.quirks;
sc->sc_unit = device_get_unit(dev);
snprintf(sc->sc_name, sizeof(sc->sc_name),
"%s", device_get_nameunit(dev));
device_set_usb_desc(dev);
mtx_init(&sc->sc_mtx, device_get_nameunit(dev),
NULL, MTX_DEF | MTX_RECURSE);
/* get interface index */
id = usbd_get_interface_descriptor(uaa->iface);
if (id == NULL) {
device_printf(dev, "failed to get "
"interface number\n");
goto detach;
}
sc->sc_iface_no = id->bInterfaceNumber;
#ifdef USB_DEBUG
device_printf(dev, " ");
switch (sc->sc_proto & UMASS_PROTO_COMMAND) {
case UMASS_PROTO_SCSI:
printf("SCSI");
break;
case UMASS_PROTO_ATAPI:
printf("8070i (ATAPI)");
break;
case UMASS_PROTO_UFI:
printf("UFI");
break;
case UMASS_PROTO_RBC:
printf("RBC");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_COMMAND);
break;
}
printf(" over ");
switch (sc->sc_proto & UMASS_PROTO_WIRE) {
case UMASS_PROTO_BBB:
printf("Bulk-Only");
break;
case UMASS_PROTO_CBI: /* uses Command/Bulk pipes */
printf("CBI");
break;
case UMASS_PROTO_CBI_I: /* uses Command/Bulk/Interrupt pipes */
printf("CBI with CCI");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_WIRE);
}
printf("; quirks = 0x%04x\n", sc->sc_quirks);
#endif
if (sc->sc_quirks & ALT_IFACE_1) {
err = usbd_set_alt_interface_index
(uaa->device, uaa->info.bIfaceIndex, 1);
if (err) {
DPRINTF(sc, UDMASS_USB, "could not switch to "
"Alt Interface 1\n");
goto detach;
}
}
/* allocate all required USB transfers */
if (sc->sc_proto & UMASS_PROTO_BBB) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_bbb_config,
UMASS_T_BBB_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
} else if (sc->sc_proto & (UMASS_PROTO_CBI | UMASS_PROTO_CBI_I)) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_cbi_config,
UMASS_T_CBI_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
} else {
err = USB_ERR_INVAL;
}
if (err) {
device_printf(dev, "could not setup required "
"transfers, %s\n", usbd_errstr(err));
goto detach;
}
#ifdef USB_DEBUG
if (umass_throttle > 0) {
uint8_t x;
int iv;
iv = umass_throttle;
if (iv < 1)
iv = 1;
else if (iv > 8000)
iv = 8000;
for (x = 0; x != UMASS_T_MAX; x++) {
if (sc->sc_xfer[x] != NULL)
usbd_xfer_set_interval(sc->sc_xfer[x], iv);
}
}
#endif
sc->sc_transform =
(sc->sc_proto & UMASS_PROTO_SCSI) ? &umass_scsi_transform :
(sc->sc_proto & UMASS_PROTO_UFI) ? &umass_ufi_transform :
(sc->sc_proto & UMASS_PROTO_ATAPI) ? &umass_atapi_transform :
(sc->sc_proto & UMASS_PROTO_RBC) ? &umass_rbc_transform :
&umass_no_transform;
/* from here onwards the device can be used. */
if (sc->sc_quirks & SHUTTLE_INIT) {
umass_init_shuttle(sc);
}
/* get the maximum LUN supported by the device */
if (((sc->sc_proto & UMASS_PROTO_WIRE) == UMASS_PROTO_BBB) &&
!(sc->sc_quirks & NO_GETMAXLUN))
sc->sc_maxlun = umass_bbb_get_max_lun(sc);
else
sc->sc_maxlun = 0;
/* Prepare the SCSI command block */
sc->cam_scsi_sense.opcode = REQUEST_SENSE;
sc->cam_scsi_test_unit_ready.opcode = TEST_UNIT_READY;
/* register the SIM */
err = umass_cam_attach_sim(sc);
if (err) {
goto detach;
}
/* scan the SIM */
umass_cam_attach(sc);
DPRINTF(sc, UDMASS_GEN, "Attach finished\n");
return (0); /* success */
detach:
umass_detach(dev);
return (ENXIO); /* failure */
}
static int
umass_detach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
DPRINTF(sc, UDMASS_USB, "\n");
/* tear down our state machine */
usbd_transfer_unsetup(sc->sc_xfer, UMASS_T_MAX);
mtx_lock(&sc->sc_mtx);
/* cancel any leftover CCB's */
umass_cancel_ccb(sc);
umass_cam_detach_sim(sc);
mtx_unlock(&sc->sc_mtx);
mtx_destroy(&sc->sc_mtx);
return (0); /* success */
}
static void
umass_init_shuttle(struct umass_softc *sc)
{
struct usb_device_request req;
- usb_error_t err;
uint8_t status[2] = {0, 0};
/*
* The Linux driver does this, but no one can tell us what the
* command does.
*/
req.bmRequestType = UT_READ_VENDOR_DEVICE;
req.bRequest = 1; /* XXX unknown command */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, sizeof(status));
- err = usbd_do_request(sc->sc_udev, NULL, &req, &status);
+ usbd_do_request(sc->sc_udev, NULL, &req, &status);
DPRINTF(sc, UDMASS_GEN, "Shuttle init returned 0x%02x%02x\n",
status[0], status[1]);
}
/*
* Generic functions to handle transfers
*/
static void
umass_transfer_start(struct umass_softc *sc, uint8_t xfer_index)
{
DPRINTF(sc, UDMASS_GEN, "transfer index = "
"%d\n", xfer_index);
if (sc->sc_xfer[xfer_index]) {
sc->sc_last_xfer_index = xfer_index;
usbd_transfer_start(sc->sc_xfer[xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
static void
umass_reset(struct umass_softc *sc)
{
DPRINTF(sc, UDMASS_GEN, "resetting device\n");
/*
* stop the last transfer, if not already stopped:
*/
usbd_transfer_stop(sc->sc_xfer[sc->sc_last_xfer_index]);
umass_transfer_start(sc, 0);
}
static void
umass_cancel_ccb(struct umass_softc *sc)
{
union ccb *ccb;
USB_MTX_ASSERT(&sc->sc_mtx, MA_OWNED);
ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = 0;
if (ccb) {
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_WIRE_FAILED);
}
}
static void
umass_tr_error(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
if (error != USB_ERR_CANCELLED) {
DPRINTF(sc, UDMASS_GEN, "transfer error, %s -> "
"reset\n", usbd_errstr(error));
}
umass_cancel_ccb(sc);
}
/*
* BBB protocol specific functions
*/
static void
umass_t_bbb_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_BBB_RESET2);
return;
case USB_ST_SETUP:
/*
* Reset recovery (5.3.4 in Universal Serial Bus Mass Storage Class)
*
* For Reset Recovery the host shall issue in the following order:
* a) a Bulk-Only Mass Storage Reset
* b) a Clear Feature HALT to the Bulk-In endpoint
* c) a Clear Feature HALT to the Bulk-Out endpoint
*
* This is done in 3 steps, using 3 transfers:
* UMASS_T_BBB_RESET1
* UMASS_T_BBB_RESET2
* UMASS_T_BBB_RESET3
*/
DPRINTF(sc, UDMASS_BBB, "BBB reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_BBB_RESET; /* bulk only reset */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 0);
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frames(xfer, 1);
usbd_transfer_submit(xfer);
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
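/*
 * Hedged sketch (illustration only, not part of the original source):
 * step a) of the Reset Recovery sequence described in
 * umass_t_bbb_reset1_callback() above, issued as a single synchronous
 * control request instead of through the transfer state machine.  The
 * helper name is hypothetical.
 */
static usb_error_t
example_umass_bbb_reset(struct umass_softc *sc)
{
	struct usb_device_request req;

	req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
	req.bRequest = UR_BBB_RESET;	/* Bulk-Only Mass Storage Reset */
	USETW(req.wValue, 0);
	req.wIndex[0] = sc->sc_iface_no;
	req.wIndex[1] = 0;
	USETW(req.wLength, 0);

	return (usbd_do_request(sc->sc_udev, NULL, &req, NULL));
}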
static void
umass_t_bbb_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_RESET3,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_COMMAND,
UMASS_T_BBB_DATA_WRITE, error);
}
static void
umass_t_bbb_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
umass_transfer_start(sc, next_xfer);
return;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred;
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t tag;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start
(sc, ((sc->sc_transfer.dir == DIR_IN) ? UMASS_T_BBB_DATA_READ :
(sc->sc_transfer.dir == DIR_OUT) ? UMASS_T_BBB_DATA_WRITE :
UMASS_T_BBB_STATUS));
return;
case USB_ST_SETUP:
sc->sc_status_try = 0;
if (ccb) {
/*
* the initial value is not important,
* as long as the values are unique:
*/
tag = UGETDW(sc->cbw.dCBWTag) + 1;
USETDW(sc->cbw.dCBWSignature, CBWSIGNATURE);
USETDW(sc->cbw.dCBWTag, tag);
/*
* dCBWDataTransferLength:
* This field indicates the number of bytes of data that the host
* intends to transfer on the IN or OUT Bulk endpoint(as indicated by
* the Direction bit) during the execution of this command. If this
* field is set to 0, the device will expect that no data will be
* transferred IN or OUT during this command, regardless of the value
* of the Direction bit defined in dCBWFlags.
*/
USETDW(sc->cbw.dCBWDataTransferLength, sc->sc_transfer.data_len);
/*
* dCBWFlags:
* The bits of the Flags field are defined as follows:
* Bits 0-6 reserved
* Bit 7 Direction - this bit shall be ignored if the
* dCBWDataTransferLength field is zero.
* 0 = data Out from host to device
* 1 = data In from device to host
*/
sc->cbw.bCBWFlags = ((sc->sc_transfer.dir == DIR_IN) ?
CBWFLAGS_IN : CBWFLAGS_OUT);
sc->cbw.bCBWLUN = sc->sc_transfer.lun;
if (sc->sc_transfer.cmd_len > sizeof(sc->cbw.CBWCDB)) {
sc->sc_transfer.cmd_len = sizeof(sc->cbw.CBWCDB);
DPRINTF(sc, UDMASS_BBB, "Truncating long command!\n");
}
sc->cbw.bCDBLength = sc->sc_transfer.cmd_len;
/* copy SCSI command data */
memcpy(sc->cbw.CBWCDB, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
/* clear remaining command area */
memset(sc->cbw.CBWCDB +
sc->sc_transfer.cmd_len, 0,
sizeof(sc->cbw.CBWCDB) -
sc->sc_transfer.cmd_len);
DIF(UDMASS_BBB, umass_bbb_dump_cbw(sc, &sc->cbw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &sc->cbw, sizeof(sc->cbw));
usbd_xfer_set_frame_len(xfer, 0, sizeof(sc->cbw));
usbd_transfer_submit(xfer);
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
static void
umass_t_bbb_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_WR_CS);
}
return;
}
}
static void
umass_t_bbb_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_WRITE, error);
}
static void
umass_t_bbb_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
/*
* Do a full reset if there is something wrong with the CSW:
*/
sc->sc_status_try = 1;
/* Zero missing parts of the CSW: */
if (actlen < (int)sizeof(sc->csw))
memset(&sc->csw, 0, sizeof(sc->csw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->csw, actlen);
DIF(UDMASS_BBB, umass_bbb_dump_csw(sc, &sc->csw));
residue = UGETDW(sc->csw.dCSWDataResidue);
if ((!residue) || (sc->sc_quirks & IGNORE_RESIDUE)) {
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
}
if (residue > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "truncating residue from %d "
"to %d bytes\n", residue, sc->sc_transfer.data_len);
residue = sc->sc_transfer.data_len;
}
/* translate weird command-status signatures: */
if (sc->sc_quirks & WRONG_CSWSIG) {
uint32_t temp = UGETDW(sc->csw.dCSWSignature);
if ((temp == CSWSIGNATURE_OLYMPUS_C1) ||
(temp == CSWSIGNATURE_IMAGINATION_DBX1)) {
USETDW(sc->csw.dCSWSignature, CSWSIGNATURE);
}
}
/* check CSW and handle eventual error */
if (UGETDW(sc->csw.dCSWSignature) != CSWSIGNATURE) {
DPRINTF(sc, UDMASS_BBB, "bad CSW signature 0x%08x != 0x%08x\n",
UGETDW(sc->csw.dCSWSignature), CSWSIGNATURE);
/*
* Invalid CSW: Wrong signature or wrong tag might
* indicate that we lost synchronization. Reset the
* device.
*/
goto tr_error;
} else if (UGETDW(sc->csw.dCSWTag) != UGETDW(sc->cbw.dCBWTag)) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: tag 0x%08x should be "
"0x%08x\n", UGETDW(sc->csw.dCSWTag),
UGETDW(sc->cbw.dCBWTag));
goto tr_error;
} else if (sc->csw.bCSWStatus > CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: status %d > %d\n",
sc->csw.bCSWStatus, CSWSTATUS_PHASE);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Phase error, residue = "
"%d\n", residue);
goto tr_error;
} else if (sc->sc_transfer.actlen > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "Buffer overrun %d > %d\n",
sc->sc_transfer.actlen, sc->sc_transfer.data_len);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_FAILED) {
DPRINTF(sc, UDMASS_BBB, "Command failed, residue = "
"%d\n", residue);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_FAILED);
} else {
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_OK);
}
return;
case USB_ST_SETUP:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
return;
default:
tr_error:
DPRINTF(sc, UDMASS_BBB, "Failed to read CSW: %s, try %d\n",
usbd_errstr(error), sc->sc_status_try);
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_status_try)) {
umass_tr_error(xfer, error);
} else {
sc->sc_status_try = 1;
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
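/*
 * Illustrative sketch (not part of this driver): the sanity checks the
 * status callback above applies to a received 13-byte Command Status
 * Wrapper, collected into one hypothetical helper.  Constants follow
 * the Bulk-Only Transport spec; multi-byte fields are little-endian on
 * the wire (assumed equal to host order here for brevity).
 */
#include <stdint.h>

#define EX_CSWSIGNATURE		0x53425355u	/* "USBS" */
#define EX_CSWSTATUS_GOOD	0
#define EX_CSWSTATUS_FAILED	1
#define EX_CSWSTATUS_PHASE	2

struct bbb_csw_example {
	uint32_t dCSWSignature;
	uint32_t dCSWTag;
	uint32_t dCSWDataResidue;
	uint8_t  bCSWStatus;
} __attribute__((packed));

/* Returns 0 when the CSW is usable, -1 when reset recovery is needed. */
static int
bbb_csw_check_example(const struct bbb_csw_example *csw, uint32_t sent_tag)
{
	if (csw->dCSWSignature != EX_CSWSIGNATURE)
		return (-1);	/* lost synchronization with the device */
	if (csw->dCSWTag != sent_tag)
		return (-1);	/* reply does not belong to our CBW */
	if (csw->bCSWStatus > EX_CSWSTATUS_PHASE)
		return (-1);	/* reserved status value */
	if (csw->bCSWStatus == EX_CSWSTATUS_PHASE)
		return (-1);	/* phase error: device asks for a reset */
	return (0);		/* GOOD or FAILED: residue and status are valid */
}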
static void
umass_command_start(struct umass_softc *sc, uint8_t dir,
void *data_ptr, uint32_t data_len,
uint32_t data_timeout, umass_callback_t *callback,
union ccb *ccb)
{
sc->sc_transfer.lun = ccb->ccb_h.target_lun;
/*
* NOTE: assumes that "sc->sc_transfer.cmd_data" and
* "sc->sc_transfer.cmd_len" has been properly
* initialized.
*/
sc->sc_transfer.dir = data_len ? dir : DIR_NONE;
sc->sc_transfer.data_ptr = data_ptr;
sc->sc_transfer.data_len = data_len;
sc->sc_transfer.data_rem = data_len;
sc->sc_transfer.data_timeout = (data_timeout + UMASS_TIMEOUT);
sc->sc_transfer.actlen = 0;
sc->sc_transfer.callback = callback;
sc->sc_transfer.ccb = ccb;
if (sc->sc_xfer[sc->sc_last_xfer_index]) {
usbd_transfer_start(sc->sc_xfer[sc->sc_last_xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
static uint8_t
umass_bbb_get_max_lun(struct umass_softc *sc)
{
struct usb_device_request req;
usb_error_t err;
uint8_t buf = 0;
/* The Get Max Lun command is a class-specific request. */
req.bmRequestType = UT_READ_CLASS_INTERFACE;
req.bRequest = UR_BBB_GET_MAX_LUN;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 1);
err = usbd_do_request(sc->sc_udev, NULL, &req, &buf);
if (err) {
buf = 0;
/* Device doesn't support Get Max Lun request. */
printf("%s: Get Max Lun not supported (%s)\n",
sc->sc_name, usbd_errstr(err));
}
return (buf);
}
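/*
 * Illustrative sketch (not part of this driver): the raw 8-byte SETUP
 * packet that the Get Max LUN request above puts on the wire.  The
 * struct and function names are hypothetical; the constants come from
 * the Bulk-Only Transport spec (bmRequestType 0xa1, bRequest 0xfe).
 */
#include <stdint.h>

struct usb_setup_pkt_example {
	uint8_t  bmRequestType;
	uint8_t  bRequest;
	uint16_t wValue;	/* little-endian on the wire */
	uint16_t wIndex;
	uint16_t wLength;
} __attribute__((packed));

static struct usb_setup_pkt_example
bbb_get_max_lun_setup_example(uint8_t iface_no)
{
	struct usb_setup_pkt_example req = {
		.bmRequestType = 0xa1,	/* device-to-host | class | interface */
		.bRequest      = 0xfe,	/* Get Max LUN */
		.wValue        = 0,
		.wIndex        = iface_no,
		.wLength       = 1,	/* device answers with one byte: highest LUN */
	};
	/* A STALL in response means only LUN 0, which the code above maps to 0. */
	return (req);
}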
/*
* Command/Bulk/Interrupt (CBI) specific functions
*/
static void
umass_cbi_start_status(struct umass_softc *sc)
{
if (sc->sc_xfer[UMASS_T_CBI_STATUS]) {
umass_transfer_start(sc, UMASS_T_CBI_STATUS);
} else {
union ccb *ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_CMD_UNKNOWN);
}
}
static void
umass_t_cbi_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
uint8_t buf[UMASS_CBI_DIAGNOSTIC_CMDLEN];
uint8_t i;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
case USB_ST_SETUP:
/*
* Command Block Reset Protocol
*
* First send a reset request to the device. Then clear
* any possibly stalled bulk endpoints.
*
* This is done in up to four steps, using one transfer each:
* UMASS_T_CBI_RESET1
* UMASS_T_CBI_RESET2
* UMASS_T_CBI_RESET3
* UMASS_T_CBI_RESET4 (only if there is an interrupt endpoint)
*/
DPRINTF(sc, UDMASS_CBI, "CBI reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, UMASS_CBI_DIAGNOSTIC_CMDLEN);
/*
* The 0x1d opcode is shared with the SEND DIAGNOSTIC command. To
* distinguish this Command Block Reset from a regular SEND
* DIAGNOSTIC, the last 10 bytes of the command block are filled
* with 0xff (section 2.2 of the CBI specification).
*/
buf[0] = 0x1d; /* Command Block Reset */
buf[1] = 0x04;
for (i = 2; i < UMASS_CBI_DIAGNOSTIC_CMDLEN; i++) {
buf[i] = 0xff;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, buf, sizeof(buf));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sizeof(buf));
usbd_xfer_set_frames(xfer, 2);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if (error == USB_ERR_CANCELLED)
umass_tr_error(xfer, error);
else
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
}
}
static void
umass_t_cbi_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_RESET3,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
umass_t_cbi_data_clear_stall_callback
(xfer, (sc->sc_xfer[UMASS_T_CBI_RESET4] &&
sc->sc_xfer[UMASS_T_CBI_STATUS]) ?
UMASS_T_CBI_RESET4 : UMASS_T_CBI_COMMAND,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_reset4_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_COMMAND,
UMASS_T_CBI_STATUS, error);
}
static void
umass_t_cbi_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
if (next_xfer == UMASS_T_CBI_STATUS) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start(sc, next_xfer);
}
break;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred; /* should not happen */
}
break;
default: /* Error */
umass_tr_error(xfer, error);
break;
}
}
static void
umass_t_cbi_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (sc->sc_transfer.dir == DIR_NONE) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start
(sc, (sc->sc_transfer.dir == DIR_IN) ?
UMASS_T_CBI_DATA_READ : UMASS_T_CBI_DATA_WRITE);
}
break;
case USB_ST_SETUP:
if (ccb) {
/*
* do a CBI transfer with cmd_len bytes from
* cmd_data, possibly a data phase of data_len
* bytes from/to the device and finally a status
* read phase.
*/
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
req.wLength[0] = sc->sc_transfer.cmd_len;
req.wLength[1] = 0;
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sc->sc_transfer.cmd_len);
usbd_xfer_set_frames(xfer,
sc->sc_transfer.cmd_len ? 2 : 1);
DIF(UDMASS_CBI,
umass_cbi_dump_cmd(sc,
sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len));
usbd_transfer_submit(xfer);
}
break;
default: /* Error */
/*
* A STALL on the control pipe can be the result of a command error.
* Attempting to clear that STALL, as is done for the bulk pipes, also
* produces a command completion interrupt, but the ASC/ASCQ values
* reported there are not always valid, so don't bother with it.
*/
if ((error == USB_ERR_STALLED) ||
(sc->sc_transfer.callback == &umass_cam_cb)) {
sc->sc_transfer.ccb = NULL;
(sc->sc_transfer.callback)
(sc, ccb, sc->sc_transfer.data_len,
STATUS_CMD_UNKNOWN);
} else {
umass_tr_error(xfer, error);
/* skip reset */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
}
break;
}
}
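/*
 * Illustrative sketch (not part of this driver): the control request
 * that carries a CBI command block, as built in the SETUP case above.
 * ADSC ("Accept Device-Specific Command") is class request 0x00 sent
 * to the interface; the command block itself travels in the data
 * stage, so wLength equals the command length.  Names are
 * hypothetical.
 */
#include <stdint.h>

struct cbi_setup_pkt_example {
	uint8_t  bmRequestType;
	uint8_t  bRequest;
	uint16_t wValue;
	uint16_t wIndex;
	uint16_t wLength;
} __attribute__((packed));

static struct cbi_setup_pkt_example
cbi_adsc_setup_example(uint8_t iface_no, uint8_t cmd_len)
{
	struct cbi_setup_pkt_example req = {
		.bmRequestType = 0x21,	/* host-to-device | class | interface */
		.bRequest      = 0x00,	/* ADSC */
		.wValue        = 0,
		.wIndex        = iface_no,
		.wLength       = cmd_len,	/* command block follows in the data stage */
	};
	return (req);
}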
static void
umass_t_cbi_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_RD_CS);
}
break;
}
}
static void
umass_t_cbi_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_WR_CS);
}
break;
}
}
static void
umass_t_cbi_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
uint8_t status;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (actlen < (int)sizeof(sc->sbl)) {
goto tr_setup;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->sbl, sizeof(sc->sbl));
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
/* dissect the information in the buffer */
if (sc->sc_proto & UMASS_PROTO_UFI) {
/*
* Section 3.4.3.1.3 specifies that the UFI command
* protocol returns an ASC and ASCQ in the interrupt
* data block.
*/
DPRINTF(sc, UDMASS_CBI, "UFI CCI, ASC = 0x%02x, "
"ASCQ = 0x%02x\n", sc->sbl.ufi.asc,
sc->sbl.ufi.ascq);
status = (((sc->sbl.ufi.asc == 0) &&
(sc->sbl.ufi.ascq == 0)) ?
STATUS_CMD_OK : STATUS_CMD_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
} else {
/* Command Interrupt Data Block */
DPRINTF(sc, UDMASS_CBI, "type=0x%02x, value=0x%02x\n",
sc->sbl.common.type, sc->sbl.common.value);
if (sc->sbl.common.type == IDB_TYPE_CCI) {
status = (sc->sbl.common.value & IDB_VALUE_STATUS_MASK);
status = ((status == IDB_VALUE_PASS) ? STATUS_CMD_OK :
(status == IDB_VALUE_FAIL) ? STATUS_CMD_FAILED :
(status == IDB_VALUE_PERSISTENT) ? STATUS_CMD_FAILED :
STATUS_WIRE_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
}
}
/* fallthrough */
case USB_ST_SETUP:
tr_setup:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
break;
default: /* Error */
DPRINTF(sc, UDMASS_CBI, "Failed to read CSW: %s\n",
usbd_errstr(error));
umass_tr_error(xfer, error);
break;
}
}
/*
* CAM specific functions (used by SCSI, UFI, 8070i (ATAPI))
*/
static int
umass_cam_attach_sim(struct umass_softc *sc)
{
struct cam_devq *devq; /* Per device Queue */
/*
* An HBA is attached to the CAM layer.
*
* The CAM layer will then, after a while, start probing for devices
* on the bus. The number of SIMs is limited to one.
*/
devq = cam_simq_alloc(1 /* maximum openings */ );
if (devq == NULL) {
return (ENOMEM);
}
sc->sc_sim = cam_sim_alloc
(&umass_cam_action, &umass_cam_poll,
DEVNAME_SIM,
sc /* priv */ ,
sc->sc_unit /* unit number */ ,
&sc->sc_mtx /* mutex */ ,
1 /* maximum device openings */ ,
0 /* maximum tagged device openings */ ,
devq);
if (sc->sc_sim == NULL) {
cam_simq_free(devq);
return (ENOMEM);
}
mtx_lock(&sc->sc_mtx);
if (xpt_bus_register(sc->sc_sim, sc->sc_dev,
sc->sc_unit) != CAM_SUCCESS) {
mtx_unlock(&sc->sc_mtx);
return (ENOMEM);
}
mtx_unlock(&sc->sc_mtx);
return (0);
}
static void
umass_cam_attach(struct umass_softc *sc)
{
#ifndef USB_DEBUG
if (bootverbose)
#endif
printf("%s:%d:%d: Attached to scbus%d\n",
sc->sc_name, cam_sim_path(sc->sc_sim),
sc->sc_unit, cam_sim_path(sc->sc_sim));
}
/* umass_cam_detach_sim
* detach from the CAM layer
*/
static void
umass_cam_detach_sim(struct umass_softc *sc)
{
if (sc->sc_sim != NULL) {
if (xpt_bus_deregister(cam_sim_path(sc->sc_sim))) {
/* accessing the softc is not possible after this */
sc->sc_sim->softc = NULL;
cam_sim_free(sc->sc_sim, /* free_devq */ TRUE);
} else {
panic("%s: CAM layer is busy\n",
sc->sc_name);
}
sc->sc_sim = NULL;
}
}
/* umass_cam_action
* CAM requests for action come through here
*/
static void
umass_cam_action(struct cam_sim *sim, union ccb *ccb)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
xpt_done(ccb);
return;
}
/* Perform the requested action */
switch (ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
{
uint8_t *cmd;
uint8_t dir;
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"cmd: 0x%02x, flags: 0x%02x, "
"%db cmd/%db data/%db sense\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, cmd[0],
ccb->ccb_h.flags & CAM_DIR_MASK, ccb->csio.cdb_len,
ccb->csio.dxfer_len, ccb->csio.sense_len);
if (sc->sc_transfer.ccb) {
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"I/O in progress, deferring\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_SCSI_BUSY;
xpt_done(ccb);
goto done;
}
switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
case CAM_DIR_IN:
dir = DIR_IN;
break;
case CAM_DIR_OUT:
dir = DIR_OUT;
DIF(UDMASS_SCSI,
umass_dump_buffer(sc, ccb->csio.data_ptr,
ccb->csio.dxfer_len, 48));
break;
default:
dir = DIR_NONE;
}
ccb->ccb_h.status = CAM_REQ_INPROG | CAM_SIM_QUEUED;
/*
* sc->sc_transform will convert the command to the
* command format needed by the specific command set
* and return the converted command in
* "sc->sc_transfer.cmd_data"
*/
if (umass_std_transform(sc, ccb, cmd, ccb->csio.cdb_len)) {
if (sc->sc_transfer.cmd_data[0] == INQUIRY) {
const char *pserial;
pserial = usb_get_serial(sc->sc_udev);
/*
* Umass devices don't generally report their serial numbers
* in the usual SCSI way. Emulate it here.
*/
if ((sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
(sc->sc_transfer.cmd_data[2] == SVPD_UNIT_SERIAL_NUMBER) &&
(pserial[0] != '\0')) {
struct scsi_vpd_unit_serial_number *vpd_serial;
vpd_serial = (struct scsi_vpd_unit_serial_number *)ccb->csio.data_ptr;
vpd_serial->length = strlen(pserial);
if (vpd_serial->length > sizeof(vpd_serial->serial_num))
vpd_serial->length = sizeof(vpd_serial->serial_num);
memcpy(vpd_serial->serial_num, pserial, vpd_serial->length);
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
/*
* Handle EVPD inquiry for broken devices first
* NO_INQUIRY also implies NO_INQUIRY_EVPD
*/
if ((sc->sc_quirks & (NO_INQUIRY_EVPD | NO_INQUIRY)) &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD)) {
scsi_set_sense_data(&ccb->csio.sense_data,
/*sense_format*/ SSD_TYPE_NONE,
/*current_error*/ 1,
/*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
/*asc*/ 0x24,
/*ascq*/ 0x00,
/*extra args*/ SSD_ELEM_NONE);
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
ccb->ccb_h.status =
CAM_SCSI_STATUS_ERROR |
CAM_AUTOSNS_VALID |
CAM_DEV_QFRZN;
xpt_freeze_devq(ccb->ccb_h.path, 1);
xpt_done(ccb);
goto done;
}
/*
* Return fake inquiry data for
* broken devices
*/
if (sc->sc_quirks & NO_INQUIRY) {
memcpy(ccb->csio.data_ptr, &fake_inq_data,
sizeof(fake_inq_data));
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
ccb->csio.dxfer_len = SHORT_INQUIRY_LENGTH;
}
} else if (sc->sc_transfer.cmd_data[0] == PREVENT_ALLOW) {
if (sc->sc_quirks & NO_PREVENT_ALLOW) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
} else if (sc->sc_transfer.cmd_data[0] == SYNCHRONIZE_CACHE) {
if (sc->sc_quirks & NO_SYNCHRONIZE_CACHE) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
}
umass_command_start(sc, dir, ccb->csio.data_ptr,
ccb->csio.dxfer_len,
ccb->ccb_h.timeout,
&umass_cam_cb, ccb);
}
break;
}
case XPT_PATH_INQ:
{
struct ccb_pathinq *cpi = &ccb->cpi;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_PATH_INQ:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
/* host specific information */
cpi->version_num = 1;
cpi->hba_inquiry = 0;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_NO_6_BYTE;
cpi->hba_eng_cnt = 0;
cpi->max_target = UMASS_SCSIID_MAX; /* one target */
cpi->initiator_id = UMASS_SCSIID_HOST;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "USB SCSI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = sc->sc_unit;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_2;
cpi->transport = XPORT_USB;
cpi->transport_version = 0;
if (sc == NULL) {
cpi->base_transfer_speed = 0;
cpi->max_lun = 0;
} else {
if (sc->sc_quirks & FLOPPY_SPEED) {
cpi->base_transfer_speed =
UMASS_FLOPPY_TRANSFER_SPEED;
} else {
switch (usbd_get_speed(sc->sc_udev)) {
case USB_SPEED_SUPER:
cpi->base_transfer_speed =
UMASS_SUPER_TRANSFER_SPEED;
cpi->maxio = MAXPHYS;
break;
case USB_SPEED_HIGH:
cpi->base_transfer_speed =
UMASS_HIGH_TRANSFER_SPEED;
break;
default:
cpi->base_transfer_speed =
UMASS_FULL_TRANSFER_SPEED;
break;
}
}
cpi->max_lun = sc->sc_maxlun;
}
cpi->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_RESET_DEV:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_RESET_DEV:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
umass_reset(sc);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_GET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_2;
cts->transport = XPORT_USB;
cts->transport_version = 0;
cts->xport_specific.valid = 0;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_SET_TRAN_SETTINGS:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
case XPT_CALC_GEOMETRY:
{
cam_calc_geometry(&ccb->ccg, /* extended */ 1);
xpt_done(ccb);
break;
}
case XPT_NOOP:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_NOOP:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:func_code 0x%04x: "
"Not implemented\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, ccb->ccb_h.func_code);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
done:
return;
}
static void
umass_cam_poll(struct cam_sim *sim)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL)
return;
DPRINTF(sc, UDMASS_SCSI, "CAM poll\n");
usbd_transfer_poll(sc->sc_xfer, UMASS_T_MAX);
}
/* umass_cam_cb
* finalise a completed CAM command
*/
static void
umass_cam_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
ccb->csio.resid = residue;
switch (status) {
case STATUS_CMD_OK:
ccb->ccb_h.status = CAM_REQ_CMP;
if ((sc->sc_quirks & READ_CAPACITY_OFFBY1) &&
(ccb->ccb_h.func_code == XPT_SCSI_IO) &&
(ccb->csio.cdb_io.cdb_bytes[0] == READ_CAPACITY)) {
struct scsi_read_capacity_data *rcap;
uint32_t maxsector;
rcap = (void *)(ccb->csio.data_ptr);
maxsector = scsi_4btoul(rcap->addr) - 1;
scsi_ulto4b(maxsector, rcap->addr);
}
/*
* We have to add SVPD_UNIT_SERIAL_NUMBER to the list
* of pages supported by the device - otherwise, CAM
* will never ask us for the serial number if the
* device cannot handle that by itself.
*/
if (ccb->ccb_h.func_code == XPT_SCSI_IO &&
sc->sc_transfer.cmd_data[0] == INQUIRY &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
sc->sc_transfer.cmd_data[2] == SVPD_SUPPORTED_PAGE_LIST &&
(usb_get_serial(sc->sc_udev)[0] != '\0')) {
struct ccb_scsiio *csio;
struct scsi_vpd_supported_page_list *page_list;
csio = &ccb->csio;
page_list = (struct scsi_vpd_supported_page_list *)csio->data_ptr;
if (page_list->length + 1 < SVPD_SUPPORTED_PAGES_SIZE) {
page_list->list[page_list->length] = SVPD_UNIT_SERIAL_NUMBER;
page_list->length++;
}
}
xpt_done(ccb);
break;
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED:
/* fetch sense data */
/* the rest of the command was filled in at attach */
sc->cam_scsi_sense.length = ccb->csio.sense_len;
DPRINTF(sc, UDMASS_SCSI, "Fetching %d bytes of "
"sense data\n", ccb->csio.sense_len);
if (umass_std_transform(sc, ccb, &sc->cam_scsi_sense.opcode,
sizeof(sc->cam_scsi_sense))) {
if ((sc->sc_quirks & FORCE_SHORT_INQUIRY) &&
(sc->sc_transfer.cmd_data[0] == INQUIRY)) {
ccb->csio.sense_len = SHORT_INQUIRY_LENGTH;
}
umass_command_start(sc, DIR_IN, &ccb->csio.sense_data.error_code,
ccb->csio.sense_len, ccb->ccb_h.timeout,
&umass_cam_sense_cb, ccb);
}
break;
default:
/*
* The wire protocol failed and will hopefully have
* recovered. We return an error to CAM and let CAM
* retry the command if necessary.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
xpt_done(ccb);
break;
}
}
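/*
 * Illustrative sketch (not part of this driver): the READ_CAPACITY_OFFBY1
 * quirk handled above.  READ CAPACITY(10) is defined to return the
 * address of the last logical block (big-endian), but some devices
 * report the block count instead, so the driver subtracts one.
 * Helper names are hypothetical.
 */
#include <stdint.h>

static uint32_t
be32_get_example(const uint8_t b[4])
{
	return ((uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
	    (uint32_t)b[2] << 8 | (uint32_t)b[3]);
}

static void
be32_put_example(uint8_t b[4], uint32_t v)
{
	b[0] = v >> 24; b[1] = v >> 16; b[2] = v >> 8; b[3] = v;
}

static void
fix_offby1_capacity_example(uint8_t rcap_data[8])
{
	/* bytes 0..3: last LBA, bytes 4..7: block length (left untouched) */
	be32_put_example(rcap_data, be32_get_example(rcap_data) - 1);
}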
/*
* Finalise a completed autosense operation
*/
static void
umass_cam_sense_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
uint8_t *cmd;
switch (status) {
case STATUS_CMD_OK:
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED: {
int key, sense_len;
ccb->csio.sense_resid = residue;
sense_len = ccb->csio.sense_len - ccb->csio.sense_resid;
key = scsi_get_sense_key(&ccb->csio.sense_data, sense_len,
/*show_errors*/ 1);
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
/*
* Getting sense data always succeeds (apart from wire
* failures):
*/
if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == INQUIRY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Ignore unit attention errors in the case where
* the Unit Attention state is not cleared on
* REQUEST SENSE. They will appear again at the next
* command.
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if (key == SSD_KEY_NO_SENSE) {
/*
* No problem after all (in the case of CBI without
* CCI)
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == READ_CAPACITY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Some devices do not clear the unit attention error
* on request sense. We insert a test unit ready
* command to make sure we clear the unit attention
* condition, then allow the retry to proceed as
* usual.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
#if 0
DELAY(300000);
#endif
DPRINTF(sc, UDMASS_SCSI, "Doing a sneaky "
"TEST_UNIT_READY\n");
/* the rest of the command was filled in at attach */
if ((sc->sc_transform)(sc,
&sc->cam_scsi_test_unit_ready.opcode,
sizeof(sc->cam_scsi_test_unit_ready)) == 1) {
umass_command_start(sc, DIR_NONE, NULL, 0,
ccb->ccb_h.timeout,
&umass_cam_quirk_cb, ccb);
break;
}
} else {
xpt_freeze_devq(ccb->ccb_h.path, 1);
if (key >= 0) {
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL
| CAM_DEV_QFRZN;
}
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "Autosense failed, "
"status %d\n", status);
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
}
}
/*
* This completion code just handles the fact that we sent a test-unit-ready
* after having previously failed a READ CAPACITY with CHECK_COND. The CCB
* status for CAM is already set earlier.
*/
static void
umass_cam_quirk_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
DPRINTF(sc, UDMASS_SCSI, "Test unit ready "
"returned status %d\n", status);
xpt_done(ccb);
}
/*
* SCSI specific functions
*/
static uint8_t
umass_scsi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
sc->sc_transfer.cmd_len = cmd_len;
switch (cmd_ptr[0]) {
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
memset(sc->sc_transfer.cmd_data, 0, cmd_len);
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1);
}
static uint8_t
umass_rbc_transform(struct umass_softc *sc, uint8_t *cmd_ptr, uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
switch (cmd_ptr[0]) {
/* these commands are defined in RBC: */
case READ_10:
case READ_CAPACITY:
case START_STOP_UNIT:
case SYNCHRONIZE_CACHE:
case WRITE_10:
case VERIFY_10:
case INQUIRY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case TEST_UNIT_READY:
case WRITE_BUFFER:
/*
* The following commands are not listed in my copy of the
* RBC specs. CAM, however, seems to want them, and at least
* the Sony DSC device appears to support them as well.
*/
case REQUEST_SENSE:
case PREVENT_ALLOW:
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
if ((sc->sc_quirks & RBC_PAD_TO_12) && (cmd_len < 12)) {
memset(sc->sc_transfer.cmd_data + cmd_len,
0, 12 - cmd_len);
cmd_len = 12;
}
sc->sc_transfer.cmd_len = cmd_len;
return (1); /* success */
/* All other commands are not legal in RBC */
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported RBC "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
}
static uint8_t
umass_ufi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* A UFI command is always 12 bytes in length */
sc->sc_transfer.cmd_len = UFI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, UFI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands whose format has been verified. They should work.
* Copy the command into the (zeroed-out) destination buffer.
*/
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
/*
* Some devices do not support this command. Start
* Stop Unit should give the same results.
*/
DPRINTF(sc, UDMASS_UFI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case FORMAT_UNIT:
case INQUIRY:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case WRITE_AND_VERIFY:
case VERIFY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_12:
case WRITE_12:
case READ_FORMAT_CAPACITIES:
break;
/*
* SYNCHRONIZE_CACHE isn't supported by UFI, nor should it be
* required for UFI devices, so it is appropriate to fake
* success.
*/
case SYNCHRONIZE_CACHE:
return (2);
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported UFI "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
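/*
 * Illustrative sketch (not part of this driver): how a short SCSI CDB
 * is widened to the fixed 12-byte UFI command block, which the
 * transform above does with memset() and memcpy().  Names are
 * hypothetical.
 */
#include <stdint.h>
#include <string.h>

#define EX_UFI_CMDLEN	12

static int
ufi_pad_cdb_example(uint8_t dst[EX_UFI_CMDLEN], const uint8_t *cdb,
    uint8_t cdb_len)
{
	if (cdb_len == 0 || cdb_len > EX_UFI_CMDLEN)
		return (-1);			/* rejected, like the driver does */
	memset(dst, 0, EX_UFI_CMDLEN);		/* trailing bytes must be zero */
	memcpy(dst, cdb, cdb_len);		/* e.g. a 6-byte TEST UNIT READY */
	return (0);
}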
/*
* 8070i (ATAPI) specific functions
*/
static uint8_t
umass_atapi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* An ATAPI command is always 12 bytes in length. */
sc->sc_transfer.cmd_len = ATAPI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, ATAPI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands whose format has been verified. They should work.
* Copy the command into the destination buffer.
*/
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case SYNCHRONIZE_CACHE:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_BUFFER:
case 0x42: /* READ_SUBCHANNEL */
case 0x43: /* READ_TOC */
case 0x44: /* READ_HEADER */
case 0x47: /* PLAY_MSF (Play Minute/Second/Frame) */
case 0x48: /* PLAY_TRACK */
case 0x49: /* PLAY_TRACK_REL */
case 0x4b: /* PAUSE */
case 0x51: /* READ_DISK_INFO */
case 0x52: /* READ_TRACK_INFO */
case 0x54: /* SEND_OPC */
case 0x59: /* READ_MASTER_CUE */
case 0x5b: /* CLOSE_TR_SESSION */
case 0x5c: /* READ_BUFFER_CAP */
case 0x5d: /* SEND_CUE_SHEET */
case 0xa1: /* BLANK */
case 0xa5: /* PLAY_12 */
case 0xa6: /* EXCHANGE_MEDIUM */
case 0xad: /* READ_DVD_STRUCTURE */
case 0xbb: /* SET_CD_SPEED */
case 0xe5: /* READ_TRACK_INFO_PHILIPS */
break;
case READ_12:
case WRITE_12:
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported ATAPI "
"command 0x%02x - trying anyway\n",
cmd_ptr[0]);
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
static uint8_t
umass_no_transform(struct umass_softc *sc, uint8_t *cmd,
uint8_t cmdlen)
{
return (0); /* failure */
}
static uint8_t
umass_std_transform(struct umass_softc *sc, union ccb *ccb,
uint8_t *cmd, uint8_t cmdlen)
{
uint8_t retval;
retval = (sc->sc_transform) (sc, cmd, cmdlen);
if (retval == 2) {
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return (0);
} else if (retval == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_INVALID | CAM_DEV_QFRZN;
xpt_done(ccb);
return (0);
}
/* Command should be executed */
return (1);
}
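/*
 * Illustrative sketch (not part of this driver): the three-way return
 * convention used by the transform functions above, written out as a
 * hypothetical enum together with the dispatch that
 * umass_std_transform() performs on it.
 */
enum xform_result_example {
	XFORM_FAIL = 0,		/* reject the CCB (CAM_REQ_INVALID) */
	XFORM_EXECUTE = 1,	/* command was converted; start the transfer */
	XFORM_FAKE_OK = 2	/* complete the CCB without touching the device */
};

/* Returns nonzero when the caller should actually start the transfer. */
static int
xform_dispatch_example(enum xform_result_example r)
{
	switch (r) {
	case XFORM_FAKE_OK:	/* e.g. SYNCHRONIZE_CACHE on UFI */
	case XFORM_FAIL:	/* CCB is completed with an error instead */
		return (0);
	case XFORM_EXECUTE:
	default:
		return (1);
	}
}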
#ifdef USB_DEBUG
static void
umass_bbb_dump_cbw(struct umass_softc *sc, umass_bbb_cbw_t *cbw)
{
uint8_t *c = cbw->CBWCDB;
uint32_t dlen = UGETDW(cbw->dCBWDataTransferLength);
uint32_t tag = UGETDW(cbw->dCBWTag);
uint8_t clen = cbw->bCDBLength;
uint8_t flags = cbw->bCBWFlags;
uint8_t lun = cbw->bCBWLUN;
DPRINTF(sc, UDMASS_BBB, "CBW %d: cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, lun = %d, dir = %s\n",
tag, clen,
c[0], c[1], c[2], c[3], c[4], c[5], (clen > 6 ? "..." : ""),
dlen, lun, (flags == CBWFLAGS_IN ? "in" :
(flags == CBWFLAGS_OUT ? "out" : "<invalid>")));
}
static void
umass_bbb_dump_csw(struct umass_softc *sc, umass_bbb_csw_t *csw)
{
uint32_t sig = UGETDW(csw->dCSWSignature);
uint32_t tag = UGETDW(csw->dCSWTag);
uint32_t res = UGETDW(csw->dCSWDataResidue);
uint8_t status = csw->bCSWStatus;
DPRINTF(sc, UDMASS_BBB, "CSW %d: sig = 0x%08x (%s), tag = 0x%08x, "
"res = %d, status = 0x%02x (%s)\n",
tag, sig, (sig == CSWSIGNATURE ? "valid" : "invalid"),
tag, res,
status, (status == CSWSTATUS_GOOD ? "good" :
(status == CSWSTATUS_FAILED ? "failed" :
(status == CSWSTATUS_PHASE ? "phase" : "<invalid>"))));
}
static void
umass_cbi_dump_cmd(struct umass_softc *sc, void *cmd, uint8_t cmdlen)
{
uint8_t *c = cmd;
uint8_t dir = sc->sc_transfer.dir;
DPRINTF(sc, UDMASS_BBB, "cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, dir = %s\n",
cmdlen,
c[0], c[1], c[2], c[3], c[4], c[5], (cmdlen > 6 ? "..." : ""),
sc->sc_transfer.data_len,
(dir == DIR_IN ? "in" :
(dir == DIR_OUT ? "out" :
(dir == DIR_NONE ? "no data phase" : "<invalid>"))));
}
static void
umass_dump_buffer(struct umass_softc *sc, uint8_t *buffer, uint32_t buflen,
uint32_t printlen)
{
uint32_t i, j;
char s1[40];
char s2[40];
char s3[5];
s1[0] = '\0';
s3[0] = '\0';
sprintf(s2, " buffer=%p, buflen=%d", buffer, buflen);
for (i = 0; (i < buflen) && (i < printlen); i++) {
j = i % 16;
if (j == 0 && i != 0) {
DPRINTF(sc, UDMASS_GEN, "0x %s%s\n",
s1, s2);
s2[0] = '\0';
}
sprintf(&s1[j * 2], "%02x", buffer[i] & 0xff);
}
if (buflen > printlen)
sprintf(s3, " ...");
DPRINTF(sc, UDMASS_GEN, "0x %s%s%s\n",
s1, s2, s3);
}
#endif
Index: head/sys/dev/usb/usb_dev.c
===================================================================
--- head/sys/dev/usb/usb_dev.c (revision 327172)
+++ head/sys/dev/usb/usb_dev.c (revision 327173)
@@ -1,2482 +1,2470 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006-2008 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* usb_dev.c - An abstraction layer for creating devices under /dev/...
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_ioctl.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#define USB_DEBUG_VAR usb_fifo_debug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_dev.h>
#include <dev/usb/usb_mbuf.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_generic.h>
#include <dev/usb/usb_dynamic.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/syscallsubr.h>
#include <machine/stdarg.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#if USB_HAVE_UGEN
#ifdef USB_DEBUG
static int usb_fifo_debug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW, 0, "USB device");
SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RWTUN,
&usb_fifo_debug, 0, "Debug Level");
#endif
#if ((__FreeBSD_version >= 700001) || (__FreeBSD_version == 0) || \
((__FreeBSD_version >= 600034) && (__FreeBSD_version < 700000)))
#define USB_UCRED struct ucred *ucred,
#else
#define USB_UCRED
#endif
/* prototypes */
static int usb_fifo_open(struct usb_cdev_privdata *,
struct usb_fifo *, int);
static void usb_fifo_close(struct usb_fifo *, int);
static void usb_dev_init(void *);
static void usb_dev_init_post(void *);
static void usb_dev_uninit(void *);
static int usb_fifo_uiomove(struct usb_fifo *, void *, int,
struct uio *);
static void usb_fifo_check_methods(struct usb_fifo_methods *);
static struct usb_fifo *usb_fifo_alloc(struct mtx *);
static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t,
uint8_t);
static void usb_loc_fill(struct usb_fs_privdata *,
struct usb_cdev_privdata *);
static void usb_close(void *);
static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int);
static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static d_open_t usb_open;
static d_ioctl_t usb_ioctl;
static d_read_t usb_read;
static d_write_t usb_write;
static d_poll_t usb_poll;
static d_kqfilter_t usb_kqfilter;
static d_ioctl_t usb_static_ioctl;
static usb_fifo_open_t usb_fifo_dummy_open;
static usb_fifo_close_t usb_fifo_dummy_close;
static usb_fifo_ioctl_t usb_fifo_dummy_ioctl;
static usb_fifo_cmd_t usb_fifo_dummy_cmd;
/* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */
struct cdevsw usb_devsw = {
.d_version = D_VERSION,
.d_open = usb_open,
.d_ioctl = usb_ioctl,
.d_name = "usbdev",
.d_flags = D_TRACKCLOSE,
.d_read = usb_read,
.d_write = usb_write,
.d_poll = usb_poll,
.d_kqfilter = usb_kqfilter,
};
static struct cdev* usb_dev = NULL;
/* character device structure used for /dev/usb */
static struct cdevsw usb_static_devsw = {
.d_version = D_VERSION,
.d_ioctl = usb_static_ioctl,
.d_name = "usb"
};
static TAILQ_HEAD(, usb_symlink) usb_sym_head;
static struct sx usb_sym_lock;
struct mtx usb_ref_lock;
/*------------------------------------------------------------------------*
* usb_loc_fill
*
* This is used to fill out a usb_cdev_privdata structure based on the
* device's address as contained in usb_fs_privdata.
*------------------------------------------------------------------------*/
static void
usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd)
{
cpd->bus_index = pd->bus_index;
cpd->dev_index = pd->dev_index;
cpd->ep_addr = pd->ep_addr;
cpd->fifo_index = pd->fifo_index;
}
/*------------------------------------------------------------------------*
* usb_ref_device
*
* This function is used to atomically take a reference on a USB device
* by its device location. If this function returns success, the USB
* device will not disappear until it is unreferenced.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd, int need_uref)
{
struct usb_fifo **ppf;
struct usb_fifo *f;
DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref);
/* clear all refs */
memset(crd, 0, sizeof(*crd));
mtx_lock(&usb_ref_lock);
cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index);
if (cpd->bus == NULL) {
DPRINTFN(2, "no bus at %u\n", cpd->bus_index);
goto error;
}
cpd->udev = cpd->bus->devices[cpd->dev_index];
if (cpd->udev == NULL) {
DPRINTFN(2, "no device at %u\n", cpd->dev_index);
goto error;
}
if (cpd->udev->state == USB_STATE_DETACHED &&
(need_uref != 2)) {
DPRINTFN(2, "device is detached\n");
goto error;
}
if (need_uref) {
DPRINTFN(2, "ref udev - needed\n");
if (cpd->udev->refcount == USB_DEV_REF_MAX) {
DPRINTFN(2, "no dev ref\n");
goto error;
}
cpd->udev->refcount++;
mtx_unlock(&usb_ref_lock);
/*
* We need to grab the enumeration SX-lock before
* grabbing the FIFO refs to avoid deadlock at detach!
*/
crd->do_unlock = usbd_enum_lock_sig(cpd->udev);
mtx_lock(&usb_ref_lock);
/*
* Set "is_uref" after grabbing the default SX lock
*/
crd->is_uref = 1;
/* check for signal */
if (crd->do_unlock > 1) {
crd->do_unlock = 0;
goto error;
}
}
/* check if we are doing an open */
if (cpd->fflags == 0) {
/* use zero defaults */
} else {
/* check for write */
if (cpd->fflags & FWRITE) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_TX];
crd->txfifo = f;
crd->is_write = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
/* check for read */
if (cpd->fflags & FREAD) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_RX];
crd->rxfifo = f;
crd->is_read = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
}
/* when everything is OK we increment the refcounts */
if (crd->is_write) {
DPRINTFN(2, "ref write\n");
crd->txfifo->refcount++;
}
if (crd->is_read) {
DPRINTFN(2, "ref read\n");
crd->rxfifo->refcount++;
}
mtx_unlock(&usb_ref_lock);
return (0);
error:
if (crd->do_unlock)
usbd_enum_unlock(cpd->udev);
if (crd->is_uref) {
if (--(cpd->udev->refcount) == 0)
cv_broadcast(&cpd->udev->ref_cv);
}
mtx_unlock(&usb_ref_lock);
DPRINTFN(2, "fail\n");
/* clear all refs */
memset(crd, 0, sizeof(*crd));
return (USB_ERR_INVAL);
}
/*------------------------------------------------------------------------*
* usb_usb_ref_device
*
* This function is used to upgrade an existing reference on a USB
* location so that it also includes the USB device reference.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
/*
* Check if we already have a USB reference on this location:
*/
if (crd->is_uref)
return (0); /* success */
/*
* To avoid deadlock at detach we need to drop the FIFO ref
* and re-acquire a new ref!
*/
usb_unref_device(cpd, crd);
return (usb_ref_device(cpd, crd, 1 /* need uref */));
}
/*------------------------------------------------------------------------*
* usb_unref_device
*
* This function drops the reference count by one for the given
* USB device.
*------------------------------------------------------------------------*/
static void
usb_unref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref);
if (crd->do_unlock)
usbd_enum_unlock(cpd->udev);
mtx_lock(&usb_ref_lock);
if (crd->is_read) {
if (--(crd->rxfifo->refcount) == 0) {
cv_signal(&crd->rxfifo->cv_drain);
}
crd->is_read = 0;
}
if (crd->is_write) {
if (--(crd->txfifo->refcount) == 0) {
cv_signal(&crd->txfifo->cv_drain);
}
crd->is_write = 0;
}
if (crd->is_uref) {
crd->is_uref = 0;
if (--(cpd->udev->refcount) == 0)
cv_broadcast(&cpd->udev->ref_cv);
}
mtx_unlock(&usb_ref_lock);
}
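/*
 * Illustrative sketch (not part of this driver): the bounded-refcount
 * pattern used by usb_ref_device()/usb_unref_device() above, reduced
 * to its core.  A counter is incremented under a lock, refused at a
 * maximum, and a condition variable is broadcast when the last
 * reference goes away so a detaching thread can proceed.  The pthread
 * primitives stand in for the kernel mtx/cv ones and are an
 * assumption of this sketch; initialization is omitted.
 */
#include <pthread.h>

struct ref_example {
	pthread_mutex_t	lock;
	pthread_cond_t	zero_cv;
	unsigned	count;
	unsigned	max;	/* plays the role of USB_DEV_REF_MAX */
};

static int
ref_acquire_example(struct ref_example *r)
{
	int ok;

	pthread_mutex_lock(&r->lock);
	ok = (r->count < r->max);
	if (ok)
		r->count++;
	pthread_mutex_unlock(&r->lock);
	return (ok ? 0 : -1);	/* -1: too many references */
}

static void
ref_release_example(struct ref_example *r)
{
	pthread_mutex_lock(&r->lock);
	if (--r->count == 0)
		pthread_cond_broadcast(&r->zero_cv);	/* wake a waiting detach */
	pthread_mutex_unlock(&r->lock);
}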
static struct usb_fifo *
usb_fifo_alloc(struct mtx *mtx)
{
struct usb_fifo *f;
f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO);
if (f != NULL) {
cv_init(&f->cv_io, "FIFO-IO");
cv_init(&f->cv_drain, "FIFO-DRAIN");
f->priv_mtx = mtx;
f->refcount = 1;
knlist_init_mtx(&f->selinfo.si_note, mtx);
}
return (f);
}
/*------------------------------------------------------------------------*
* usb_fifo_create
*------------------------------------------------------------------------*/
static int
usb_fifo_create(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
struct usb_device *udev = cpd->udev;
struct usb_fifo *f;
struct usb_endpoint *ep;
uint8_t n;
uint8_t is_tx;
uint8_t is_rx;
uint8_t no_null;
uint8_t is_busy;
int e = cpd->ep_addr;
is_tx = (cpd->fflags & FWRITE) ? 1 : 0;
is_rx = (cpd->fflags & FREAD) ? 1 : 0;
no_null = 1;
is_busy = 0;
/* Preallocated FIFO */
if (e < 0) {
DPRINTFN(5, "Preallocated FIFO\n");
if (is_tx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_TX];
if (f == NULL)
return (EINVAL);
crd->txfifo = f;
}
if (is_rx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_RX];
if (f == NULL)
return (EINVAL);
crd->rxfifo = f;
}
return (0);
}
KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e));
/* search for a free FIFO slot */
DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e);
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
if (no_null) {
no_null = 0;
n = 0;
} else {
/* end of FIFOs reached */
DPRINTFN(5, "out of FIFOs\n");
return (ENOMEM);
}
}
/* Check for TX FIFO */
if (is_tx) {
f = udev->fifo[n + USB_FIFO_TX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
/* Check for RX FIFO */
if (is_rx) {
f = udev->fifo[n + USB_FIFO_RX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
break;
}
if (no_null == 0) {
if (e >= (USB_EP_MAX / 2)) {
/* we don't create any endpoints in this range */
DPRINTFN(5, "ep out of range\n");
return (is_busy ? EBUSY : EINVAL);
}
}
if ((e != 0) && is_busy) {
/*
* Only the default control endpoint is allowed to be
* opened multiple times!
*/
DPRINTFN(5, "busy\n");
return (EBUSY);
}
/* Check TX FIFO */
if (is_tx &&
(udev->fifo[n + USB_FIFO_TX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_TX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc(&udev->device_mtx);
if (f == NULL) {
DPRINTFN(5, "could not alloc tx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_TX;
f->dev_ep_index = e;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_TX] = f;
mtx_unlock(&usb_ref_lock);
}
/* Check RX FIFO */
if (is_rx &&
(udev->fifo[n + USB_FIFO_RX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_RX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc(&udev->device_mtx);
if (f == NULL) {
DPRINTFN(5, "could not alloc rx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_RX;
f->dev_ep_index = e;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_RX] = f;
mtx_unlock(&usb_ref_lock);
}
if (is_tx) {
crd->txfifo = udev->fifo[n + USB_FIFO_TX];
}
if (is_rx) {
crd->rxfifo = udev->fifo[n + USB_FIFO_RX];
}
/* fill out fifo index */
DPRINTFN(5, "fifo index = %d\n", n);
cpd->fifo_index = n;
/* complete */
return (0);
}
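/*
 * Illustrative sketch (not part of this driver): the FIFO table layout
 * searched above.  FIFOs live in TX/RX pairs, so the search advances
 * two slots at a time and a logical pair base n owns slots
 * n + USB_FIFO_TX and n + USB_FIFO_RX.  The concrete constant values
 * here are an assumption based on the "(fifo_index & 1)" tests
 * elsewhere in this file; the names are hypothetical.
 */
#define EX_FIFO_TX	0	/* even slot of a pair */
#define EX_FIFO_RX	1	/* odd slot of a pair */

static int
fifo_slot_example(int pair_base, int is_rx)
{
	/* pair_base is the even index the search loop calls "n" */
	return (pair_base + (is_rx ? EX_FIFO_RX : EX_FIFO_TX));
}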
void
usb_fifo_free(struct usb_fifo *f)
{
uint8_t n;
if (f == NULL) {
/* be NULL safe */
return;
}
/* destroy symlink devices, if any */
for (n = 0; n != 2; n++) {
if (f->symlink[n]) {
usb_free_symlink(f->symlink[n]);
f->symlink[n] = NULL;
}
}
mtx_lock(&usb_ref_lock);
/* delink ourselves to stop calls from userland */
if ((f->fifo_index < USB_FIFO_MAX) &&
(f->udev != NULL) &&
(f->udev->fifo[f->fifo_index] == f)) {
f->udev->fifo[f->fifo_index] = NULL;
} else {
DPRINTFN(0, "USB FIFO %p has not been linked\n", f);
}
/* decrease refcount */
f->refcount--;
/* need to wait until all callers have exited */
while (f->refcount != 0) {
mtx_unlock(&usb_ref_lock); /* avoid LOR */
mtx_lock(f->priv_mtx);
/* prevent write flush, if any */
f->flag_iserror = 1;
/* get I/O thread out of any sleep state */
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
mtx_unlock(f->priv_mtx);
mtx_lock(&usb_ref_lock);
/*
* Check if the "f->refcount" variable reached zero
* during the unlocked time before entering wait:
*/
if (f->refcount == 0)
break;
/* wait for sync */
cv_wait(&f->cv_drain, &usb_ref_lock);
}
mtx_unlock(&usb_ref_lock);
/* take care of closing the device here, if any */
usb_fifo_close(f, 0);
cv_destroy(&f->cv_io);
cv_destroy(&f->cv_drain);
knlist_clear(&f->selinfo.si_note, 0);
seldrain(&f->selinfo);
knlist_destroy(&f->selinfo.si_note);
free(f, M_USBDEV);
}
static struct usb_endpoint *
usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir)
{
struct usb_endpoint *ep;
uint8_t ep_dir;
if (ep_index == 0) {
ep = &udev->ctrl_ep;
} else {
if (dir == USB_FIFO_RX) {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_IN;
} else {
ep_dir = UE_DIR_OUT;
}
} else {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_OUT;
} else {
ep_dir = UE_DIR_IN;
}
}
ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir);
}
if (ep == NULL) {
/* if the endpoint does not exist then return */
return (NULL);
}
if (ep->edesc == NULL) {
/* invalid endpoint */
return (NULL);
}
return (ep); /* success */
}
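/*
 * Illustrative sketch (not part of this driver): how the endpoint
 * address looked up by usb_dev_get_ep() above is composed.  Per the
 * USB spec the direction is bit 7 of the endpoint address (0x80 = IN).
 * Names are hypothetical.
 */
#include <stdint.h>

#define EX_DIR_IN	0x80
#define EX_DIR_OUT	0x00

static uint8_t
ep_addr_example(uint8_t ep_index, int host_mode, int is_rx_fifo)
{
	/*
	 * On the host side an RX FIFO reads from an IN endpoint; on the
	 * device side the same FIFO is fed by an OUT endpoint, and the
	 * TX case is mirrored, exactly as in the function above.
	 */
	uint8_t dir = ((!!is_rx_fifo) == (!!host_mode)) ? EX_DIR_IN : EX_DIR_OUT;

	return (ep_index | dir);
}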
/*------------------------------------------------------------------------*
* usb_fifo_open
*
* Returns:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
static int
usb_fifo_open(struct usb_cdev_privdata *cpd,
struct usb_fifo *f, int fflags)
{
int err;
if (f == NULL) {
/* no FIFO there */
DPRINTFN(2, "no FIFO\n");
return (ENXIO);
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* set correct file flags */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
fflags |= FWRITE;
} else {
fflags |= FREAD;
}
/* check if we are already opened */
/* we don't need any locks when checking this variable */
if (f->curr_cpd != NULL) {
err = EBUSY;
goto done;
}
/* reset short flag before open */
f->flag_short = 0;
/* call open method */
err = (f->methods->f_open) (f, fflags);
if (err) {
goto done;
}
mtx_lock(f->priv_mtx);
/* reset sleep flag */
f->flag_sleeping = 0;
/* reset error flag */
f->flag_iserror = 0;
/* reset complete flag */
f->flag_iscomplete = 0;
/* reset select flag */
f->flag_isselect = 0;
/* reset flushing flag */
f->flag_flushing = 0;
/* reset ASYNC proc flag */
f->async_p = NULL;
mtx_lock(&usb_ref_lock);
/* flag the fifo as opened to prevent others */
f->curr_cpd = cpd;
mtx_unlock(&usb_ref_lock);
/* reset queue */
usb_fifo_reset(f);
mtx_unlock(f->priv_mtx);
done:
return (err);
}
/*------------------------------------------------------------------------*
* usb_fifo_reset
*------------------------------------------------------------------------*/
void
usb_fifo_reset(struct usb_fifo *f)
{
struct usb_mbuf *m;
if (f == NULL) {
return;
}
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
USB_IF_ENQUEUE(&f->free_q, m);
} else {
break;
}
}
/* reset have fragment flag */
f->flag_have_fragment = 0;
}
/*------------------------------------------------------------------------*
* usb_fifo_close
*------------------------------------------------------------------------*/
static void
usb_fifo_close(struct usb_fifo *f, int fflags)
{
int err;
/* check if we are not opened */
if (f->curr_cpd == NULL) {
/* nothing to do - already closed */
return;
}
mtx_lock(f->priv_mtx);
/* clear current cdev private data pointer */
mtx_lock(&usb_ref_lock);
f->curr_cpd = NULL;
mtx_unlock(&usb_ref_lock);
/* check if we are watched by kevent */
KNOTE_LOCKED(&f->selinfo.si_note, 0);
/* check if we are selected */
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
/* check if a thread wants SIGIO */
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
f->async_p = NULL;
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* flush written data, if any */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
if (!f->flag_iserror) {
/* set flushing flag */
f->flag_flushing = 1;
/* get the last packet in */
if (f->flag_have_fragment) {
struct usb_mbuf *m;
f->flag_have_fragment = 0;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_IF_ENQUEUE(&f->used_q, m);
}
}
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
/* check if flushed already */
while (f->flag_flushing &&
(!f->flag_iserror)) {
/* wait until all data has been written */
f->flag_sleeping = 1;
err = cv_timedwait_sig(&f->cv_io, f->priv_mtx,
USB_MS_TO_TICKS(USB_DEFAULT_TIMEOUT));
if (err) {
DPRINTF("signal received\n");
break;
}
}
}
fflags |= FWRITE;
/* stop write transfer, if not already stopped */
(f->methods->f_stop_write) (f);
} else {
fflags |= FREAD;
/* stop read transfer, if not already stopped */
(f->methods->f_stop_read) (f);
}
/* check if we are sleeping */
if (f->flag_sleeping) {
DPRINTFN(2, "Sleeping at close!\n");
}
mtx_unlock(f->priv_mtx);
/* call close method */
(f->methods->f_close) (f, fflags);
DPRINTF("closed\n");
}
/*------------------------------------------------------------------------*
* usb_open - cdev callback
*------------------------------------------------------------------------*/
static int
usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
{
struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1;
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd;
- int err, ep;
+ int err;
DPRINTFN(2, "%s fflags=0x%08x\n", devtoname(dev), fflags);
KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags"));
if (((fflags & FREAD) && !(pd->mode & FREAD)) ||
((fflags & FWRITE) && !(pd->mode & FWRITE))) {
DPRINTFN(2, "access mode not supported\n");
return (EPERM);
}
cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO);
- ep = cpd->ep_addr = pd->ep_addr;
usb_loc_fill(pd, cpd);
err = usb_ref_device(cpd, &refs, 1);
if (err) {
DPRINTFN(2, "cannot ref device\n");
free(cpd, M_USBDEV);
return (ENXIO);
}
cpd->fflags = fflags; /* access mode for open lifetime */
/* create FIFOs, if any */
err = usb_fifo_create(cpd, &refs);
/* check for error */
if (err) {
DPRINTFN(2, "cannot create fifo\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
if (fflags & FREAD) {
err = usb_fifo_open(cpd, refs.rxfifo, fflags);
if (err) {
DPRINTFN(2, "read open failed\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
if (fflags & FWRITE) {
err = usb_fifo_open(cpd, refs.txfifo, fflags);
if (err) {
DPRINTFN(2, "write open failed\n");
if (fflags & FREAD) {
usb_fifo_close(refs.rxfifo, fflags);
}
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
usb_unref_device(cpd, &refs);
devfs_set_cdevpriv(cpd, usb_close);
return (0);
}
/*------------------------------------------------------------------------*
* usb_close - cdev callback
*------------------------------------------------------------------------*/
static void
usb_close(void *arg)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd = arg;
int err;
DPRINTFN(2, "cpd=%p\n", cpd);
err = usb_ref_device(cpd, &refs,
2 /* uref and allow detached state */);
if (err) {
DPRINTFN(2, "Cannot grab USB reference when "
"closing USB file handle\n");
goto done;
}
if (cpd->fflags & FREAD) {
usb_fifo_close(refs.rxfifo, cpd->fflags);
}
if (cpd->fflags & FWRITE) {
usb_fifo_close(refs.txfifo, cpd->fflags);
}
usb_unref_device(cpd, &refs);
done:
free(cpd, M_USBDEV);
}
static void
usb_dev_init(void *arg)
{
mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF);
sx_init(&usb_sym_lock, "USB sym mutex");
TAILQ_INIT(&usb_sym_head);
/* check the UGEN methods */
usb_fifo_check_methods(&usb_ugen_methods);
}
SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL);
static void
usb_dev_init_post(void *arg)
{
/*
* Create /dev/usb - this is needed for usbconfig(8), which
* needs a well-known device name to access.
*/
usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR,
0644, USB_DEVICE_NAME);
if (usb_dev == NULL) {
DPRINTFN(0, "Could not create usb bus device\n");
}
}
SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL);
static void
usb_dev_uninit(void *arg)
{
if (usb_dev != NULL) {
destroy_dev(usb_dev);
usb_dev = NULL;
}
mtx_destroy(&usb_ref_lock);
sx_destroy(&usb_sym_lock);
}
SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL);
static int
usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr,
struct thread *td)
{
int error = 0;
switch (cmd) {
case FIODTYPE:
*(int *)addr = 0; /* character device */
break;
case FIONBIO:
/* handled by upper FS layer */
break;
case FIOASYNC:
if (*(int *)addr) {
if (f->async_p != NULL) {
error = EBUSY;
break;
}
f->async_p = USB_TD_GET_PROC(td);
} else {
f->async_p = NULL;
}
break;
/* XXX this is not the most general solution */
case TIOCSPGRP:
if (f->async_p == NULL) {
error = EINVAL;
break;
}
if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) {
error = EPERM;
break;
}
break;
default:
return (ENOIOCTL);
}
DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error);
return (error);
}
/*------------------------------------------------------------------------*
* usb_ioctl - cdev callback
*------------------------------------------------------------------------*/
static int
usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err;
DPRINTFN(2, "cmd=0x%lx\n", cmd);
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
/*
* Performance optimisation: We try to check for IOCTLs that
* don't need the USB reference first. Then we grab the USB
* reference if we need it!
*/
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
fflags = cpd->fflags;
f = NULL; /* set default value */
err = ENOIOCTL; /* set default value */
if (fflags & FWRITE) {
f = refs.txfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
if (fflags & FREAD) {
f = refs.rxfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
KASSERT(f != NULL, ("fifo not found"));
if (err != ENOIOCTL)
goto done;
err = (f->methods->f_ioctl) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err);
if (err != ENOIOCTL)
goto done;
if (usb_usb_ref_device(cpd, &refs)) {
/* we lost the reference */
return (ENXIO);
}
err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err);
if (err == ENOIOCTL)
err = ENOTTY;
if (err)
goto done;
/* Wait for re-enumeration, if any */
while (f->udev->re_enumerate_wait != USB_RE_ENUM_DONE) {
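/*
 * Drop our reference while waiting so that the re-enumeration
 * can proceed, then try to get it back. If the device has gone
 * away in the meantime, return ENXIO:
 */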
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
while (usb_ref_device(cpd, &refs, 1 /* need uref */)) {
if (usb_ref_device(cpd, &refs, 0)) {
/* device no longer exists */
return (ENXIO);
}
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
}
}
done:
usb_unref_device(cpd, &refs);
return (err);
}
static void
usb_filter_detach(struct knote *kn)
{
struct usb_fifo *f = kn->kn_hook;
knlist_remove(&f->selinfo.si_note, kn, 0);
}
static int
usb_filter_write(struct knote *kn, long hint)
{
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
DPRINTFN(2, "\n");
f = kn->kn_hook;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
cpd = f->curr_cpd;
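/*
 * When "fs_ep_max" is non-zero the FIFO is driven by explicit
 * transfer completion ("flag_iscomplete") rather than by the
 * free/used queues:
 */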
if (cpd == NULL) {
m = (void *)1;
} else if (f->fs_ep_max == 0) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
return (m ? 1 : 0);
}
static int
usb_filter_read(struct knote *kn, long hint)
{
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
DPRINTFN(2, "\n");
f = kn->kn_hook;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
cpd = f->curr_cpd;
if (cpd == NULL) {
m = (void *)1;
} else if (f->fs_ep_max == 0) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
/* start reading data, if any */
if (m == NULL)
(f->methods->f_start_read) (f);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
return (m ? 1 : 0);
}
static struct filterops usb_filtops_write = {
.f_isfd = 1,
.f_detach = usb_filter_detach,
.f_event = usb_filter_write,
};
static struct filterops usb_filtops_read = {
.f_isfd = 1,
.f_detach = usb_filter_detach,
.f_event = usb_filter_read,
};
/* ARGSUSED */
static int
usb_kqfilter(struct cdev* dev, struct knote *kn)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err = EINVAL;
DPRINTFN(2, "\n");
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (ENXIO);
fflags = cpd->fflags;
/* Figure out who needs service */
switch (kn->kn_filter) {
case EVFILT_WRITE:
if (fflags & FWRITE) {
f = refs.txfifo;
kn->kn_fop = &usb_filtops_write;
err = 0;
}
break;
case EVFILT_READ:
if (fflags & FREAD) {
f = refs.rxfifo;
kn->kn_fop = &usb_filtops_read;
err = 0;
}
break;
default:
err = EOPNOTSUPP;
break;
}
if (err == 0) {
kn->kn_hook = f;
mtx_lock(f->priv_mtx);
knlist_add(&f->selinfo.si_note, kn, 1);
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (err);
}
/* ARGSUSED */
static int
usb_poll(struct cdev* dev, int events, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags, revents;
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (events &
(POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
fflags = cpd->fflags;
/* Figure out who needs service */
revents = 0;
if ((events & (POLLOUT | POLLWRNORM)) &&
(fflags & FWRITE)) {
f = refs.txfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLOUT | POLLWRNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
}
mtx_unlock(f->priv_mtx);
}
if ((events & (POLLIN | POLLRDNORM)) &&
(fflags & FREAD)) {
f = refs.rxfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
if (!refs.is_usbfs) {
/* start reading data */
(f->methods->f_start_read) (f);
}
}
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (revents);
}
static int
usb_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
- int fflags;
- int resid;
int io_len;
int err;
uint8_t tr_data = 0;
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
- fflags = cpd->fflags;
-
f = refs.rxfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
- resid = uio->uio_resid;
-
mtx_lock(f->priv_mtx);
/* check for permanent read error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
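/* move data from the used queue to userland, one USB mbuf at a time */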
while (uio->uio_resid > 0) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m == NULL) {
/* start read transfer, if not already started */
(f->methods->f_start_read) (f);
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
if (f->methods->f_filter_read) {
/*
* Sometimes it is convenient to process data at the
* expense of a userland process instead of a kernel
* process.
*/
(f->methods->f_filter_read) (f, m);
}
tr_data = 1;
io_len = MIN(m->cur_data_len, uio->uio_resid);
DPRINTFN(2, "transfer %d bytes from %p\n",
io_len, m->cur_data_ptr);
err = usb_fifo_uiomove(f,
m->cur_data_ptr, io_len, uio);
m->cur_data_len -= io_len;
m->cur_data_ptr += io_len;
if (m->cur_data_len == 0) {
uint8_t last_packet;
last_packet = m->last_packet;
USB_IF_ENQUEUE(&f->free_q, m);
if (last_packet) {
/* keep framing */
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
if (err) {
break;
}
}
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
static int
usb_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
uint8_t *pdata;
- int fflags;
- int resid;
int io_len;
int err;
uint8_t tr_data = 0;
DPRINTFN(2, "\n");
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
- fflags = cpd->fflags;
-
f = refs.txfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
- resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent write error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
if (f->queue_data == NULL) {
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
}
/* we allow writing zero length data */
do {
USB_IF_DEQUEUE(&f->free_q, m);
if (m == NULL) {
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
tr_data = 1;
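/*
 * Either start filling a fresh USB mbuf, or append to the
 * partially filled one left over from a previous write when
 * write defragging is enabled:
 */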
if (f->flag_have_fragment == 0) {
USB_MBUF_RESET(m);
io_len = m->cur_data_len;
pdata = m->cur_data_ptr;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len = io_len;
} else {
io_len = m->max_data_len - m->cur_data_len;
pdata = m->cur_data_ptr + m->cur_data_len;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len += io_len;
}
DPRINTFN(2, "transfer %d bytes to %p\n",
io_len, pdata);
err = usb_fifo_uiomove(f, pdata, io_len, uio);
if (err) {
f->flag_have_fragment = 0;
USB_IF_ENQUEUE(&f->free_q, m);
break;
}
/* check if the buffer is ready to be transmitted */
if ((f->flag_write_defrag == 0) ||
(m->cur_data_len == m->max_data_len)) {
f->flag_have_fragment = 0;
/*
* Check for write filter:
*
* Sometimes it is convenient to process data
* at the expense of a userland process
* instead of a kernel process.
*/
if (f->methods->f_filter_write) {
(f->methods->f_filter_write) (f, m);
}
/* Put USB mbuf in the used queue */
USB_IF_ENQUEUE(&f->used_q, m);
/* Start writing data, if not already started */
(f->methods->f_start_write) (f);
} else {
/* Wait for more data or close */
f->flag_have_fragment = 1;
USB_IF_PREPEND(&f->free_q, m);
}
} while (uio->uio_resid > 0);
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
int
usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
union {
struct usb_read_dir *urd;
void* data;
} u;
int err;
u.data = data;
switch (cmd) {
case USB_READ_DIR:
err = usb_read_symlink(u.urd->urd_data,
u.urd->urd_startentry, u.urd->urd_maxlen);
break;
case USB_DEV_QUIRK_GET:
case USB_QUIRK_NAME_GET:
case USB_DEV_QUIRK_ADD:
case USB_DEV_QUIRK_REMOVE:
err = usb_quirk_ioctl_p(cmd, data, fflag, td);
break;
case USB_GET_TEMPLATE:
*(int *)data = usb_template;
err = 0;
break;
case USB_SET_TEMPLATE:
err = priv_check(curthread, PRIV_DRIVER);
if (err)
break;
usb_template = *(int *)data;
break;
default:
err = ENOTTY;
break;
}
return (err);
}
static int
usb_fifo_uiomove(struct usb_fifo *f, void *cp,
int n, struct uio *uio)
{
int error;
mtx_unlock(f->priv_mtx);
/*
* "uiomove()" can sleep so one needs to make a wrapper,
* exiting the mutex and checking things:
*/
error = uiomove(cp, n, uio);
mtx_lock(f->priv_mtx);
return (error);
}
int
usb_fifo_wait(struct usb_fifo *f)
{
int err;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
if (f->flag_iserror) {
/* we are gone */
return (EIO);
}
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (f->flag_iserror) {
/* we are gone */
err = EIO;
}
return (err);
}
void
usb_fifo_signal(struct usb_fifo *f)
{
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
}
void
usb_fifo_wakeup(struct usb_fifo *f)
{
usb_fifo_signal(f);
KNOTE_LOCKED(&f->selinfo.si_note, 0);
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
}
}
static int
usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags)
{
return (0);
}
static void
usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags)
{
return;
}
static int
usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags)
{
return (ENOIOCTL);
}
static void
usb_fifo_dummy_cmd(struct usb_fifo *fifo)
{
fifo->flag_flushing = 0; /* not flushing */
}
static void
usb_fifo_check_methods(struct usb_fifo_methods *pm)
{
/* check that all callback functions are OK */
if (pm->f_open == NULL)
pm->f_open = &usb_fifo_dummy_open;
if (pm->f_close == NULL)
pm->f_close = &usb_fifo_dummy_close;
if (pm->f_ioctl == NULL)
pm->f_ioctl = &usb_fifo_dummy_ioctl;
if (pm->f_ioctl_post == NULL)
pm->f_ioctl_post = &usb_fifo_dummy_ioctl;
if (pm->f_start_read == NULL)
pm->f_start_read = &usb_fifo_dummy_cmd;
if (pm->f_stop_read == NULL)
pm->f_stop_read = &usb_fifo_dummy_cmd;
if (pm->f_start_write == NULL)
pm->f_start_write = &usb_fifo_dummy_cmd;
if (pm->f_stop_write == NULL)
pm->f_stop_write = &usb_fifo_dummy_cmd;
}
/*------------------------------------------------------------------------*
* usb_fifo_attach
*
* The following function will create a duplex FIFO.
*
* Return values:
* 0: Success.
* Else: Failure.
*------------------------------------------------------------------------*/
int
usb_fifo_attach(struct usb_device *udev, void *priv_sc,
struct mtx *priv_mtx, struct usb_fifo_methods *pm,
struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit,
uint8_t iface_index, uid_t uid, gid_t gid, int mode)
{
struct usb_fifo *f_tx;
struct usb_fifo *f_rx;
char devname[32];
uint8_t n;
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
if (pm == NULL)
return (EINVAL);
/* check the methods */
usb_fifo_check_methods(pm);
if (priv_mtx == NULL)
priv_mtx = &Giant;
/* search for a free pair of FIFO slots; TX/RX FIFOs are allocated in pairs at even base indices */
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
/* end of FIFOs reached */
return (ENOMEM);
}
/* Check for TX FIFO */
if (udev->fifo[n + USB_FIFO_TX] != NULL) {
continue;
}
/* Check for RX FIFO */
if (udev->fifo[n + USB_FIFO_RX] != NULL) {
continue;
}
break;
}
f_tx = usb_fifo_alloc(priv_mtx);
f_rx = usb_fifo_alloc(priv_mtx);
if ((f_tx == NULL) || (f_rx == NULL)) {
usb_fifo_free(f_tx);
usb_fifo_free(f_rx);
return (ENOMEM);
}
/* initialise FIFO structures */
f_tx->fifo_index = n + USB_FIFO_TX;
f_tx->dev_ep_index = -1;
f_tx->priv_sc0 = priv_sc;
f_tx->methods = pm;
f_tx->iface_index = iface_index;
f_tx->udev = udev;
f_rx->fifo_index = n + USB_FIFO_RX;
f_rx->dev_ep_index = -1;
f_rx->priv_sc0 = priv_sc;
f_rx->methods = pm;
f_rx->iface_index = iface_index;
f_rx->udev = udev;
f_sc->fp[USB_FIFO_TX] = f_tx;
f_sc->fp[USB_FIFO_RX] = f_rx;
mtx_lock(&usb_ref_lock);
udev->fifo[f_tx->fifo_index] = f_tx;
udev->fifo[f_rx->fifo_index] = f_rx;
mtx_unlock(&usb_ref_lock);
for (n = 0; n != 4; n++) {
if (pm->basename[n] == NULL) {
continue;
}
if (subunit < 0) {
if (snprintf(devname, sizeof(devname),
"%s%u%s", pm->basename[n],
unit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
} else {
if (snprintf(devname, sizeof(devname),
"%s%u.%d%s", pm->basename[n],
unit, subunit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
}
/*
* Distribute the symbolic links into two FIFO structures:
*/
if (n & 1) {
f_rx->symlink[n / 2] =
usb_alloc_symlink(devname);
} else {
f_tx->symlink[n / 2] =
usb_alloc_symlink(devname);
}
/* Create the device */
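/*
 * The TX and RX FIFO indices differ only in the least
 * significant bit, so ANDing them yields the common, even
 * base index:
 */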
f_sc->dev = usb_make_dev(udev, devname, -1,
f_tx->fifo_index & f_rx->fifo_index,
FREAD|FWRITE, uid, gid, mode);
}
DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx);
return (0);
}
/*------------------------------------------------------------------------*
* usb_fifo_alloc_buffer
*
* Return values:
* 0: Success
* Else failure
*------------------------------------------------------------------------*/
int
usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize,
uint16_t nbuf)
{
usb_fifo_free_buffer(f);
/* allocate the queue buffers for this FIFO */
f->free_q.ifq_maxlen = nbuf;
f->used_q.ifq_maxlen = nbuf;
f->queue_data = usb_alloc_mbufs(
M_USBDEV, &f->free_q, bufsize, nbuf);
if ((f->queue_data == NULL) && bufsize && nbuf) {
return (ENOMEM);
}
return (0); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_free_buffer
*
* This function will free the buffers associated with a FIFO. This
* function can be called multiple times in a row.
*------------------------------------------------------------------------*/
void
usb_fifo_free_buffer(struct usb_fifo *f)
{
if (f->queue_data) {
/* free old buffer */
free(f->queue_data, M_USBDEV);
f->queue_data = NULL;
}
/* reset queues */
memset(&f->free_q, 0, sizeof(f->free_q));
memset(&f->used_q, 0, sizeof(f->used_q));
}
void
usb_fifo_detach(struct usb_fifo_sc *f_sc)
{
if (f_sc == NULL) {
return;
}
usb_fifo_free(f_sc->fp[USB_FIFO_TX]);
usb_fifo_free(f_sc->fp[USB_FIFO_RX]);
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
usb_destroy_dev(f_sc->dev);
f_sc->dev = NULL;
DPRINTFN(2, "detached %p\n", f_sc);
}
usb_size_t
usb_fifo_put_bytes_max(struct usb_fifo *f)
{
struct usb_mbuf *m;
usb_size_t len;
USB_IF_POLL(&f->free_q, m);
if (m) {
len = m->max_data_len;
} else {
len = 0;
}
return (len);
}
/*------------------------------------------------------------------------*
* usb_fifo_put_data
*
* what:
* 0 - normal operation
* 1 - set last packet flag to enforce framing
*------------------------------------------------------------------------*/
void
usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
usbd_copy_out(pc, offset, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
offset += io_len;
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
void
usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
memcpy(m->cur_data_ptr, ptr, io_len);
m->cur_data_len = io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
uint8_t
usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len)
{
struct usb_mbuf *m;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
m->cur_data_len = len;
m->cur_data_ptr = ptr;
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
return (1);
}
return (0);
}
void
usb_fifo_put_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_fifo_get_data
*
* what:
* 0 - normal operation
* 1 - only get one "usb_mbuf"
*
* returns:
* 0 - no more data
* 1 - data in buffer
*------------------------------------------------------------------------*/
uint8_t
usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen,
uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
usbd_copy_in(pc, offset, m->cur_data_ptr, io_len);
len -= io_len;
offset += io_len;
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, usb_size_t *actlen, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
memcpy(ptr, m->cur_data_ptr, io_len);
len -= io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen)
{
struct usb_mbuf *m;
USB_IF_POLL(&f->used_q, m);
if (m) {
*plen = m->cur_data_len;
*pptr = m->cur_data_ptr;
return (1);
}
return (0);
}
void
usb_fifo_get_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_alloc_symlink
*
* Return values:
* NULL: Failure
* Else: Pointer to symlink entry
*------------------------------------------------------------------------*/
struct usb_symlink *
usb_alloc_symlink(const char *target)
{
struct usb_symlink *ps;
ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK);
if (ps == NULL) {
return (ps);
}
/* XXX no longer needed */
strlcpy(ps->src_path, target, sizeof(ps->src_path));
ps->src_len = strlen(ps->src_path);
strlcpy(ps->dst_path, target, sizeof(ps->dst_path));
ps->dst_len = strlen(ps->dst_path);
sx_xlock(&usb_sym_lock);
TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
return (ps);
}
/*------------------------------------------------------------------------*
* usb_free_symlink
*------------------------------------------------------------------------*/
void
usb_free_symlink(struct usb_symlink *ps)
{
if (ps == NULL) {
return;
}
sx_xlock(&usb_sym_lock);
TAILQ_REMOVE(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
free(ps, M_USBDEV);
}
/*------------------------------------------------------------------------*
* usb_read_symlink
*
* Return value:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
int
usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len)
{
struct usb_symlink *ps;
uint32_t temp;
uint32_t delta = 0;
uint8_t len;
int error = 0;
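/*
 * Each entry is copied out as a single total-length byte
 * followed by the NUL-terminated source and destination
 * strings. A zero length byte terminates the list.
 */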
sx_xlock(&usb_sym_lock);
TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) {
/*
* Compute total length of source and destination symlink
* strings plus one length byte and two NUL bytes:
*/
temp = ps->src_len + ps->dst_len + 3;
if (temp > 255) {
/*
* Skip entry because this length cannot fit
* into one byte:
*/
continue;
}
if (startentry != 0) {
/* decrement read offset */
startentry--;
continue;
}
if (temp > user_len) {
/* out of buffer space */
break;
}
len = temp;
/* copy out total length */
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out source string */
error = copyout(ps->src_path,
USB_ADD_BYTES(user_ptr, delta), ps->src_len);
if (error) {
break;
}
len = 0;
delta += ps->src_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out destination string */
error = copyout(ps->dst_path,
USB_ADD_BYTES(user_ptr, delta), ps->dst_len);
if (error) {
break;
}
len = 0;
delta += ps->dst_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
user_len -= temp;
}
/* a zero length entry indicates the end */
if ((user_len != 0) && (error == 0)) {
len = 0;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
}
sx_unlock(&usb_sym_lock);
return (error);
}
void
usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* send a Zero Length Packet, ZLP, before close */
f->flag_short = onoff;
}
void
usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* defrag written data */
f->flag_write_defrag = onoff;
/* reset defrag state */
f->flag_have_fragment = 0;
}
void *
usb_fifo_softc(struct usb_fifo *f)
{
return (f->priv_sc0);
}
#endif /* USB_HAVE_UGEN */
Index: head/sys/dev/vnic/nic_main.c
===================================================================
--- head/sys/dev/vnic/nic_main.c (revision 327172)
+++ head/sys/dev/vnic/nic_main.c (revision 327173)
@@ -1,1232 +1,1229 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_media.h>
#include <machine/bus.h>
#include <machine/_inttypes.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <sys/dnv.h>
#include <sys/nv.h>
#ifdef PCI_IOV
#include <sys/iov_schema.h>
#include <dev/pci/pci_iov.h>
#endif
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "q_struct.h"
#define VNIC_PF_DEVSTR "Cavium Thunder NIC Physical Function Driver"
#define VNIC_PF_REG_RID PCIR_BAR(PCI_CFG_REG_BAR_NUM)
#define NIC_SET_VF_LMAC_MAP(bgx, lmac) ((((bgx) & 0xF) << 4) | ((lmac) & 0xF))
#define NIC_GET_BGX_FROM_VF_LMAC_MAP(map) (((map) >> 4) & 0xF)
#define NIC_GET_LMAC_FROM_VF_LMAC_MAP(map) ((map) & 0xF)
/* Structure to be used by the SR-IOV for VF configuration schemas */
struct nicvf_info {
boolean_t vf_enabled;
int vf_flags;
};
struct nicpf {
device_t dev;
uint8_t node;
u_int flags;
uint8_t num_vf_en; /* Number of VFs enabled */
struct nicvf_info vf_info[MAX_NUM_VFS_SUPPORTED];
struct resource * reg_base; /* Register start address */
struct pkind_cfg pkind;
uint8_t vf_lmac_map[MAX_LMAC];
boolean_t mbx_lock[MAX_NUM_VFS_SUPPORTED];
struct callout check_link;
struct mtx check_link_mtx;
uint8_t link[MAX_LMAC];
uint8_t duplex[MAX_LMAC];
uint32_t speed[MAX_LMAC];
uint16_t cpi_base[MAX_NUM_VFS_SUPPORTED];
uint16_t rssi_base[MAX_NUM_VFS_SUPPORTED];
uint16_t rss_ind_tbl_size;
/* MSI-X */
boolean_t msix_enabled;
uint8_t num_vec;
struct msix_entry msix_entries[NIC_PF_MSIX_VECTORS];
struct resource * msix_table_res;
};
static int nicpf_probe(device_t);
static int nicpf_attach(device_t);
static int nicpf_detach(device_t);
#ifdef PCI_IOV
static int nicpf_iov_init(device_t, uint16_t, const nvlist_t *);
static void nicpf_iov_uninit(device_t);
static int nicpf_iov_add_vf(device_t, uint16_t, const nvlist_t *);
#endif
static device_method_t nicpf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nicpf_probe),
DEVMETHOD(device_attach, nicpf_attach),
DEVMETHOD(device_detach, nicpf_detach),
/* PCI SR-IOV interface */
#ifdef PCI_IOV
DEVMETHOD(pci_iov_init, nicpf_iov_init),
DEVMETHOD(pci_iov_uninit, nicpf_iov_uninit),
DEVMETHOD(pci_iov_add_vf, nicpf_iov_add_vf),
#endif
DEVMETHOD_END,
};
static driver_t vnicpf_driver = {
"vnicpf",
nicpf_methods,
sizeof(struct nicpf),
};
static devclass_t vnicpf_devclass;
DRIVER_MODULE(vnicpf, pci, vnicpf_driver, vnicpf_devclass, 0, 0);
MODULE_VERSION(vnicpf, 1);
MODULE_DEPEND(vnicpf, pci, 1, 1, 1);
MODULE_DEPEND(vnicpf, ether, 1, 1, 1);
MODULE_DEPEND(vnicpf, thunder_bgx, 1, 1, 1);
static int nicpf_alloc_res(struct nicpf *);
static void nicpf_free_res(struct nicpf *);
static void nic_set_lmac_vf_mapping(struct nicpf *);
static void nic_init_hw(struct nicpf *);
static int nic_sriov_init(device_t, struct nicpf *);
static void nic_poll_for_link(void *);
static int nic_register_interrupts(struct nicpf *);
static void nic_unregister_interrupts(struct nicpf *);
/*
* Device interface
*/
static int
nicpf_probe(device_t dev)
{
uint16_t vendor_id;
uint16_t device_id;
vendor_id = pci_get_vendor(dev);
device_id = pci_get_device(dev);
if (vendor_id == PCI_VENDOR_ID_CAVIUM &&
device_id == PCI_DEVICE_ID_THUNDER_NIC_PF) {
device_set_desc(dev, VNIC_PF_DEVSTR);
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
nicpf_attach(device_t dev)
{
struct nicpf *nic;
int err;
nic = device_get_softc(dev);
nic->dev = dev;
/* Enable bus mastering */
pci_enable_busmaster(dev);
/* Allocate PCI resources */
err = nicpf_alloc_res(nic);
if (err != 0) {
device_printf(dev, "Could not allocate PCI resources\n");
return (err);
}
nic->node = nic_get_node_id(nic->reg_base);
/* Enable Traffic Network Switch (TNS) bypass mode by default */
nic->flags &= ~NIC_TNS_ENABLED;
nic_set_lmac_vf_mapping(nic);
/* Initialize hardware */
nic_init_hw(nic);
/* Set RSS TBL size for each VF */
nic->rss_ind_tbl_size = NIC_MAX_RSS_IDR_TBL_SIZE;
/* Setup interrupts */
err = nic_register_interrupts(nic);
if (err != 0)
goto err_free_res;
/* Configure SRIOV */
err = nic_sriov_init(dev, nic);
if (err != 0)
goto err_free_intr;
if (nic->flags & NIC_TNS_ENABLED)
return (0);
mtx_init(&nic->check_link_mtx, "VNIC PF link poll", NULL, MTX_DEF);
/* Register physical link status poll callout */
callout_init_mtx(&nic->check_link, &nic->check_link_mtx, 0);
mtx_lock(&nic->check_link_mtx);
nic_poll_for_link(nic);
mtx_unlock(&nic->check_link_mtx);
return (0);
err_free_intr:
nic_unregister_interrupts(nic);
err_free_res:
nicpf_free_res(nic);
pci_disable_busmaster(dev);
return (err);
}
static int
nicpf_detach(device_t dev)
{
struct nicpf *nic;
int err;
err = 0;
nic = device_get_softc(dev);
callout_drain(&nic->check_link);
mtx_destroy(&nic->check_link_mtx);
nic_unregister_interrupts(nic);
nicpf_free_res(nic);
pci_disable_busmaster(dev);
#ifdef PCI_IOV
err = pci_iov_detach(dev);
if (err != 0)
device_printf(dev, "SR-IOV in use. Detach first.\n");
#endif
return (err);
}
/*
* SR-IOV interface
*/
#ifdef PCI_IOV
static int
nicpf_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
{
struct nicpf *nic;
nic = device_get_softc(dev);
if (num_vfs == 0)
return (ENXIO);
nic->flags |= NIC_SRIOV_ENABLED;
return (0);
}
static void
nicpf_iov_uninit(device_t dev)
{
/* ARM64TODO: Implement this function */
}
static int
nicpf_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
{
const void *mac;
struct nicpf *nic;
size_t size;
int bgx, lmac;
nic = device_get_softc(dev);
if ((nic->flags & NIC_SRIOV_ENABLED) == 0)
return (ENXIO);
if (vfnum > (nic->num_vf_en - 1))
return (EINVAL);
if (nvlist_exists_binary(params, "mac-addr") != 0) {
mac = nvlist_get_binary(params, "mac-addr", &size);
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vfnum]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vfnum]);
bgx_set_lmac_mac(nic->node, bgx, lmac, mac);
}
return (0);
}
#endif
/*
* Helper routines
*/
static int
nicpf_alloc_res(struct nicpf *nic)
{
device_t dev;
int rid;
dev = nic->dev;
rid = VNIC_PF_REG_RID;
nic->reg_base = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (nic->reg_base == NULL) {
/* For verbose output print some more details */
if (bootverbose) {
device_printf(dev,
"Could not allocate registers memory\n");
}
return (ENXIO);
}
return (0);
}
static void
nicpf_free_res(struct nicpf *nic)
{
device_t dev;
dev = nic->dev;
if (nic->reg_base != NULL) {
bus_release_resource(dev, SYS_RES_MEMORY,
rman_get_rid(nic->reg_base), nic->reg_base);
}
}
/* Register read/write APIs */
static __inline void
nic_reg_write(struct nicpf *nic, bus_space_handle_t offset,
uint64_t val)
{
bus_write_8(nic->reg_base, offset, val);
}
static __inline uint64_t
nic_reg_read(struct nicpf *nic, uint64_t offset)
{
uint64_t val;
val = bus_read_8(nic->reg_base, offset);
return (val);
}
/* PF -> VF mailbox communication APIs */
static void
nic_enable_mbx_intr(struct nicpf *nic)
{
/* Enable mailbox interrupt for all 128 VFs */
nic_reg_write(nic, NIC_PF_MAILBOX_ENA_W1S, ~0UL);
nic_reg_write(nic, NIC_PF_MAILBOX_ENA_W1S + sizeof(uint64_t), ~0UL);
}
static void
nic_clear_mbx_intr(struct nicpf *nic, int vf, int mbx_reg)
{
nic_reg_write(nic, NIC_PF_MAILBOX_INT + (mbx_reg << 3), (1UL << vf));
}
static uint64_t
nic_get_mbx_addr(int vf)
{
return (NIC_PF_VF_0_127_MAILBOX_0_1 + (vf << NIC_VF_NUM_SHIFT));
}
/*
* Send a mailbox message to VF
* @vf: VF to which this message is to be sent
* @mbx: Message to be sent
*/
static void
nic_send_msg_to_vf(struct nicpf *nic, int vf, union nic_mbx *mbx)
{
bus_space_handle_t mbx_addr = nic_get_mbx_addr(vf);
uint64_t *msg = (uint64_t *)mbx;
/*
* In first revision HW, the mbox interrupt is triggered
* when PF writes to MBOX(1), in next revisions when
* PF writes to MBOX(0)
*/
if (pass1_silicon(nic->dev)) {
nic_reg_write(nic, mbx_addr + 0, msg[0]);
nic_reg_write(nic, mbx_addr + 8, msg[1]);
} else {
nic_reg_write(nic, mbx_addr + 8, msg[1]);
nic_reg_write(nic, mbx_addr + 0, msg[0]);
}
}
/*
* Responds to VF's READY message with VF's
* ID, node, MAC address, etc.
* @vf: VF which sent READY message
*/
static void
nic_mbx_send_ready(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
int bgx_idx, lmac;
const char *mac;
mbx.nic_cfg.msg = NIC_MBOX_MSG_READY;
mbx.nic_cfg.vf_id = vf;
if (nic->flags & NIC_TNS_ENABLED)
mbx.nic_cfg.tns_mode = NIC_TNS_MODE;
else
mbx.nic_cfg.tns_mode = NIC_TNS_BYPASS_MODE;
if (vf < MAX_LMAC) {
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
mac = bgx_get_lmac_mac(nic->node, bgx_idx, lmac);
if (mac) {
memcpy((uint8_t *)&mbx.nic_cfg.mac_addr, mac,
ETHER_ADDR_LEN);
}
}
mbx.nic_cfg.node_id = nic->node;
mbx.nic_cfg.loopback_supported = vf < MAX_LMAC;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* ACKs VF's mailbox message
* @vf: VF to which the ACK is to be sent
*/
static void
nic_mbx_send_ack(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_ACK;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* NACKs VF's mailbox message to indicate that the PF is not able to
* complete the requested action
* @vf: VF to which the NACK is to be sent
*/
static void
nic_mbx_send_nack(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_NACK;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* Flush all in-flight receive packets to memory and
* bring down an active RQ
*/
static int
nic_rcv_queue_sw_sync(struct nicpf *nic)
{
uint16_t timeout = ~0x00;
nic_reg_write(nic, NIC_PF_SW_SYNC_RX, 0x01);
/* Wait till sync cycle is finished */
while (timeout) {
if (nic_reg_read(nic, NIC_PF_SW_SYNC_RX_DONE) & 0x1)
break;
timeout--;
}
nic_reg_write(nic, NIC_PF_SW_SYNC_RX, 0x00);
if (!timeout) {
device_printf(nic->dev, "Receive queue software sync failed\n");
return (ETIMEDOUT);
}
return (0);
}
/* Get BGX Rx/Tx stats and respond to VF's request */
static void
nic_get_bgx_stats(struct nicpf *nic, struct bgx_stats_msg *bgx)
{
int bgx_idx, lmac;
union nic_mbx mbx = {};
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
mbx.bgx_stats.msg = NIC_MBOX_MSG_BGX_STATS;
mbx.bgx_stats.vf_id = bgx->vf_id;
mbx.bgx_stats.rx = bgx->rx;
mbx.bgx_stats.idx = bgx->idx;
if (bgx->rx != 0) {
mbx.bgx_stats.stats =
bgx_get_rx_stats(nic->node, bgx_idx, lmac, bgx->idx);
} else {
mbx.bgx_stats.stats =
bgx_get_tx_stats(nic->node, bgx_idx, lmac, bgx->idx);
}
nic_send_msg_to_vf(nic, bgx->vf_id, &mbx);
}
/* Update hardware min/max frame size */
static int
nic_update_hw_frs(struct nicpf *nic, int new_frs, int vf)
{
if ((new_frs > NIC_HW_MAX_FRS) || (new_frs < NIC_HW_MIN_FRS)) {
device_printf(nic->dev,
"Invalid MTU setting from VF%d rejected, "
"should be between %d and %d\n",
vf, NIC_HW_MIN_FRS, NIC_HW_MAX_FRS);
return (EINVAL);
}
new_frs += ETHER_HDR_LEN;
if (new_frs <= nic->pkind.maxlen)
return (0);
nic->pkind.maxlen = new_frs;
nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG, *(uint64_t *)&nic->pkind);
return (0);
}
/* Set minimum transmit packet size */
static void
nic_set_tx_pkt_pad(struct nicpf *nic, int size)
{
int lmac;
uint64_t lmac_cfg;
/* Max value that can be set is 60 */
if (size > 60)
size = 60;
for (lmac = 0; lmac < (MAX_BGX_PER_CN88XX * MAX_LMAC_PER_BGX); lmac++) {
lmac_cfg = nic_reg_read(nic, NIC_PF_LMAC_0_7_CFG | (lmac << 3));
lmac_cfg &= ~(0xF << 2);
lmac_cfg |= ((size / 4) << 2);
nic_reg_write(nic, NIC_PF_LMAC_0_7_CFG | (lmac << 3), lmac_cfg);
}
}
/*
* Function to check number of LMACs present and set VF::LMAC mapping.
* Mapping will be used while initializing channels.
*/
static void
nic_set_lmac_vf_mapping(struct nicpf *nic)
{
unsigned bgx_map = bgx_get_map(nic->node);
int bgx, next_bgx_lmac = 0;
int lmac, lmac_cnt = 0;
uint64_t lmac_credit;
nic->num_vf_en = 0;
if (nic->flags & NIC_TNS_ENABLED) {
nic->num_vf_en = DEFAULT_NUM_VF_ENABLED;
return;
}
for (bgx = 0; bgx < NIC_MAX_BGX; bgx++) {
if ((bgx_map & (1 << bgx)) == 0)
continue;
lmac_cnt = bgx_get_lmac_count(nic->node, bgx);
for (lmac = 0; lmac < lmac_cnt; lmac++)
nic->vf_lmac_map[next_bgx_lmac++] =
NIC_SET_VF_LMAC_MAP(bgx, lmac);
nic->num_vf_en += lmac_cnt;
/* Program LMAC credits */
lmac_credit = (1UL << 1); /* channel credit enable */
lmac_credit |= (0x1ff << 2); /* Max outstanding pkt count */
/* 48KB BGX Tx buffer size, each unit is of size 16 bytes */
lmac_credit |= (((((48 * 1024) / lmac_cnt) -
NIC_HW_MAX_FRS) / 16) << 12);
lmac = bgx * MAX_LMAC_PER_BGX;
for (; lmac < lmac_cnt + (bgx * MAX_LMAC_PER_BGX); lmac++) {
nic_reg_write(nic, NIC_PF_LMAC_0_7_CREDIT + (lmac * 8),
lmac_credit);
}
}
}
#define TNS_PORT0_BLOCK 6
#define TNS_PORT1_BLOCK 7
#define BGX0_BLOCK 8
#define BGX1_BLOCK 9
static void
nic_init_hw(struct nicpf *nic)
{
int i;
/* Enable NIC HW block */
nic_reg_write(nic, NIC_PF_CFG, 0x3);
/* Enable backpressure */
nic_reg_write(nic, NIC_PF_BP_CFG, (1UL << 6) | 0x03);
if (nic->flags & NIC_TNS_ENABLED) {
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
(NIC_TNS_MODE << 7) | TNS_PORT0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
(NIC_TNS_MODE << 7) | TNS_PORT1_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
(1UL << 63) | TNS_PORT0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
(1UL << 63) | TNS_PORT1_BLOCK);
} else {
/* Disable TNS mode on both interfaces */
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
(NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
(NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
(1UL << 63) | BGX0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
(1UL << 63) | BGX1_BLOCK);
}
/* PKIND configuration */
nic->pkind.minlen = 0;
nic->pkind.maxlen = NIC_HW_MAX_FRS + ETHER_HDR_LEN;
nic->pkind.lenerr_en = 1;
nic->pkind.rx_hdr = 0;
nic->pkind.hdr_sl = 0;
for (i = 0; i < NIC_MAX_PKIND; i++) {
nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG | (i << 3),
*(uint64_t *)&nic->pkind);
}
nic_set_tx_pkt_pad(nic, NIC_HW_MIN_FRS);
/* Timer config */
nic_reg_write(nic, NIC_PF_INTR_TIMER_CFG, NICPF_CLK_PER_INT_TICK);
/* Enable VLAN ethertype matching and stripping */
nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7,
(2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETHERTYPE_VLAN);
}
/* Channel parse index configuration */
static void
nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
{
uint32_t vnic, bgx, lmac, chan;
uint32_t padd, cpi_count = 0;
uint64_t cpi_base, cpi, rssi_base, rssi;
uint8_t qset, rq_idx = 0;
vnic = cfg->vf_id;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
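/*
 * Each LMAC owns a contiguous block of channel, CPI and RSSI
 * entries; derive the base indices for this VF from its
 * BGX/LMAC mapping:
 */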
chan = (lmac * MAX_BGX_CHANS_PER_LMAC) + (bgx * NIC_CHANS_PER_INF);
cpi_base = (lmac * NIC_MAX_CPI_PER_LMAC) + (bgx * NIC_CPI_PER_BGX);
rssi_base = (lmac * nic->rss_ind_tbl_size) + (bgx * NIC_RSSI_PER_BGX);
/* Rx channel configuration */
nic_reg_write(nic, NIC_PF_CHAN_0_255_RX_BP_CFG | (chan << 3),
(1UL << 63) | (vnic << 0));
nic_reg_write(nic, NIC_PF_CHAN_0_255_RX_CFG | (chan << 3),
((uint64_t)cfg->cpi_alg << 62) | (cpi_base << 48));
if (cfg->cpi_alg == CPI_ALG_NONE)
cpi_count = 1;
else if (cfg->cpi_alg == CPI_ALG_VLAN) /* 3 bits of PCP */
cpi_count = 8;
else if (cfg->cpi_alg == CPI_ALG_VLAN16) /* 3 bits PCP + DEI */
cpi_count = 16;
else if (cfg->cpi_alg == CPI_ALG_DIFF) /* 6 bits of DSCP */
cpi_count = NIC_MAX_CPI_PER_LMAC;
/* RSS Qset, Qidx mapping */
qset = cfg->vf_id;
rssi = rssi_base;
for (; rssi < (rssi_base + cfg->rq_cnt); rssi++) {
nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
(qset << 3) | rq_idx);
rq_idx++;
}
rssi = 0;
cpi = cpi_base;
for (; cpi < (cpi_base + cpi_count); cpi++) {
/* Determine port to channel adder */
if (cfg->cpi_alg != CPI_ALG_DIFF)
padd = cpi % cpi_count;
else
padd = cpi % 8; /* 3 bits of CS out of 6 bits of DSCP */
/* Leave RSS_SIZE as '0' to disable RSS */
if (pass1_silicon(nic->dev)) {
nic_reg_write(nic, NIC_PF_CPI_0_2047_CFG | (cpi << 3),
(vnic << 24) | (padd << 16) | (rssi_base + rssi));
} else {
/* Set MPI_ALG to '0' to disable MCAM parsing */
nic_reg_write(nic, NIC_PF_CPI_0_2047_CFG | (cpi << 3),
(padd << 16));
/* MPI index is same as CPI if MPI_ALG is not enabled */
nic_reg_write(nic, NIC_PF_MPI_0_2047_CFG | (cpi << 3),
(vnic << 24) | (rssi_base + rssi));
}
if ((rssi + 1) >= cfg->rq_cnt)
continue;
if (cfg->cpi_alg == CPI_ALG_VLAN)
rssi++;
else if (cfg->cpi_alg == CPI_ALG_VLAN16)
rssi = ((cpi - cpi_base) & 0xe) >> 1;
else if (cfg->cpi_alg == CPI_ALG_DIFF)
rssi = ((cpi - cpi_base) & 0x38) >> 3;
}
nic->cpi_base[cfg->vf_id] = cpi_base;
nic->rssi_base[cfg->vf_id] = rssi_base;
}
/* Responds to VF with its RSS indirection table size */
static void
nic_send_rss_size(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
- uint64_t *msg;
-
- msg = (uint64_t *)&mbx;
mbx.rss_size.msg = NIC_MBOX_MSG_RSS_SIZE;
mbx.rss_size.ind_tbl_size = nic->rss_ind_tbl_size;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* Receive side scaling configuration
* configure:
* - RSS index
* - indirection table, i.e. hash::RQ mapping
* - number of hash bits to consider
*/
static void
nic_config_rss(struct nicpf *nic, struct rss_cfg_msg *cfg)
{
uint8_t qset, idx;
uint64_t cpi_cfg, cpi_base, rssi_base, rssi;
uint64_t idx_addr;
idx = 0;
rssi_base = nic->rssi_base[cfg->vf_id] + cfg->tbl_offset;
rssi = rssi_base;
qset = cfg->vf_id;
for (; rssi < (rssi_base + cfg->tbl_len); rssi++) {
nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
(qset << 3) | (cfg->ind_tbl[idx] & 0x7));
idx++;
}
cpi_base = nic->cpi_base[cfg->vf_id];
if (pass1_silicon(nic->dev))
idx_addr = NIC_PF_CPI_0_2047_CFG;
else
idx_addr = NIC_PF_MPI_0_2047_CFG;
cpi_cfg = nic_reg_read(nic, idx_addr | (cpi_base << 3));
cpi_cfg &= ~(0xFUL << 20);
cpi_cfg |= (cfg->hash_bits << 20);
nic_reg_write(nic, idx_addr | (cpi_base << 3), cpi_cfg);
}
/*
* 4 level transmit side scheduler configuration
* for TNS bypass mode
*
* Sample configuration for SQ0
* VNIC0-SQ0 -> TL4(0) -> TL3[0] -> TL2[0] -> TL1[0] -> BGX0
* VNIC1-SQ0 -> TL4(8) -> TL3[2] -> TL2[0] -> TL1[0] -> BGX0
* VNIC2-SQ0 -> TL4(16) -> TL3[4] -> TL2[1] -> TL1[0] -> BGX0
* VNIC3-SQ0 -> TL4(24) -> TL3[6] -> TL2[1] -> TL1[0] -> BGX0
* VNIC4-SQ0 -> TL4(512) -> TL3[128] -> TL2[32] -> TL1[1] -> BGX1
* VNIC5-SQ0 -> TL4(520) -> TL3[130] -> TL2[32] -> TL1[1] -> BGX1
* VNIC6-SQ0 -> TL4(528) -> TL3[132] -> TL2[33] -> TL1[1] -> BGX1
* VNIC7-SQ0 -> TL4(536) -> TL3[134] -> TL2[33] -> TL1[1] -> BGX1
*/
static void
nic_tx_channel_cfg(struct nicpf *nic, uint8_t vnic, struct sq_cfg_msg *sq)
{
uint32_t bgx, lmac, chan;
uint32_t tl2, tl3, tl4;
uint32_t rr_quantum;
uint8_t sq_idx = sq->sq_num;
uint8_t pqs_vnic;
pqs_vnic = vnic;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[pqs_vnic]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[pqs_vnic]);
/* 24 bytes for FCS, IPG and preamble */
rr_quantum = ((NIC_HW_MAX_FRS + 24) / 4);
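/*
 * Walk down the scheduler hierarchy: every SQ gets its own TL4,
 * TL4s are grouped into TL3s and each TL2 serves four TL3s,
 * matching the sample mapping above:
 */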
tl4 = (lmac * NIC_TL4_PER_LMAC) + (bgx * NIC_TL4_PER_BGX);
tl4 += sq_idx;
tl3 = tl4 / (NIC_MAX_TL4 / NIC_MAX_TL3);
nic_reg_write(nic, NIC_PF_QSET_0_127_SQ_0_7_CFG2 |
((uint64_t)vnic << NIC_QS_ID_SHIFT) |
((uint32_t)sq_idx << NIC_Q_NUM_SHIFT), tl4);
nic_reg_write(nic, NIC_PF_TL4_0_1023_CFG | (tl4 << 3),
((uint64_t)vnic << 27) | ((uint32_t)sq_idx << 24) | rr_quantum);
nic_reg_write(nic, NIC_PF_TL3_0_255_CFG | (tl3 << 3), rr_quantum);
chan = (lmac * MAX_BGX_CHANS_PER_LMAC) + (bgx * NIC_CHANS_PER_INF);
nic_reg_write(nic, NIC_PF_TL3_0_255_CHAN | (tl3 << 3), chan);
/* Enable backpressure on the channel */
nic_reg_write(nic, NIC_PF_CHAN_0_255_TX_CFG | (chan << 3), 1);
tl2 = tl3 >> 2;
nic_reg_write(nic, NIC_PF_TL3A_0_63_CFG | (tl2 << 3), tl2);
nic_reg_write(nic, NIC_PF_TL2_0_63_CFG | (tl2 << 3), rr_quantum);
/* No priorities as of now */
nic_reg_write(nic, NIC_PF_TL2_0_63_PRI | (tl2 << 3), 0x00);
}
static int
nic_config_loopback(struct nicpf *nic, struct set_loopback *lbk)
{
int bgx_idx, lmac_idx;
if (lbk->vf_id > MAX_LMAC)
return (ENXIO);
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
lmac_idx = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
bgx_lmac_internal_loopback(nic->node, bgx_idx, lmac_idx, lbk->enable);
return (0);
}
/* Interrupt handler to handle mailbox messages from VFs */
static void
nic_handle_mbx_intr(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
uint64_t *mbx_data;
uint64_t mbx_addr;
uint64_t reg_addr;
uint64_t cfg;
int bgx, lmac;
int i;
int ret = 0;
nic->mbx_lock[vf] = TRUE;
mbx_addr = nic_get_mbx_addr(vf);
mbx_data = (uint64_t *)&mbx;
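/* copy the complete mailbox message from the hardware registers */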
for (i = 0; i < NIC_PF_VF_MAILBOX_SIZE; i++) {
*mbx_data = nic_reg_read(nic, mbx_addr);
mbx_data++;
mbx_addr += sizeof(uint64_t);
}
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
nic_mbx_send_ready(nic, vf);
if (vf < MAX_LMAC) {
nic->link[vf] = 0;
nic->duplex[vf] = 0;
nic->speed[vf] = 0;
}
ret = 1;
break;
case NIC_MBOX_MSG_QS_CFG:
reg_addr = NIC_PF_QSET_0_127_CFG |
(mbx.qs.num << NIC_QS_ID_SHIFT);
cfg = mbx.qs.cfg;
nic_reg_write(nic, reg_addr, cfg);
break;
case NIC_MBOX_MSG_RQ_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_RQ_BP_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_BP_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_RQ_SW_SYNC:
ret = nic_rcv_queue_sw_sync(nic);
break;
case NIC_MBOX_MSG_RQ_DROP_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_DROP_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_SQ_CFG:
reg_addr = NIC_PF_QSET_0_127_SQ_0_7_CFG |
(mbx.sq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.sq.sq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.sq.cfg);
nic_tx_channel_cfg(nic, mbx.qs.num, &mbx.sq);
break;
case NIC_MBOX_MSG_SET_MAC:
lmac = mbx.mac.vf_id;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
bgx_set_lmac_mac(nic->node, bgx, lmac, mbx.mac.mac_addr);
break;
case NIC_MBOX_MSG_SET_MAX_FRS:
ret = nic_update_hw_frs(nic, mbx.frs.max_frs, mbx.frs.vf_id);
break;
case NIC_MBOX_MSG_CPI_CFG:
nic_config_cpi(nic, &mbx.cpi_cfg);
break;
case NIC_MBOX_MSG_RSS_SIZE:
nic_send_rss_size(nic, vf);
goto unlock;
case NIC_MBOX_MSG_RSS_CFG:
case NIC_MBOX_MSG_RSS_CFG_CONT: /* fall through */
nic_config_rss(nic, &mbx.rss_cfg);
break;
case NIC_MBOX_MSG_CFG_DONE:
/* Last message of VF config msg sequence */
nic->vf_info[vf].vf_enabled = TRUE;
goto unlock;
case NIC_MBOX_MSG_SHUTDOWN:
/* First msg in VF teardown sequence */
nic->vf_info[vf].vf_enabled = FALSE;
break;
case NIC_MBOX_MSG_BGX_STATS:
nic_get_bgx_stats(nic, &mbx.bgx_stats);
goto unlock;
case NIC_MBOX_MSG_LOOPBACK:
ret = nic_config_loopback(nic, &mbx.lbk);
break;
default:
device_printf(nic->dev,
"Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg);
break;
}
if (ret == 0)
nic_mbx_send_ack(nic, vf);
else if (mbx.msg.msg != NIC_MBOX_MSG_READY)
nic_mbx_send_nack(nic, vf);
unlock:
nic->mbx_lock[vf] = FALSE;
}
static void
nic_mbx_intr_handler(struct nicpf *nic, int mbx)
{
uint64_t intr;
uint8_t vf, vf_per_mbx_reg = 64;
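/*
 * Each mailbox interrupt register covers 64 VFs; scan the
 * pending bits and service every VF that has sent a message:
 */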
intr = nic_reg_read(nic, NIC_PF_MAILBOX_INT + (mbx << 3));
for (vf = 0; vf < vf_per_mbx_reg; vf++) {
if (intr & (1UL << vf)) {
nic_handle_mbx_intr(nic, vf + (mbx * vf_per_mbx_reg));
nic_clear_mbx_intr(nic, vf, mbx);
}
}
}
static void
nic_mbx0_intr_handler(void *arg)
{
struct nicpf *nic = (struct nicpf *)arg;
nic_mbx_intr_handler(nic, 0);
}
static void
nic_mbx1_intr_handler(void *arg)
{
struct nicpf *nic = (struct nicpf *)arg;
nic_mbx_intr_handler(nic, 1);
}
static int
nic_enable_msix(struct nicpf *nic)
{
struct pci_devinfo *dinfo;
int rid, count;
int ret;
dinfo = device_get_ivars(nic->dev);
rid = dinfo->cfg.msix.msix_table_bar;
nic->msix_table_res =
bus_alloc_resource_any(nic->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (nic->msix_table_res == NULL) {
device_printf(nic->dev,
"Could not allocate memory for MSI-X table\n");
return (ENXIO);
}
count = nic->num_vec = NIC_PF_MSIX_VECTORS;
ret = pci_alloc_msix(nic->dev, &count);
if ((ret != 0) || (count != nic->num_vec)) {
device_printf(nic->dev,
"Request for #%d msix vectors failed, error: %d\n",
nic->num_vec, ret);
return (ret != 0 ? ret : ENXIO);
}
nic->msix_enabled = 1;
return (0);
}
static void
nic_disable_msix(struct nicpf *nic)
{
if (nic->msix_enabled) {
pci_release_msi(nic->dev);
nic->msix_enabled = 0;
nic->num_vec = 0;
}
bus_release_resource(nic->dev, SYS_RES_MEMORY,
rman_get_rid(nic->msix_table_res), nic->msix_table_res);
}
static void
nic_free_all_interrupts(struct nicpf *nic)
{
int irq;
for (irq = 0; irq < nic->num_vec; irq++) {
if (nic->msix_entries[irq].irq_res == NULL)
continue;
if (nic->msix_entries[irq].handle != NULL) {
bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
}
bus_release_resource(nic->dev, SYS_RES_IRQ, irq + 1,
nic->msix_entries[irq].irq_res);
}
}
static int
nic_register_interrupts(struct nicpf *nic)
{
int irq, rid;
int ret;
/* Enable MSI-X */
ret = nic_enable_msix(nic);
if (ret != 0)
return (ret);
/* Register mailbox interrupt handlers */
irq = NIC_PF_INTR_ID_MBOX0;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
ret = ENXIO;
goto fail;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nic_mbx0_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0)
goto fail;
irq = NIC_PF_INTR_ID_MBOX1;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
ret = ENXIO;
goto fail;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nic_mbx1_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0)
goto fail;
/* Enable mailbox interrupt */
nic_enable_mbx_intr(nic);
return (0);
fail:
nic_free_all_interrupts(nic);
return (ret);
}
static void
nic_unregister_interrupts(struct nicpf *nic)
{
nic_free_all_interrupts(nic);
nic_disable_msix(nic);
}
static int
nic_sriov_init(device_t dev, struct nicpf *nic)
{
#ifdef PCI_IOV
nvlist_t *pf_schema, *vf_schema;
int iov_pos;
int err;
uint16_t total_vf_cnt;
err = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);
if (err != 0) {
device_printf(dev,
"SR-IOV capability is not found in PCIe config space\n");
return (err);
}
/* Fix-up the number of enabled VFs */
total_vf_cnt = pci_read_config(dev, iov_pos + PCIR_SRIOV_TOTAL_VFS, 2);
if (total_vf_cnt == 0)
return (ENXIO);
/* Attach SR-IOV */
pf_schema = pci_iov_schema_alloc_node();
vf_schema = pci_iov_schema_alloc_node();
pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
/*
* All VFs can change their MACs.
* This flag will be ignored but we set it just for the record.
*/
pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
IOV_SCHEMA_HASDEFAULT, TRUE);
err = pci_iov_attach(dev, pf_schema, vf_schema);
if (err != 0) {
device_printf(dev,
"Failed to initialize SR-IOV (error=%d)\n",
err);
return (err);
}
#endif
return (0);
}
/*
* Poll for BGX LMAC link status and update the corresponding VF when it
* changes.  This is valid only if the internal L2 switch is not present;
* otherwise the VF link is always treated as up.
*/
static void
nic_poll_for_link(void *arg)
{
union nic_mbx mbx = {};
struct nicpf *nic;
struct bgx_link_status link;
uint8_t vf, bgx, lmac;
nic = (struct nicpf *)arg;
mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
for (vf = 0; vf < nic->num_vf_en; vf++) {
/* Poll only if VF is UP */
if (!nic->vf_info[vf].vf_enabled)
continue;
/* Get BGX, LMAC indices for the VF */
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
/* Get interface link status */
bgx_get_lmac_link_state(nic->node, bgx, lmac, &link);
/* Inform VF only if link status changed */
if (nic->link[vf] == link.link_up)
continue;
if (!nic->mbx_lock[vf]) {
nic->link[vf] = link.link_up;
nic->duplex[vf] = link.duplex;
nic->speed[vf] = link.speed;
/* Send a mbox message to VF with current link status */
mbx.link_status.link_up = link.link_up;
mbx.link_status.duplex = link.duplex;
mbx.link_status.speed = link.speed;
nic_send_msg_to_vf(nic, vf, &mbx);
}
}
callout_reset(&nic->check_link, hz * 2, nic_poll_for_link, nic);
}
Index: head/sys/dev/vnic/nicvf_main.c
===================================================================
--- head/sys/dev/vnic/nicvf_main.c (revision 327172)
+++ head/sys/dev/vnic/nicvf_main.c (revision 327173)
@@ -1,1627 +1,1625 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/stdatomic.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp_lro.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <sys/dnv.h>
#include <sys/nv.h>
#include <sys/iov_schema.h>
#include <machine/bus.h>
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "nicvf_queues.h"
#define VNIC_VF_DEVSTR "Cavium Thunder NIC Virtual Function Driver"
#define VNIC_VF_REG_RID PCIR_BAR(PCI_CFG_REG_BAR_NUM)
/* Lock for core interface settings */
#define NICVF_CORE_LOCK_INIT(nic) \
sx_init(&(nic)->core_sx, device_get_nameunit((nic)->dev))
#define NICVF_CORE_LOCK_DESTROY(nic) \
sx_destroy(&(nic)->core_sx)
#define NICVF_CORE_LOCK(nic) sx_xlock(&(nic)->core_sx)
#define NICVF_CORE_UNLOCK(nic) sx_xunlock(&(nic)->core_sx)
#define NICVF_CORE_LOCK_ASSERT(nic) sx_assert(&(nic)->core_sx, SA_XLOCKED)
#define SPEED_10 10
#define SPEED_100 100
#define SPEED_1000 1000
#define SPEED_10000 10000
#define SPEED_40000 40000
MALLOC_DEFINE(M_NICVF, "nicvf", "ThunderX VNIC VF dynamic memory");
static int nicvf_probe(device_t);
static int nicvf_attach(device_t);
static int nicvf_detach(device_t);
static device_method_t nicvf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nicvf_probe),
DEVMETHOD(device_attach, nicvf_attach),
DEVMETHOD(device_detach, nicvf_detach),
DEVMETHOD_END,
};
static driver_t nicvf_driver = {
"vnic",
nicvf_methods,
sizeof(struct nicvf),
};
static devclass_t nicvf_devclass;
DRIVER_MODULE(vnicvf, pci, nicvf_driver, nicvf_devclass, 0, 0);
MODULE_VERSION(vnicvf, 1);
MODULE_DEPEND(vnicvf, pci, 1, 1, 1);
MODULE_DEPEND(vnicvf, ether, 1, 1, 1);
MODULE_DEPEND(vnicvf, vnicpf, 1, 1, 1);
static int nicvf_allocate_misc_interrupt(struct nicvf *);
static int nicvf_enable_misc_interrupt(struct nicvf *);
static int nicvf_allocate_net_interrupts(struct nicvf *);
static void nicvf_release_all_interrupts(struct nicvf *);
static int nicvf_update_hw_max_frs(struct nicvf *, int);
static int nicvf_hw_set_mac_addr(struct nicvf *, uint8_t *);
static void nicvf_config_cpi(struct nicvf *);
static int nicvf_rss_init(struct nicvf *);
static int nicvf_init_resources(struct nicvf *);
static int nicvf_setup_ifnet(struct nicvf *);
static int nicvf_setup_ifmedia(struct nicvf *);
static void nicvf_hw_addr_random(uint8_t *);
static int nicvf_if_ioctl(struct ifnet *, u_long, caddr_t);
static void nicvf_if_init(void *);
static void nicvf_if_init_locked(struct nicvf *);
static int nicvf_if_transmit(struct ifnet *, struct mbuf *);
static void nicvf_if_qflush(struct ifnet *);
static uint64_t nicvf_if_getcounter(struct ifnet *, ift_counter);
static int nicvf_stop_locked(struct nicvf *);
static void nicvf_media_status(struct ifnet *, struct ifmediareq *);
static int nicvf_media_change(struct ifnet *);
static void nicvf_tick_stats(void *);
static int
nicvf_probe(device_t dev)
{
uint16_t vendor_id;
uint16_t device_id;
vendor_id = pci_get_vendor(dev);
device_id = pci_get_device(dev);
if (vendor_id != PCI_VENDOR_ID_CAVIUM)
return (ENXIO);
if (device_id == PCI_DEVICE_ID_THUNDER_NIC_VF ||
device_id == PCI_DEVICE_ID_THUNDER_PASS1_NIC_VF) {
device_set_desc(dev, VNIC_VF_DEVSTR);
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
nicvf_attach(device_t dev)
{
struct nicvf *nic;
int rid, qcount;
int err = 0;
uint8_t hwaddr[ETHER_ADDR_LEN];
uint8_t zeromac[] = {[0 ... (ETHER_ADDR_LEN - 1)] = 0};
nic = device_get_softc(dev);
nic->dev = dev;
nic->pnicvf = nic;
NICVF_CORE_LOCK_INIT(nic);
/* Enable HW TSO on Pass2 */
if (!pass1_silicon(dev))
nic->hw_tso = TRUE;
rid = VNIC_VF_REG_RID;
nic->reg_base = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (nic->reg_base == NULL) {
device_printf(dev, "Could not allocate registers memory\n");
return (ENXIO);
}
qcount = MAX_CMP_QUEUES_PER_QS;
nic->max_queues = qcount;
err = nicvf_set_qset_resources(nic);
if (err != 0)
goto err_free_res;
/* Check if PF is alive and get MAC address for this VF */
err = nicvf_allocate_misc_interrupt(nic);
if (err != 0)
goto err_free_res;
NICVF_CORE_LOCK(nic);
err = nicvf_enable_misc_interrupt(nic);
NICVF_CORE_UNLOCK(nic);
if (err != 0)
goto err_release_intr;
err = nicvf_allocate_net_interrupts(nic);
if (err != 0) {
device_printf(dev,
"Could not allocate network interface interrupts\n");
goto err_release_intr;
}
/* If no MAC address was obtained, generate a random one */
if (memcmp(nic->hwaddr, zeromac, ETHER_ADDR_LEN) == 0) {
nicvf_hw_addr_random(hwaddr);
memcpy(nic->hwaddr, hwaddr, ETHER_ADDR_LEN);
NICVF_CORE_LOCK(nic);
nicvf_hw_set_mac_addr(nic, hwaddr);
NICVF_CORE_UNLOCK(nic);
}
/* Configure CPI algorithm */
nic->cpi_alg = CPI_ALG_NONE;
NICVF_CORE_LOCK(nic);
nicvf_config_cpi(nic);
/* Configure receive side scaling */
if (nic->qs->rq_cnt > 1)
nicvf_rss_init(nic);
NICVF_CORE_UNLOCK(nic);
err = nicvf_setup_ifnet(nic);
if (err != 0) {
device_printf(dev, "Could not set-up ifnet\n");
goto err_release_intr;
}
err = nicvf_setup_ifmedia(nic);
if (err != 0) {
device_printf(dev, "Could not set-up ifmedia\n");
goto err_free_ifnet;
}
mtx_init(&nic->stats_mtx, "VNIC stats", NULL, MTX_DEF);
callout_init_mtx(&nic->stats_callout, &nic->stats_mtx, 0);
ether_ifattach(nic->ifp, nic->hwaddr);
return (0);
err_free_ifnet:
if_free(nic->ifp);
err_release_intr:
nicvf_release_all_interrupts(nic);
err_free_res:
bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(nic->reg_base),
nic->reg_base);
return (err);
}
static int
nicvf_detach(device_t dev)
{
struct nicvf *nic;
nic = device_get_softc(dev);
NICVF_CORE_LOCK(nic);
/* Shut down the port and release ring resources */
nicvf_stop_locked(nic);
/* Release stats lock */
mtx_destroy(&nic->stats_mtx);
/* Release interrupts */
nicvf_release_all_interrupts(nic);
/* Release memory resource */
if (nic->reg_base != NULL) {
bus_release_resource(dev, SYS_RES_MEMORY,
rman_get_rid(nic->reg_base), nic->reg_base);
}
/* Remove all ifmedia configurations */
ifmedia_removeall(&nic->if_media);
/* Free this ifnet */
if_free(nic->ifp);
NICVF_CORE_UNLOCK(nic);
/* Finally destroy the lock */
NICVF_CORE_LOCK_DESTROY(nic);
return (0);
}
static void
nicvf_hw_addr_random(uint8_t *hwaddr)
{
uint32_t rnd;
uint8_t addr[ETHER_ADDR_LEN];
/*
* Create randomized MAC address.
* Set 'bsd' + random 24 low-order bits.
*/
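/* Note: 'b' (0x62) has the locally administered bit set, so the result is not a vendor-assigned (OUI) address. */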
rnd = arc4random() & 0x00ffffff;
addr[0] = 'b';
addr[1] = 's';
addr[2] = 'd';
addr[3] = rnd >> 16;
addr[4] = rnd >> 8;
addr[5] = rnd >> 0;
memcpy(hwaddr, addr, ETHER_ADDR_LEN);
}
static int
nicvf_setup_ifnet(struct nicvf *nic)
{
struct ifnet *ifp;
ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
device_printf(nic->dev, "Could not allocate ifnet structure\n");
return (ENOMEM);
}
nic->ifp = ifp;
if_setsoftc(ifp, nic);
if_initname(ifp, device_get_name(nic->dev), device_get_unit(nic->dev));
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_settransmitfn(ifp, nicvf_if_transmit);
if_setqflushfn(ifp, nicvf_if_qflush);
if_setioctlfn(ifp, nicvf_if_ioctl);
if_setinitfn(ifp, nicvf_if_init);
if_setgetcounterfn(ifp, nicvf_if_getcounter);
if_setmtu(ifp, ETHERMTU);
/* Reset caps */
if_setcapabilities(ifp, 0);
/* Set the default values */
if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU, 0);
if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
if (nic->hw_tso) {
/* TSO */
if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
/* TSO parameters */
if_sethwtsomax(ifp, NICVF_TSO_MAXSIZE);
if_sethwtsomaxsegcount(ifp, NICVF_TSO_NSEGS);
if_sethwtsomaxsegsize(ifp, MCLBYTES);
}
/* IP/TCP/UDP HW checksums */
if_setcapabilitiesbit(ifp, IFCAP_HWCSUM, 0);
if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0);
/*
* HW offload enable
*/
if_clearhwassist(ifp);
if_sethwassistbits(ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP), 0);
if (nic->hw_tso)
if_sethwassistbits(ifp, (CSUM_TSO), 0);
if_setcapenable(ifp, if_getcapabilities(ifp));
return (0);
}
static int
nicvf_setup_ifmedia(struct nicvf *nic)
{
ifmedia_init(&nic->if_media, IFM_IMASK, nicvf_media_change,
nicvf_media_status);
/*
* Advertise availability of all possible connection types,
* even though not all are possible at the same time.
*/
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_10_T | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_100_TX | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_1000_T | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_10G_SR | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_40G_CR4 | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_AUTO | IFM_FDX),
0, NULL);
ifmedia_set(&nic->if_media, (IFM_ETHER | IFM_AUTO | IFM_FDX));
return (0);
}
static int
nicvf_if_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct nicvf *nic;
struct rcv_queue *rq;
struct ifreq *ifr;
- uint32_t flags;
int mask, err;
int rq_idx;
#if defined(INET) || defined(INET6)
struct ifaddr *ifa;
boolean_t avoid_reset = FALSE;
#endif
nic = if_getsoftc(ifp);
ifr = (struct ifreq *)data;
#if defined(INET) || defined(INET6)
ifa = (struct ifaddr *)data;
#endif
err = 0;
switch (cmd) {
case SIOCSIFADDR:
#ifdef INET
if (ifa->ifa_addr->sa_family == AF_INET)
avoid_reset = TRUE;
#endif
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6)
avoid_reset = TRUE;
#endif
#if defined(INET) || defined(INET6)
/* Avoid reinitialization unless it's necessary */
if (avoid_reset) {
if_setflagbits(ifp, IFF_UP, 0);
if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
nicvf_if_init(nic);
#ifdef INET
if (!(if_getflags(ifp) & IFF_NOARP))
arp_ifinit(ifp, ifa);
#endif
return (0);
}
#endif
err = ether_ioctl(ifp, cmd, data);
break;
case SIOCSIFMTU:
if (ifr->ifr_mtu < NIC_HW_MIN_FRS ||
ifr->ifr_mtu > NIC_HW_MAX_FRS) {
err = EINVAL;
} else {
NICVF_CORE_LOCK(nic);
err = nicvf_update_hw_max_frs(nic, ifr->ifr_mtu);
if (err == 0)
if_setmtu(ifp, ifr->ifr_mtu);
NICVF_CORE_UNLOCK(nic);
}
break;
case SIOCSIFFLAGS:
NICVF_CORE_LOCK(nic);
if (if_getflags(ifp) & IFF_UP) {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
- flags = if_getflags(ifp) ^ nic->if_flags;
if ((nic->if_flags & if_getflags(ifp)) &
IFF_PROMISC) {
/* Change promiscuous mode */
#if 0
/* ARM64TODO */
nicvf_set_promiscous(nic);
#endif
}
if ((nic->if_flags ^ if_getflags(ifp)) &
IFF_ALLMULTI) {
/* Change multicasting settings */
#if 0
/* ARM64TODO */
nicvf_set_multicast(nic);
#endif
}
} else {
nicvf_if_init_locked(nic);
}
} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
nicvf_stop_locked(nic);
nic->if_flags = if_getflags(ifp);
NICVF_CORE_UNLOCK(nic);
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
#if 0
NICVF_CORE_LOCK(nic);
/* ARM64TODO */
nicvf_set_multicast(nic);
NICVF_CORE_UNLOCK(nic);
#endif
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
err = ifmedia_ioctl(ifp, ifr, &nic->if_media, cmd);
break;
case SIOCSIFCAP:
mask = if_getcapenable(ifp) ^ ifr->ifr_reqcap;
if (mask & IFCAP_VLAN_MTU) {
/* No work to do except acknowledge the change took. */
if_togglecapenable(ifp, IFCAP_VLAN_MTU);
}
if (mask & IFCAP_TXCSUM)
if_togglecapenable(ifp, IFCAP_TXCSUM);
if (mask & IFCAP_RXCSUM)
if_togglecapenable(ifp, IFCAP_RXCSUM);
if ((mask & IFCAP_TSO4) && nic->hw_tso)
if_togglecapenable(ifp, IFCAP_TSO4);
if (mask & IFCAP_LRO) {
/*
* Lock the driver for a moment to avoid
* mismatch in per-queue settings.
*/
NICVF_CORE_LOCK(nic);
if_togglecapenable(ifp, IFCAP_LRO);
if ((if_getdrvflags(nic->ifp) & IFF_DRV_RUNNING) != 0) {
/*
* Now disable LRO for subsequent packets.
* Atomicity of this change is not necessary
* as we don't need precise toggle of this
* feature for all threads processing the
* completion queue.
*/
for (rq_idx = 0;
rq_idx < nic->qs->rq_cnt; rq_idx++) {
rq = &nic->qs->rq[rq_idx];
rq->lro_enabled = !rq->lro_enabled;
}
}
NICVF_CORE_UNLOCK(nic);
}
break;
default:
err = ether_ioctl(ifp, cmd, data);
break;
}
return (err);
}
static void
nicvf_if_init_locked(struct nicvf *nic)
{
struct queue_set *qs = nic->qs;
struct ifnet *ifp;
int qidx;
int err;
caddr_t if_addr;
NICVF_CORE_LOCK_ASSERT(nic);
ifp = nic->ifp;
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0)
nicvf_stop_locked(nic);
err = nicvf_enable_misc_interrupt(nic);
if (err != 0) {
if_printf(ifp, "Could not reenable Mbox interrupt\n");
return;
}
/* Get the latest MAC address */
if_addr = if_getlladdr(ifp);
/* Update MAC address if changed */
if (memcmp(nic->hwaddr, if_addr, ETHER_ADDR_LEN) != 0) {
memcpy(nic->hwaddr, if_addr, ETHER_ADDR_LEN);
nicvf_hw_set_mac_addr(nic, if_addr);
}
/* Initialize the queues */
err = nicvf_init_resources(nic);
if (err != 0)
goto error;
/* Make sure queue initialization is written */
wmb();
nicvf_reg_write(nic, NIC_VF_INT, ~0UL);
/* Enable Qset err interrupt */
nicvf_enable_intr(nic, NICVF_INTR_QS_ERR, 0);
/* Enable completion queue interrupt */
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_enable_intr(nic, NICVF_INTR_CQ, qidx);
/* Enable RBDR threshold interrupt */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_enable_intr(nic, NICVF_INTR_RBDR, qidx);
nic->drv_stats.txq_stop = 0;
nic->drv_stats.txq_wake = 0;
/* Activate network interface */
if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
/* Schedule callout to update stats */
callout_reset(&nic->stats_callout, hz, nicvf_tick_stats, nic);
return;
error:
/* Something went very wrong. Disable this ifnet for good */
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
}
static void
nicvf_if_init(void *if_softc)
{
struct nicvf *nic = if_softc;
NICVF_CORE_LOCK(nic);
nicvf_if_init_locked(nic);
NICVF_CORE_UNLOCK(nic);
}
static int
nicvf_if_transmit(struct ifnet *ifp, struct mbuf *mbuf)
{
struct nicvf *nic = if_getsoftc(ifp);
struct queue_set *qs = nic->qs;
struct snd_queue *sq;
struct mbuf *mtmp;
int qidx;
int err = 0;
if (__predict_false(qs == NULL)) {
panic("%s: missing queue set for %s", __func__,
device_get_nameunit(nic->dev));
}
/* Select queue */
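/* Hash-tagged mbufs are spread by flow id, the rest by the sending CPU, both modulo the number of send queues. */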
if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
qidx = mbuf->m_pkthdr.flowid % qs->sq_cnt;
else
qidx = curcpu % qs->sq_cnt;
sq = &qs->sq[qidx];
if (mbuf->m_next != NULL &&
(mbuf->m_pkthdr.csum_flags &
(CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)) != 0) {
if (M_WRITABLE(mbuf) == 0) {
mtmp = m_dup(mbuf, M_NOWAIT);
m_freem(mbuf);
if (mtmp == NULL)
return (ENOBUFS);
mbuf = mtmp;
}
}
err = drbr_enqueue(ifp, sq->br, mbuf);
if (((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) || !nic->link_up || (err != 0)) {
/*
* Try to enqueue packet to the ring buffer.
* If the driver is not active, link down or enqueue operation
* failed, return with the appropriate error code.
*/
return (err);
}
if (NICVF_TX_TRYLOCK(sq) != 0) {
err = nicvf_xmit_locked(sq);
NICVF_TX_UNLOCK(sq);
return (err);
} else
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
return (0);
}
static void
nicvf_if_qflush(struct ifnet *ifp)
{
struct nicvf *nic;
struct queue_set *qs;
struct snd_queue *sq;
struct mbuf *mbuf;
size_t idx;
nic = if_getsoftc(ifp);
qs = nic->qs;
for (idx = 0; idx < qs->sq_cnt; idx++) {
sq = &qs->sq[idx];
NICVF_TX_LOCK(sq);
while ((mbuf = buf_ring_dequeue_sc(sq->br)) != NULL)
m_freem(mbuf);
NICVF_TX_UNLOCK(sq);
}
if_qflush(ifp);
}
static uint64_t
nicvf_if_getcounter(struct ifnet *ifp, ift_counter cnt)
{
struct nicvf *nic;
struct nicvf_hw_stats *hw_stats;
struct nicvf_drv_stats *drv_stats;
nic = if_getsoftc(ifp);
hw_stats = &nic->hw_stats;
drv_stats = &nic->drv_stats;
switch (cnt) {
case IFCOUNTER_IPACKETS:
return (drv_stats->rx_frames_ok);
case IFCOUNTER_OPACKETS:
return (drv_stats->tx_frames_ok);
case IFCOUNTER_IBYTES:
return (hw_stats->rx_bytes);
case IFCOUNTER_OBYTES:
return (hw_stats->tx_bytes_ok);
case IFCOUNTER_IMCASTS:
return (hw_stats->rx_mcast_frames);
case IFCOUNTER_COLLISIONS:
return (0);
case IFCOUNTER_IQDROPS:
return (drv_stats->rx_drops);
case IFCOUNTER_OQDROPS:
return (drv_stats->tx_drops);
default:
return (if_get_counter_default(ifp, cnt));
}
}
static void
nicvf_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
struct nicvf *nic = if_getsoftc(ifp);
NICVF_CORE_LOCK(nic);
ifmr->ifm_status = IFM_AVALID;
ifmr->ifm_active = IFM_ETHER;
if (nic->link_up) {
/* Device attached to working network */
ifmr->ifm_status |= IFM_ACTIVE;
}
switch (nic->speed) {
case SPEED_10:
ifmr->ifm_active |= IFM_10_T;
break;
case SPEED_100:
ifmr->ifm_active |= IFM_100_TX;
break;
case SPEED_1000:
ifmr->ifm_active |= IFM_1000_T;
break;
case SPEED_10000:
ifmr->ifm_active |= IFM_10G_SR;
break;
case SPEED_40000:
ifmr->ifm_active |= IFM_40G_CR4;
break;
default:
ifmr->ifm_active |= IFM_AUTO;
break;
}
if (nic->duplex)
ifmr->ifm_active |= IFM_FDX;
else
ifmr->ifm_active |= IFM_HDX;
NICVF_CORE_UNLOCK(nic);
}
static int
nicvf_media_change(struct ifnet *ifp __unused)
{
return (0);
}
/* Register read/write APIs */
void
nicvf_reg_write(struct nicvf *nic, bus_space_handle_t offset, uint64_t val)
{
bus_write_8(nic->reg_base, offset, val);
}
uint64_t
nicvf_reg_read(struct nicvf *nic, uint64_t offset)
{
return (bus_read_8(nic->reg_base, offset));
}
void
nicvf_queue_reg_write(struct nicvf *nic, bus_space_handle_t offset,
uint64_t qidx, uint64_t val)
{
bus_write_8(nic->reg_base, offset + (qidx << NIC_Q_NUM_SHIFT), val);
}
uint64_t
nicvf_queue_reg_read(struct nicvf *nic, bus_space_handle_t offset,
uint64_t qidx)
{
return (bus_read_8(nic->reg_base, offset + (qidx << NIC_Q_NUM_SHIFT)));
}
/* VF -> PF mailbox communication */
static void
nicvf_write_to_mbx(struct nicvf *nic, union nic_mbx *mbx)
{
uint64_t *msg = (uint64_t *)mbx;
nicvf_reg_write(nic, NIC_VF_PF_MAILBOX_0_1 + 0, msg[0]);
nicvf_reg_write(nic, NIC_VF_PF_MAILBOX_0_1 + 8, msg[1]);
}
int
nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
{
int timeout = NIC_MBOX_MSG_TIMEOUT * 10;
int sleep = 2;
NICVF_CORE_LOCK_ASSERT(nic);
nic->pf_acked = FALSE;
nic->pf_nacked = FALSE;
nicvf_write_to_mbx(nic, mbx);
/* Wait for the message to be acked, timeout 2sec */
while (!nic->pf_acked) {
if (nic->pf_nacked)
return (EINVAL);
DELAY(sleep * 1000);
if (nic->pf_acked)
break;
timeout -= sleep;
if (!timeout) {
device_printf(nic->dev,
"PF didn't ack to mbox msg %d from VF%d\n",
(mbx->msg.msg & 0xFF), nic->vf_id);
return (EBUSY);
}
}
return (0);
}
/*
* Checks if the VF is able to communicate with the PF
* and also gets the VNIC number this VF is associated with.
*/
static int
nicvf_check_pf_ready(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_READY;
if (nicvf_send_msg_to_pf(nic, &mbx)) {
device_printf(nic->dev,
"PF didn't respond to READY msg\n");
return (0);
}
return (1);
}
static void
nicvf_read_bgx_stats(struct nicvf *nic, struct bgx_stats_msg *bgx)
{
if (bgx->rx)
nic->bgx_stats.rx_stats[bgx->idx] = bgx->stats;
else
nic->bgx_stats.tx_stats[bgx->idx] = bgx->stats;
}
static void
nicvf_handle_mbx_intr(struct nicvf *nic)
{
union nic_mbx mbx = {};
uint64_t *mbx_data;
uint64_t mbx_addr;
int i;
mbx_addr = NIC_VF_PF_MAILBOX_0_1;
mbx_data = (uint64_t *)&mbx;
for (i = 0; i < NIC_PF_VF_MAILBOX_SIZE; i++) {
*mbx_data = nicvf_reg_read(nic, mbx_addr);
mbx_data++;
mbx_addr += sizeof(uint64_t);
}
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
nic->pf_acked = TRUE;
nic->vf_id = mbx.nic_cfg.vf_id & 0x7F;
nic->tns_mode = mbx.nic_cfg.tns_mode & 0x7F;
nic->node = mbx.nic_cfg.node_id;
memcpy(nic->hwaddr, mbx.nic_cfg.mac_addr, ETHER_ADDR_LEN);
nic->loopback_supported = mbx.nic_cfg.loopback_supported;
nic->link_up = FALSE;
nic->duplex = 0;
nic->speed = 0;
break;
case NIC_MBOX_MSG_ACK:
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_NACK:
nic->pf_nacked = TRUE;
break;
case NIC_MBOX_MSG_RSS_SIZE:
nic->rss_info.rss_size = mbx.rss_size.ind_tbl_size;
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_BGX_STATS:
nicvf_read_bgx_stats(nic, &mbx.bgx_stats);
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_BGX_LINK_CHANGE:
nic->pf_acked = TRUE;
nic->link_up = mbx.link_status.link_up;
nic->duplex = mbx.link_status.duplex;
nic->speed = mbx.link_status.speed;
if (nic->link_up) {
if_setbaudrate(nic->ifp, nic->speed * 1000000);
if_link_state_change(nic->ifp, LINK_STATE_UP);
} else {
if_setbaudrate(nic->ifp, 0);
if_link_state_change(nic->ifp, LINK_STATE_DOWN);
}
break;
default:
device_printf(nic->dev,
"Invalid message from PF, msg 0x%x\n", mbx.msg.msg);
break;
}
nicvf_clear_intr(nic, NICVF_INTR_MBOX, 0);
}
static int
nicvf_update_hw_max_frs(struct nicvf *nic, int mtu)
{
union nic_mbx mbx = {};
mbx.frs.msg = NIC_MBOX_MSG_SET_MAX_FRS;
mbx.frs.max_frs = mtu;
mbx.frs.vf_id = nic->vf_id;
return (nicvf_send_msg_to_pf(nic, &mbx));
}
static int
nicvf_hw_set_mac_addr(struct nicvf *nic, uint8_t *hwaddr)
{
union nic_mbx mbx = {};
mbx.mac.msg = NIC_MBOX_MSG_SET_MAC;
mbx.mac.vf_id = nic->vf_id;
memcpy(mbx.mac.mac_addr, hwaddr, ETHER_ADDR_LEN);
return (nicvf_send_msg_to_pf(nic, &mbx));
}
static void
nicvf_config_cpi(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.cpi_cfg.msg = NIC_MBOX_MSG_CPI_CFG;
mbx.cpi_cfg.vf_id = nic->vf_id;
mbx.cpi_cfg.cpi_alg = nic->cpi_alg;
mbx.cpi_cfg.rq_cnt = nic->qs->rq_cnt;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_get_rss_size(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.rss_size.msg = NIC_MBOX_MSG_RSS_SIZE;
mbx.rss_size.vf_id = nic->vf_id;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_config_rss(struct nicvf *nic)
{
union nic_mbx mbx = {};
struct nicvf_rss_info *rss;
int ind_tbl_len;
int i, nextq;
rss = &nic->rss_info;
ind_tbl_len = rss->rss_size;
nextq = 0;
mbx.rss_cfg.vf_id = nic->vf_id;
mbx.rss_cfg.hash_bits = rss->hash_bits;
while (ind_tbl_len != 0) {
mbx.rss_cfg.tbl_offset = nextq;
mbx.rss_cfg.tbl_len = MIN(ind_tbl_len,
RSS_IND_TBL_LEN_PER_MBX_MSG);
mbx.rss_cfg.msg = mbx.rss_cfg.tbl_offset ?
NIC_MBOX_MSG_RSS_CFG_CONT : NIC_MBOX_MSG_RSS_CFG;
for (i = 0; i < mbx.rss_cfg.tbl_len; i++)
mbx.rss_cfg.ind_tbl[i] = rss->ind_tbl[nextq++];
nicvf_send_msg_to_pf(nic, &mbx);
ind_tbl_len -= mbx.rss_cfg.tbl_len;
}
}
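/*
* Illustrative sketch only (not part of the driver): how the loop above
* walks the RSS indirection table in mailbox-sized chunks.  The chunk
* size of 8 used here is just an assumed stand-in for
* RSS_IND_TBL_LEN_PER_MBX_MSG; the real value comes from the headers.
*/
static __unused void
nicvf_rss_chunks_example(int ind_tbl_len)
{
int chunk = 8;	/* assumed stand-in for RSS_IND_TBL_LEN_PER_MBX_MSG */
int offset;
for (offset = 0; offset < ind_tbl_len; offset += chunk) {
/* First chunk maps to RSS_CFG, later ones to RSS_CFG_CONT */
printf("offset %d, len %d\n", offset,
MIN(chunk, ind_tbl_len - offset));
}
}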
static void
nicvf_set_rss_key(struct nicvf *nic)
{
struct nicvf_rss_info *rss;
uint64_t key_addr;
int idx;
rss = &nic->rss_info;
key_addr = NIC_VNIC_RSS_KEY_0_4;
for (idx = 0; idx < RSS_HASH_KEY_SIZE; idx++) {
nicvf_reg_write(nic, key_addr, rss->key[idx]);
key_addr += sizeof(uint64_t);
}
}
static int
nicvf_rss_init(struct nicvf *nic)
{
struct nicvf_rss_info *rss;
int idx;
nicvf_get_rss_size(nic);
rss = &nic->rss_info;
if (nic->cpi_alg != CPI_ALG_NONE) {
rss->enable = FALSE;
rss->hash_bits = 0;
return (ENXIO);
}
rss->enable = TRUE;
/* Using the HW reset value for now */
rss->key[0] = 0xFEED0BADFEED0BADUL;
rss->key[1] = 0xFEED0BADFEED0BADUL;
rss->key[2] = 0xFEED0BADFEED0BADUL;
rss->key[3] = 0xFEED0BADFEED0BADUL;
rss->key[4] = 0xFEED0BADFEED0BADUL;
nicvf_set_rss_key(nic);
rss->cfg = RSS_IP_HASH_ENA | RSS_TCP_HASH_ENA | RSS_UDP_HASH_ENA;
nicvf_reg_write(nic, NIC_VNIC_RSS_CFG, rss->cfg);
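/* E.g. with rss_size == 128, fls(128) - 1 == 7 hash bits index the whole table. */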
rss->hash_bits = fls(rss->rss_size) - 1;
for (idx = 0; idx < rss->rss_size; idx++)
rss->ind_tbl[idx] = idx % nic->rx_queues;
nicvf_config_rss(nic);
return (0);
}
static int
nicvf_init_resources(struct nicvf *nic)
{
int err;
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE;
/* Enable Qset */
nicvf_qset_config(nic, TRUE);
/* Initialize queues and HW for data transfer */
err = nicvf_config_data_transfer(nic, TRUE);
if (err) {
device_printf(nic->dev,
"Failed to alloc/config VF's QSet resources\n");
return (err);
}
/* Send VF config done msg to PF */
nicvf_write_to_mbx(nic, &mbx);
return (0);
}
static void
nicvf_misc_intr_handler(void *arg)
{
struct nicvf *nic = (struct nicvf *)arg;
uint64_t intr;
intr = nicvf_reg_read(nic, NIC_VF_INT);
/* Check for spurious interrupt */
if (!(intr & NICVF_INTR_MBOX_MASK))
return;
nicvf_handle_mbx_intr(nic);
}
static int
nicvf_intr_handler(void *arg)
{
struct nicvf *nic;
struct cmp_queue *cq;
int qidx;
cq = (struct cmp_queue *)arg;
nic = cq->nic;
qidx = cq->idx;
/* Disable interrupts */
nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
taskqueue_enqueue(cq->cmp_taskq, &cq->cmp_task);
/* Clear interrupt */
nicvf_clear_intr(nic, NICVF_INTR_CQ, qidx);
return (FILTER_HANDLED);
}
static void
nicvf_rbdr_intr_handler(void *arg)
{
struct nicvf *nic;
struct queue_set *qs;
struct rbdr *rbdr;
int qidx;
nic = (struct nicvf *)arg;
/* Disable RBDR interrupt and schedule softirq */
for (qidx = 0; qidx < nic->qs->rbdr_cnt; qidx++) {
if (!nicvf_is_intr_enabled(nic, NICVF_INTR_RBDR, qidx))
continue;
nicvf_disable_intr(nic, NICVF_INTR_RBDR, qidx);
qs = nic->qs;
rbdr = &qs->rbdr[qidx];
taskqueue_enqueue(rbdr->rbdr_taskq, &rbdr->rbdr_task_nowait);
/* Clear interrupt */
nicvf_clear_intr(nic, NICVF_INTR_RBDR, qidx);
}
}
static void
nicvf_qs_err_intr_handler(void *arg)
{
struct nicvf *nic = (struct nicvf *)arg;
struct queue_set *qs = nic->qs;
/* Disable Qset err interrupt and schedule softirq */
nicvf_disable_intr(nic, NICVF_INTR_QS_ERR, 0);
taskqueue_enqueue(qs->qs_err_taskq, &qs->qs_err_task);
nicvf_clear_intr(nic, NICVF_INTR_QS_ERR, 0);
}
static int
nicvf_enable_msix(struct nicvf *nic)
{
struct pci_devinfo *dinfo;
int rid, count;
int ret;
dinfo = device_get_ivars(nic->dev);
rid = dinfo->cfg.msix.msix_table_bar;
nic->msix_table_res =
bus_alloc_resource_any(nic->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (nic->msix_table_res == NULL) {
device_printf(nic->dev,
"Could not allocate memory for MSI-X table\n");
return (ENXIO);
}
count = nic->num_vec = NIC_VF_MSIX_VECTORS;
ret = pci_alloc_msix(nic->dev, &count);
if ((ret != 0) || (count != nic->num_vec)) {
device_printf(nic->dev,
"Request for #%d msix vectors failed, error: %d\n",
nic->num_vec, ret);
return (ret != 0 ? ret : ENXIO);
}
nic->msix_enabled = 1;
return (0);
}
static void
nicvf_disable_msix(struct nicvf *nic)
{
if (nic->msix_enabled) {
pci_release_msi(nic->dev);
nic->msix_enabled = 0;
nic->num_vec = 0;
}
}
static void
nicvf_release_all_interrupts(struct nicvf *nic)
{
struct resource *res;
int irq;
int err;
/* Free registered interrupts */
for (irq = 0; irq < nic->num_vec; irq++) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown interrupt first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown interrupt %d", irq));
nic->msix_entries[irq].handle = NULL;
}
bus_release_resource(nic->dev, SYS_RES_IRQ,
rman_get_rid(res), nic->msix_entries[irq].irq_res);
nic->msix_entries[irq].irq_res = NULL;
}
/* Disable MSI-X */
nicvf_disable_msix(nic);
}
/*
* Initialize MSI-X vectors and register MISC interrupt.
* Send READY message to PF to check if it is alive.
*/
static int
nicvf_allocate_misc_interrupt(struct nicvf *nic)
{
struct resource *res;
int irq, rid;
int ret = 0;
/* Return if mailbox interrupt is already registered */
if (nic->msix_enabled)
return (0);
/* Enable MSI-X */
if (nicvf_enable_msix(nic) != 0)
return (ENXIO);
irq = NICVF_INTR_ID_MISC;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate Mbox interrupt for VF%d\n",
device_get_unit(nic->dev));
return (ENXIO);
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nicvf_misc_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0) {
res = nic->msix_entries[irq].irq_res;
bus_release_resource(nic->dev, SYS_RES_IRQ,
rman_get_rid(res), res);
nic->msix_entries[irq].irq_res = NULL;
return (ret);
}
return (0);
}
static int
nicvf_enable_misc_interrupt(struct nicvf *nic)
{
/* Enable mailbox interrupt */
nicvf_enable_intr(nic, NICVF_INTR_MBOX, 0);
/* Check if VF is able to communicate with PF */
if (!nicvf_check_pf_ready(nic)) {
nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
return (ENXIO);
}
return (0);
}
static void
nicvf_release_net_interrupts(struct nicvf *nic)
{
struct resource *res;
int irq;
int err;
for_each_cq_irq(irq) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown CQ interrupt %d",
(irq - NICVF_INTR_ID_CQ)));
if (err != 0)
continue;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
for_each_rbdr_irq(irq) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown RDBR interrupt %d",
(irq - NICVF_INTR_ID_RBDR)));
if (err != 0)
continue;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
irq = NICVF_INTR_ID_QS_ERR;
res = nic->msix_entries[irq].irq_res;
if (res != NULL) {
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown QS Error interrupt %d",
irq));
if (err != 0)
return;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
}
static int
nicvf_allocate_net_interrupts(struct nicvf *nic)
{
u_int cpuid;
int irq, rid;
int qidx;
int ret = 0;
/* MSI-X must be configured by now */
if (!nic->msix_enabled) {
device_printf(nic->dev, "Cannot alloacte queue interrups. "
"MSI-X interrupts disabled.\n");
return (ENXIO);
}
/* Register CQ interrupts */
for_each_cq_irq(irq) {
if (irq >= (NICVF_INTR_ID_CQ + nic->qs->cq_cnt))
break;
qidx = irq - NICVF_INTR_ID_CQ;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate CQ interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_CQ), device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), nicvf_intr_handler,
NULL, &nic->qs->cq[qidx], &nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup CQ interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_CQ), device_get_unit(nic->dev));
goto error;
}
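/* Spread CQ work across CPUs: device unit and queue index pick the target CPU, wrapped modulo mp_ncpus below. */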
cpuid = (device_get_unit(nic->dev) * CMP_QUEUE_CNT) + qidx;
cpuid %= mp_ncpus;
/*
* Save CPU ID for later use when system-wide RSS is enabled.
* It will be used to pin the CQ task to the same CPU that took
* the interrupt.
*/
nic->qs->cq[qidx].cmp_cpuid = cpuid;
if (bootverbose) {
device_printf(nic->dev, "bind CQ%d IRQ to CPU%d\n",
qidx, cpuid);
}
/* Bind interrupts to the given CPU */
bus_bind_intr(nic->dev, nic->msix_entries[irq].irq_res, cpuid);
}
/* Register RBDR interrupt */
for_each_rbdr_irq(irq) {
if (irq >= (NICVF_INTR_ID_RBDR + nic->qs->rbdr_cnt))
break;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate RBDR interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_RBDR),
device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), NULL,
nicvf_rbdr_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup RBDR interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_RBDR),
device_get_unit(nic->dev));
goto error;
}
}
/* Register QS error interrupt */
irq = NICVF_INTR_ID_QS_ERR;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate QS Error interrupt for VF%d\n",
device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), NULL, nicvf_qs_err_intr_handler,
nic, &nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup QS Error interrupt for VF%d\n",
device_get_unit(nic->dev));
goto error;
}
return (0);
error:
nicvf_release_net_interrupts(nic);
return (ret);
}
static int
nicvf_stop_locked(struct nicvf *nic)
{
struct ifnet *ifp;
int qidx;
struct queue_set *qs = nic->qs;
union nic_mbx mbx = {};
NICVF_CORE_LOCK_ASSERT(nic);
/* Stop callout. Can block here since holding SX lock */
callout_drain(&nic->stats_callout);
ifp = nic->ifp;
mbx.msg.msg = NIC_MBOX_MSG_SHUTDOWN;
nicvf_send_msg_to_pf(nic, &mbx);
/* Disable RBDR & QS error interrupts */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++) {
nicvf_disable_intr(nic, NICVF_INTR_RBDR, qidx);
nicvf_clear_intr(nic, NICVF_INTR_RBDR, qidx);
}
nicvf_disable_intr(nic, NICVF_INTR_QS_ERR, 0);
nicvf_clear_intr(nic, NICVF_INTR_QS_ERR, 0);
/* Deactivate network interface */
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
/* Free resources */
nicvf_config_data_transfer(nic, FALSE);
/* Disable HW Qset */
nicvf_qset_config(nic, FALSE);
/* disable mailbox interrupt */
nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
return (0);
}
static void
nicvf_update_stats(struct nicvf *nic)
{
int qidx;
struct nicvf_hw_stats *stats = &nic->hw_stats;
struct nicvf_drv_stats *drv_stats = &nic->drv_stats;
struct queue_set *qs = nic->qs;
#define GET_RX_STATS(reg) \
nicvf_reg_read(nic, NIC_VNIC_RX_STAT_0_13 | ((reg) << 3))
#define GET_TX_STATS(reg) \
nicvf_reg_read(nic, NIC_VNIC_TX_STAT_0_4 | ((reg) << 3))
stats->rx_bytes = GET_RX_STATS(RX_OCTS);
stats->rx_ucast_frames = GET_RX_STATS(RX_UCAST);
stats->rx_bcast_frames = GET_RX_STATS(RX_BCAST);
stats->rx_mcast_frames = GET_RX_STATS(RX_MCAST);
stats->rx_fcs_errors = GET_RX_STATS(RX_FCS);
stats->rx_l2_errors = GET_RX_STATS(RX_L2ERR);
stats->rx_drop_red = GET_RX_STATS(RX_RED);
stats->rx_drop_red_bytes = GET_RX_STATS(RX_RED_OCTS);
stats->rx_drop_overrun = GET_RX_STATS(RX_ORUN);
stats->rx_drop_overrun_bytes = GET_RX_STATS(RX_ORUN_OCTS);
stats->rx_drop_bcast = GET_RX_STATS(RX_DRP_BCAST);
stats->rx_drop_mcast = GET_RX_STATS(RX_DRP_MCAST);
stats->rx_drop_l3_bcast = GET_RX_STATS(RX_DRP_L3BCAST);
stats->rx_drop_l3_mcast = GET_RX_STATS(RX_DRP_L3MCAST);
stats->tx_bytes_ok = GET_TX_STATS(TX_OCTS);
stats->tx_ucast_frames_ok = GET_TX_STATS(TX_UCAST);
stats->tx_bcast_frames_ok = GET_TX_STATS(TX_BCAST);
stats->tx_mcast_frames_ok = GET_TX_STATS(TX_MCAST);
stats->tx_drops = GET_TX_STATS(TX_DROP);
drv_stats->tx_frames_ok = stats->tx_ucast_frames_ok +
stats->tx_bcast_frames_ok + stats->tx_mcast_frames_ok;
drv_stats->rx_drops = stats->rx_drop_red + stats->rx_drop_overrun;
drv_stats->tx_drops = stats->tx_drops;
/* Update RQ and SQ stats */
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_update_rq_stats(nic, qidx);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_update_sq_stats(nic, qidx);
}
static void
nicvf_tick_stats(void *arg)
{
struct nicvf *nic;
nic = (struct nicvf *)arg;
/* Read the statistics */
nicvf_update_stats(nic);
callout_reset(&nic->stats_callout, hz, nicvf_tick_stats, nic);
}
Index: head/sys/dev/vnic/nicvf_queues.c
===================================================================
--- head/sys/dev/vnic/nicvf_queues.c (revision 327172)
+++ head/sys/dev/vnic/nicvf_queues.c (revision 327173)
@@ -1,2367 +1,2366 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/buf_ring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/stdatomic.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/vmparam.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/ifq.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/sctp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>
#include <netinet6/ip6_var.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "q_struct.h"
#include "nicvf_queues.h"
#define DEBUG
#undef DEBUG
#ifdef DEBUG
#define dprintf(dev, fmt, ...) device_printf(dev, fmt, ##__VA_ARGS__)
#else
#define dprintf(dev, fmt, ...)
#endif
MALLOC_DECLARE(M_NICVF);
static void nicvf_free_snd_queue(struct nicvf *, struct snd_queue *);
static struct mbuf * nicvf_get_rcv_mbuf(struct nicvf *, struct cqe_rx_t *);
static void nicvf_sq_disable(struct nicvf *, int);
static void nicvf_sq_enable(struct nicvf *, struct snd_queue *, int);
static void nicvf_put_sq_desc(struct snd_queue *, int);
static void nicvf_cmp_queue_config(struct nicvf *, struct queue_set *, int,
boolean_t);
static void nicvf_sq_free_used_descs(struct nicvf *, struct snd_queue *, int);
static int nicvf_tx_mbuf_locked(struct snd_queue *, struct mbuf **);
static void nicvf_rbdr_task(void *, int);
static void nicvf_rbdr_task_nowait(void *, int);
struct rbuf_info {
bus_dma_tag_t dmat;
bus_dmamap_t dmap;
struct mbuf * mbuf;
};
#define GET_RBUF_INFO(x) ((struct rbuf_info *)((x) - NICVF_RCV_BUF_ALIGN_BYTES))
/* Poll a register for a specific value */
static int
nicvf_poll_reg(struct nicvf *nic, int qidx, uint64_t reg, int bit_pos,
int bits, int val)
{
uint64_t bit_mask;
uint64_t reg_val;
int timeout = 10;
bit_mask = (1UL << bits) - 1;
bit_mask = (bit_mask << bit_pos);
while (timeout) {
reg_val = nicvf_queue_reg_read(nic, reg, qidx);
if (((reg_val & bit_mask) >> bit_pos) == val)
return (0);
DELAY(1000);
timeout--;
}
device_printf(nic->dev, "Poll on reg 0x%lx failed\n", reg);
return (ETIMEDOUT);
}
/* Callback for bus_dmamap_load() */
static void
nicvf_dmamap_q_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
bus_addr_t *paddr;
KASSERT(nseg == 1, ("wrong number of segments, should be 1"));
paddr = arg;
*paddr = segs->ds_addr;
}
/* Allocate memory for a queue's descriptors */
static int
nicvf_alloc_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem,
int q_len, int desc_size, int align_bytes)
{
int err, err_dmat;
/* Create DMA tag first */
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
align_bytes, /* alignment */
0, /* boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
(q_len * desc_size), /* maxsize */
1, /* nsegments */
(q_len * desc_size), /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&dmem->dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for descriptors ring\n");
return (err);
}
/* Allocate segment of continuous DMA safe memory */
err = bus_dmamem_alloc(
dmem->dmat, /* DMA tag */
&dmem->base, /* virtual address */
(BUS_DMA_NOWAIT | BUS_DMA_ZERO), /* flags */
&dmem->dmap); /* DMA map */
if (err != 0) {
device_printf(nic->dev, "Failed to allocate DMA safe memory for"
"descriptors ring\n");
goto dmamem_fail;
}
err = bus_dmamap_load(
dmem->dmat,
dmem->dmap,
dmem->base,
(q_len * desc_size), /* allocation size */
nicvf_dmamap_q_cb, /* map to DMA address cb. */
&dmem->phys_base, /* physical address */
BUS_DMA_NOWAIT);
if (err != 0) {
device_printf(nic->dev,
"Cannot load DMA map of descriptors ring\n");
goto dmamap_fail;
}
dmem->q_len = q_len;
dmem->size = (desc_size * q_len);
return (0);
dmamap_fail:
bus_dmamem_free(dmem->dmat, dmem->base, dmem->dmap);
dmem->phys_base = 0;
dmamem_fail:
err_dmat = bus_dma_tag_destroy(dmem->dmat);
dmem->base = NULL;
KASSERT(err_dmat == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
return (err);
}
/* Free queue's descriptor memory */
static void
nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem)
{
int err;
if ((dmem == NULL) || (dmem->base == NULL))
return;
/* Unload a map */
bus_dmamap_sync(dmem->dmat, dmem->dmap, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(dmem->dmat, dmem->dmap);
/* Free DMA memory */
bus_dmamem_free(dmem->dmat, dmem->base, dmem->dmap);
/* Destroy DMA tag */
err = bus_dma_tag_destroy(dmem->dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
dmem->phys_base = 0;
dmem->base = NULL;
}
/*
* Allocate buffer for packet reception
* The HW returns the memory address where the packet was DMA'ed, not a
* pointer into the RBDR ring, so save the buffer address at the start of
* the fragment and align the start address to a cache-line boundary.
*/
static __inline int
nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
bus_dmamap_t dmap, int mflags, uint32_t buf_len, bus_addr_t *rbuf)
{
struct mbuf *mbuf;
struct rbuf_info *rinfo;
bus_dma_segment_t segs[1];
int nsegs;
int err;
mbuf = m_getjcl(mflags, MT_DATA, M_PKTHDR, MCLBYTES);
if (mbuf == NULL)
return (ENOMEM);
/*
* The length is equal to the actual length plus one 128-byte line
* used as room for the rbuf_info structure.
*/
mbuf->m_len = mbuf->m_pkthdr.len = buf_len;
err = bus_dmamap_load_mbuf_sg(rbdr->rbdr_buff_dmat, dmap, mbuf, segs,
&nsegs, BUS_DMA_NOWAIT);
if (err != 0) {
device_printf(nic->dev,
"Failed to map mbuf into DMA visible memory, err: %d\n",
err);
m_freem(mbuf);
bus_dmamap_destroy(rbdr->rbdr_buff_dmat, dmap);
return (err);
}
if (nsegs != 1)
panic("Unexpected number of DMA segments for RB: %d", nsegs);
/*
* Now use the room for rbuf_info structure
* and adjust mbuf data and length.
*/
rinfo = (struct rbuf_info *)mbuf->m_data;
m_adj(mbuf, NICVF_RCV_BUF_ALIGN_BYTES);
rinfo->dmat = rbdr->rbdr_buff_dmat;
rinfo->dmap = dmap;
rinfo->mbuf = mbuf;
*rbuf = segs[0].ds_addr + NICVF_RCV_BUF_ALIGN_BYTES;
return (0);
}
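/*
* Resulting layout (for illustration): the first NICVF_RCV_BUF_ALIGN_BYTES
* of the mbuf hold struct rbuf_info; *rbuf, the address handed to the HW,
* points just past it at the packet data.
*/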
/* Retrieve mbuf for received packet */
static struct mbuf *
nicvf_rb_ptr_to_mbuf(struct nicvf *nic, bus_addr_t rb_ptr)
{
struct mbuf *mbuf;
struct rbuf_info *rinfo;
/* Get buffer start address and alignment offset */
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(rb_ptr));
/* Now retrieve mbuf to give to stack */
mbuf = rinfo->mbuf;
if (__predict_false(mbuf == NULL)) {
panic("%s: Received packet fragment with NULL mbuf",
device_get_nameunit(nic->dev));
}
/*
* Clear the mbuf in the descriptor to indicate
* that this slot is processed and free to use.
*/
rinfo->mbuf = NULL;
bus_dmamap_sync(rinfo->dmat, rinfo->dmap, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(rinfo->dmat, rinfo->dmap);
return (mbuf);
}
/* Allocate RBDR ring and populate receive buffers */
static int
nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr, int ring_len,
int buf_size, int qidx)
{
bus_dmamap_t dmap;
bus_addr_t rbuf;
struct rbdr_entry_t *desc;
int idx;
int err;
/* Allocate rbdr descriptors ring */
err = nicvf_alloc_q_desc_mem(nic, &rbdr->dmem, ring_len,
sizeof(struct rbdr_entry_t), NICVF_RCV_BUF_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Failed to create RBDR descriptors ring\n");
return (err);
}
rbdr->desc = rbdr->dmem.base;
/*
* Buffer size has to be in multiples of 128 bytes.
* Make room for metadata the size of one cache line (128 bytes).
*/
rbdr->dma_size = buf_size - NICVF_RCV_BUF_ALIGN_BYTES;
rbdr->enable = TRUE;
rbdr->thresh = RBDR_THRESH;
rbdr->nic = nic;
rbdr->idx = qidx;
/*
* Create DMA tag for Rx buffers.
* Each map created using this tag is intended to store Rx payload for
* one fragment and one header structure containing rbuf_info (thus
* an additional 128-byte line, since an RB must be a multiple of the
* 128-byte cache line).
*/
if (buf_size > MCLBYTES) {
device_printf(nic->dev,
"Buffer size to large for mbuf cluster\n");
return (EINVAL);
}
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
NICVF_RCV_BUF_ALIGN_BYTES, /* alignment */
0, /* boundary */
DMAP_MAX_PHYSADDR, /* lowaddr */
DMAP_MIN_PHYSADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
roundup2(buf_size, MCLBYTES), /* maxsize */
1, /* nsegments */
roundup2(buf_size, MCLBYTES), /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&rbdr->rbdr_buff_dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for RBDR buffers\n");
return (err);
}
rbdr->rbdr_buff_dmaps = malloc(sizeof(*rbdr->rbdr_buff_dmaps) *
ring_len, M_NICVF, (M_WAITOK | M_ZERO));
for (idx = 0; idx < ring_len; idx++) {
err = bus_dmamap_create(rbdr->rbdr_buff_dmat, 0, &dmap);
if (err != 0) {
device_printf(nic->dev,
"Failed to create DMA map for RB\n");
return (err);
}
rbdr->rbdr_buff_dmaps[idx] = dmap;
err = nicvf_alloc_rcv_buffer(nic, rbdr, dmap, M_WAITOK,
DMA_BUFFER_LEN, &rbuf);
if (err != 0)
return (err);
desc = GET_RBDR_DESC(rbdr, idx);
desc->buf_addr = (rbuf >> NICVF_RCV_BUF_ALIGN);
}
/* Allocate taskqueue */
TASK_INIT(&rbdr->rbdr_task, 0, nicvf_rbdr_task, rbdr);
TASK_INIT(&rbdr->rbdr_task_nowait, 0, nicvf_rbdr_task_nowait, rbdr);
rbdr->rbdr_taskq = taskqueue_create_fast("nicvf_rbdr_taskq", M_WAITOK,
taskqueue_thread_enqueue, &rbdr->rbdr_taskq);
taskqueue_start_threads(&rbdr->rbdr_taskq, 1, PI_NET, "%s: rbdr_taskq",
device_get_nameunit(nic->dev));
return (0);
}
/* Free RBDR ring and its receive buffers */
static void
nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
{
struct mbuf *mbuf;
struct queue_set *qs;
struct rbdr_entry_t *desc;
struct rbuf_info *rinfo;
bus_addr_t buf_addr;
int head, tail, idx;
int err;
qs = nic->qs;
if ((qs == NULL) || (rbdr == NULL))
return;
rbdr->enable = FALSE;
if (rbdr->rbdr_taskq != NULL) {
/* Remove tasks */
while (taskqueue_cancel(rbdr->rbdr_taskq,
&rbdr->rbdr_task_nowait, NULL) != 0) {
/* Finish the nowait task first */
taskqueue_drain(rbdr->rbdr_taskq,
&rbdr->rbdr_task_nowait);
}
taskqueue_free(rbdr->rbdr_taskq);
rbdr->rbdr_taskq = NULL;
while (taskqueue_cancel(taskqueue_thread,
&rbdr->rbdr_task, NULL) != 0) {
/* Now finish the sleepable task */
taskqueue_drain(taskqueue_thread, &rbdr->rbdr_task);
}
}
/*
* Free all of the memory under the RB descriptors.
* There are assumptions here:
* 1. Corresponding RBDR is disabled
* - it is safe to operate using head and tail indexes
* 2. All buffers that were received are properly freed by
* the receive handler
* - the DMA map and mbuf need to be released only for
* descriptors that are still unused
*/
if (rbdr->rbdr_buff_dmat != NULL) {
head = rbdr->head;
tail = rbdr->tail;
while (head != tail) {
desc = GET_RBDR_DESC(rbdr, head);
buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN;
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(buf_addr));
bus_dmamap_unload(rbdr->rbdr_buff_dmat, rinfo->dmap);
mbuf = rinfo->mbuf;
/* This will destroy everything including rinfo! */
m_freem(mbuf);
head++;
head &= (rbdr->dmem.q_len - 1);
}
/* Free tail descriptor */
desc = GET_RBDR_DESC(rbdr, tail);
buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN;
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(buf_addr));
bus_dmamap_unload(rbdr->rbdr_buff_dmat, rinfo->dmap);
mbuf = rinfo->mbuf;
/* This will destroy everything including rinfo! */
m_freem(mbuf);
/* Destroy DMA maps */
for (idx = 0; idx < qs->rbdr_len; idx++) {
if (rbdr->rbdr_buff_dmaps[idx] == NULL)
continue;
err = bus_dmamap_destroy(rbdr->rbdr_buff_dmat,
rbdr->rbdr_buff_dmaps[idx]);
KASSERT(err == 0,
("%s: Could not destroy DMA map for RB, desc: %d",
__func__, idx));
rbdr->rbdr_buff_dmaps[idx] = NULL;
}
/* Now destroy the tag */
err = bus_dma_tag_destroy(rbdr->rbdr_buff_dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
rbdr->head = 0;
rbdr->tail = 0;
}
/* Free RBDR ring */
nicvf_free_q_desc_mem(nic, &rbdr->dmem);
}
/*
* Refill receive buffer descriptors with new buffers.
*/
static int
nicvf_refill_rbdr(struct rbdr *rbdr, int mflags)
{
struct nicvf *nic;
struct queue_set *qs;
int rbdr_idx;
int tail, qcount;
int refill_rb_cnt;
struct rbdr_entry_t *desc;
bus_dmamap_t dmap;
bus_addr_t rbuf;
boolean_t rb_alloc_fail;
int new_rb;
rb_alloc_fail = TRUE;
new_rb = 0;
nic = rbdr->nic;
qs = nic->qs;
rbdr_idx = rbdr->idx;
/* Check if it's enabled */
if (!rbdr->enable)
return (0);
/* Get no of desc's to be refilled */
qcount = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_STATUS0, rbdr_idx);
qcount &= 0x7FFFF;
/* The doorbell can be rung with at most ring size minus 1 */
if (qcount >= (qs->rbdr_len - 1)) {
rb_alloc_fail = FALSE;
goto out;
} else
refill_rb_cnt = qs->rbdr_len - qcount - 1;
/* Start filling descs from tail */
tail = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_TAIL, rbdr_idx) >> 3;
while (refill_rb_cnt) {
tail++;
tail &= (rbdr->dmem.q_len - 1);
dmap = rbdr->rbdr_buff_dmaps[tail];
if (nicvf_alloc_rcv_buffer(nic, rbdr, dmap, mflags,
DMA_BUFFER_LEN, &rbuf)) {
/* Buffer allocation failed. Stop refilling. */
break;
}
desc = GET_RBDR_DESC(rbdr, tail);
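/*
 * The descriptor stores the buffer address shifted right by
 * NICVF_RCV_BUF_ALIGN; the free path shifts it back to recover
 * the physical address.
 */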
desc->buf_addr = (rbuf >> NICVF_RCV_BUF_ALIGN);
refill_rb_cnt--;
new_rb++;
}
/* make sure all memory stores are done before ringing doorbell */
wmb();
/* Check if buffer allocation failed */
if (refill_rb_cnt == 0)
rb_alloc_fail = FALSE;
/* Notify HW */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_DOOR,
rbdr_idx, new_rb);
out:
if (!rb_alloc_fail) {
/*
* Re-enable RBDR interrupts only
* if buffer allocation is success.
*/
nicvf_enable_intr(nic, NICVF_INTR_RBDR, rbdr_idx);
return (0);
}
return (ENOMEM);
}
/* Refill RBs even if sleep is needed to reclaim memory */
static void
nicvf_rbdr_task(void *arg, int pending)
{
struct rbdr *rbdr;
int err;
rbdr = (struct rbdr *)arg;
err = nicvf_refill_rbdr(rbdr, M_WAITOK);
if (__predict_false(err != 0)) {
panic("%s: Failed to refill RBs even when sleep enabled",
__func__);
}
}
/* Refill RBs as soon as possible without waiting */
static void
nicvf_rbdr_task_nowait(void *arg, int pending)
{
struct rbdr *rbdr;
int err;
rbdr = (struct rbdr *)arg;
err = nicvf_refill_rbdr(rbdr, M_NOWAIT);
if (err != 0) {
/*
* Schedule another, sleepable kernel thread
* that will for sure refill the buffers.
*/
taskqueue_enqueue(taskqueue_thread, &rbdr->rbdr_task);
}
}
static int
nicvf_rcv_pkt_handler(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_rx_t *cqe_rx, int cqe_type)
{
struct mbuf *mbuf;
struct rcv_queue *rq;
int rq_idx;
int err = 0;
rq_idx = cqe_rx->rq_idx;
rq = &nic->qs->rq[rq_idx];
/* Check for errors */
err = nicvf_check_cqe_rx_errs(nic, cq, cqe_rx);
if (err && !cqe_rx->rb_cnt)
return (0);
mbuf = nicvf_get_rcv_mbuf(nic, cqe_rx);
if (mbuf == NULL) {
dprintf(nic->dev, "Packet not received\n");
return (0);
}
/* If error packet */
if (err != 0) {
m_freem(mbuf);
return (0);
}
if (rq->lro_enabled &&
((cqe_rx->l3_type == L3TYPE_IPV4) && (cqe_rx->l4_type == L4TYPE_TCP)) &&
(mbuf->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
/*
* At this point it is known that there are no errors in the
* packet. Attempt to LRO enqueue. Send to stack if no resources
* or enqueue error.
*/
if ((rq->lro.lro_cnt != 0) &&
(tcp_lro_rx(&rq->lro, mbuf, 0) == 0))
return (0);
}
/*
* Push this packet to the stack later to avoid
* unlocking completion task in the middle of work.
*/
err = buf_ring_enqueue(cq->rx_br, mbuf);
if (err != 0) {
/*
* Failed to enqueue this mbuf.
* We don't drop it, just schedule another task.
*/
return (err);
}
return (0);
}
static void
nicvf_snd_pkt_handler(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_send_t *cqe_tx, int cqe_type)
{
bus_dmamap_t dmap;
struct mbuf *mbuf;
struct snd_queue *sq;
struct sq_hdr_subdesc *hdr;
mbuf = NULL;
sq = &nic->qs->sq[cqe_tx->sq_idx];
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, cqe_tx->sqe_ptr);
if (hdr->subdesc_type != SQ_DESC_TYPE_HEADER)
return;
dprintf(nic->dev,
"%s Qset #%d SQ #%d SQ ptr #%d subdesc count %d\n",
__func__, cqe_tx->sq_qs, cqe_tx->sq_idx,
cqe_tx->sqe_ptr, hdr->subdesc_cnt);
dmap = (bus_dmamap_t)sq->snd_buff[cqe_tx->sqe_ptr].dmap;
bus_dmamap_unload(sq->snd_buff_dmat, dmap);
mbuf = (struct mbuf *)sq->snd_buff[cqe_tx->sqe_ptr].mbuf;
if (mbuf != NULL) {
m_freem(mbuf);
sq->snd_buff[cqe_tx->sqe_ptr].mbuf = NULL;
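/* Release the header subdescriptor plus all subdescriptors that followed it. */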
nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
}
nicvf_check_cqe_tx_errs(nic, cq, cqe_tx);
}
static int
nicvf_cq_intr_handler(struct nicvf *nic, uint8_t cq_idx)
{
struct mbuf *mbuf;
struct ifnet *ifp;
int processed_cqe, work_done = 0, tx_done = 0;
int cqe_count, cqe_head;
struct queue_set *qs = nic->qs;
struct cmp_queue *cq = &qs->cq[cq_idx];
struct snd_queue *sq = &qs->sq[cq_idx];
struct rcv_queue *rq;
struct cqe_rx_t *cq_desc;
struct lro_ctrl *lro;
int rq_idx;
int cmp_err;
NICVF_CMP_LOCK(cq);
cmp_err = 0;
processed_cqe = 0;
/* Get the number of valid CQ entries to process */
cqe_count = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_STATUS, cq_idx);
cqe_count &= CQ_CQE_COUNT;
if (cqe_count == 0)
goto out;
/* Get head of the valid CQ entries */
cqe_head = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_HEAD, cq_idx) >> 9;
cqe_head &= 0xFFFF;
dprintf(nic->dev, "%s CQ%d cqe_count %d cqe_head %d\n",
__func__, cq_idx, cqe_count, cqe_head);
while (processed_cqe < cqe_count) {
/* Get the CQ descriptor */
cq_desc = (struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head);
cqe_head++;
cqe_head &= (cq->dmem.q_len - 1);
/* Prefetch next CQ descriptor */
__builtin_prefetch((struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head));
dprintf(nic->dev, "CQ%d cq_desc->cqe_type %d\n", cq_idx,
cq_desc->cqe_type);
switch (cq_desc->cqe_type) {
case CQE_TYPE_RX:
cmp_err = nicvf_rcv_pkt_handler(nic, cq, cq_desc,
CQE_TYPE_RX);
if (__predict_false(cmp_err != 0)) {
/*
* Oops. Cannot finish now.
* Let's try again later.
*/
goto done;
}
work_done++;
break;
case CQE_TYPE_SEND:
nicvf_snd_pkt_handler(nic, cq, (void *)cq_desc,
CQE_TYPE_SEND);
tx_done++;
break;
case CQE_TYPE_INVALID:
case CQE_TYPE_RX_SPLIT:
case CQE_TYPE_RX_TCP:
case CQE_TYPE_SEND_PTP:
/* Ignore for now */
break;
}
processed_cqe++;
}
done:
dprintf(nic->dev,
"%s CQ%d processed_cqe %d work_done %d\n",
__func__, cq_idx, processed_cqe, work_done);
/* Ring doorbell to inform H/W to reuse processed CQEs */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_DOOR, cq_idx, processed_cqe);
if ((tx_done > 0) &&
((if_getdrvflags(nic->ifp) & IFF_DRV_RUNNING) != 0)) {
/* Re-enable the TXQ if it was stopped earlier because the SQ was full */
if_setdrvflagbits(nic->ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
}
out:
/*
* Flush any outstanding LRO work
*/
rq_idx = cq_idx;
rq = &nic->qs->rq[rq_idx];
lro = &rq->lro;
tcp_lro_flush_all(lro);
NICVF_CMP_UNLOCK(cq);
ifp = nic->ifp;
/* Push received MBUFs to the stack */
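/*
 * The ring is drained after dropping the CQ lock so that if_input()
 * is never called with the completion mutex held.
 */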
while (!buf_ring_empty(cq->rx_br)) {
mbuf = buf_ring_dequeue_mc(cq->rx_br);
if (__predict_true(mbuf != NULL))
(*ifp->if_input)(ifp, mbuf);
}
return (cmp_err);
}
/*
* Qset error interrupt handler
*
* As of now only CQ errors are handled
*/
static void
nicvf_qs_err_task(void *arg, int pending)
{
struct nicvf *nic;
struct queue_set *qs;
int qidx;
uint64_t status;
boolean_t enable = TRUE;
nic = (struct nicvf *)arg;
qs = nic->qs;
/* Deactivate network interface */
if_setdrvflagbits(nic->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
/* Check if it is CQ err */
for (qidx = 0; qidx < qs->cq_cnt; qidx++) {
status = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_STATUS,
qidx);
if ((status & CQ_ERR_MASK) == 0)
continue;
/* Process already queued CQEs and reconfig CQ */
nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
nicvf_sq_disable(nic, qidx);
(void)nicvf_cq_intr_handler(nic, qidx);
nicvf_cmp_queue_config(nic, qs, qidx, enable);
nicvf_sq_free_used_descs(nic, &qs->sq[qidx], qidx);
nicvf_sq_enable(nic, &qs->sq[qidx], qidx);
nicvf_enable_intr(nic, NICVF_INTR_CQ, qidx);
}
if_setdrvflagbits(nic->ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
/* Re-enable Qset error interrupt */
nicvf_enable_intr(nic, NICVF_INTR_QS_ERR, 0);
}
static void
nicvf_cmp_task(void *arg, int pending)
{
struct cmp_queue *cq;
struct nicvf *nic;
int cmp_err;
cq = (struct cmp_queue *)arg;
nic = cq->nic;
/* Handle CQ descriptors */
cmp_err = nicvf_cq_intr_handler(nic, cq->idx);
if (__predict_false(cmp_err != 0)) {
/*
* Schedule another thread here since we did not
* process the entire CQ due to Tx or Rx CQ parse error.
*/
taskqueue_enqueue(cq->cmp_taskq, &cq->cmp_task);
}
nicvf_clear_intr(nic, NICVF_INTR_CQ, cq->idx);
/* Re-enable the interrupt (previously disabled in nicvf_intr_handler()) */
nicvf_enable_intr(nic, NICVF_INTR_CQ, cq->idx);
}
/* Initialize completion queue */
static int
nicvf_init_cmp_queue(struct nicvf *nic, struct cmp_queue *cq, int q_len,
int qidx)
{
int err;
/* Initialize lock */
snprintf(cq->mtx_name, sizeof(cq->mtx_name), "%s: CQ(%d) lock",
device_get_nameunit(nic->dev), qidx);
mtx_init(&cq->mtx, cq->mtx_name, NULL, MTX_DEF);
err = nicvf_alloc_q_desc_mem(nic, &cq->dmem, q_len, CMP_QUEUE_DESC_SIZE,
NICVF_CQ_BASE_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Could not allocate DMA memory for CQ\n");
return (err);
}
cq->desc = cq->dmem.base;
cq->thresh = pass1_silicon(nic->dev) ? 0 : CMP_QUEUE_CQE_THRESH;
cq->nic = nic;
cq->idx = qidx;
nic->cq_coalesce_usecs = (CMP_QUEUE_TIMER_THRESH * 0.05) - 1;
cq->rx_br = buf_ring_alloc(CMP_QUEUE_LEN * 8, M_DEVBUF, M_WAITOK,
&cq->mtx);
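/*
 * Received mbufs are staged on rx_br so that they can be handed to
 * the stack outside the CQ lock (see nicvf_cq_intr_handler()).
 */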
/* Allocate taskqueue */
TASK_INIT(&cq->cmp_task, 0, nicvf_cmp_task, cq);
cq->cmp_taskq = taskqueue_create_fast("nicvf_cmp_taskq", M_WAITOK,
taskqueue_thread_enqueue, &cq->cmp_taskq);
taskqueue_start_threads(&cq->cmp_taskq, 1, PI_NET, "%s: cmp_taskq(%d)",
device_get_nameunit(nic->dev), qidx);
return (0);
}
static void
nicvf_free_cmp_queue(struct nicvf *nic, struct cmp_queue *cq)
{
if (cq == NULL)
return;
/*
* The completion queue itself should be disabled by now
* (ref. nicvf_snd_queue_config()).
* Ensure that it is indeed disabled, or panic.
*/
if (cq->enable)
panic("%s: Trying to free working CQ(%d)", __func__, cq->idx);
if (cq->cmp_taskq != NULL) {
/* Remove task */
while (taskqueue_cancel(cq->cmp_taskq, &cq->cmp_task, NULL) != 0)
taskqueue_drain(cq->cmp_taskq, &cq->cmp_task);
taskqueue_free(cq->cmp_taskq);
cq->cmp_taskq = NULL;
}
/*
* The completion task may have re-enabled interrupts,
* so disable them now that it has finished processing.
* It is safe to do so since the corresponding CQ
* was already disabled.
*/
nicvf_disable_intr(nic, NICVF_INTR_CQ, cq->idx);
nicvf_clear_intr(nic, NICVF_INTR_CQ, cq->idx);
NICVF_CMP_LOCK(cq);
nicvf_free_q_desc_mem(nic, &cq->dmem);
drbr_free(cq->rx_br, M_DEVBUF);
NICVF_CMP_UNLOCK(cq);
mtx_destroy(&cq->mtx);
memset(cq->mtx_name, 0, sizeof(cq->mtx_name));
}
int
nicvf_xmit_locked(struct snd_queue *sq)
{
struct nicvf *nic;
struct ifnet *ifp;
struct mbuf *next;
int err;
NICVF_TX_LOCK_ASSERT(sq);
nic = sq->nic;
ifp = nic->ifp;
err = 0;
while ((next = drbr_peek(ifp, sq->br)) != NULL) {
/* Send a copy of the frame to the BPF listener */
ETHER_BPF_MTAP(ifp, next);
err = nicvf_tx_mbuf_locked(sq, &next);
if (err != 0) {
if (next == NULL)
drbr_advance(ifp, sq->br);
else
drbr_putback(ifp, sq->br, next);
break;
}
drbr_advance(ifp, sq->br);
}
return (err);
}
static void
nicvf_snd_task(void *arg, int pending)
{
struct snd_queue *sq = (struct snd_queue *)arg;
struct nicvf *nic;
struct ifnet *ifp;
int err;
nic = sq->nic;
ifp = nic->ifp;
/*
* Skip sending anything if the driver is not running,
* the SQ is full, or the link is down.
*/
if (((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) || !nic->link_up)
return;
NICVF_TX_LOCK(sq);
err = nicvf_xmit_locked(sq);
NICVF_TX_UNLOCK(sq);
/* Try again */
if (err != 0)
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
}
/* Initialize transmit queue */
static int
nicvf_init_snd_queue(struct nicvf *nic, struct snd_queue *sq, int q_len,
int qidx)
{
size_t i;
int err;
/* Initialize TX lock for this queue */
snprintf(sq->mtx_name, sizeof(sq->mtx_name), "%s: SQ(%d) lock",
device_get_nameunit(nic->dev), qidx);
mtx_init(&sq->mtx, sq->mtx_name, NULL, MTX_DEF);
NICVF_TX_LOCK(sq);
/* Allocate buffer ring */
sq->br = buf_ring_alloc(q_len / MIN_SQ_DESC_PER_PKT_XMIT, M_DEVBUF,
M_NOWAIT, &sq->mtx);
if (sq->br == NULL) {
device_printf(nic->dev,
"ERROR: Could not set up buf ring for SQ(%d)\n", qidx);
err = ENOMEM;
goto error;
}
/* Allocate DMA memory for Tx descriptors */
err = nicvf_alloc_q_desc_mem(nic, &sq->dmem, q_len, SND_QUEUE_DESC_SIZE,
NICVF_SQ_BASE_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Could not allocate DMA memory for SQ\n");
goto error;
}
sq->desc = sq->dmem.base;
sq->head = sq->tail = 0;
atomic_store_rel_int(&sq->free_cnt, q_len - 1);
sq->thresh = SND_QUEUE_THRESH;
sq->idx = qidx;
sq->nic = nic;
/*
* Allocate DMA maps for Tx buffers
*/
/* Create DMA tag first */
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
1, /* alignment */
0, /* boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
NICVF_TSO_MAXSIZE, /* maxsize */
NICVF_TSO_NSEGS, /* nsegments */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&sq->snd_buff_dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for Tx buffers\n");
goto error;
}
/* Allocate send buffers array */
sq->snd_buff = malloc(sizeof(*sq->snd_buff) * q_len, M_NICVF,
(M_NOWAIT | M_ZERO));
if (sq->snd_buff == NULL) {
device_printf(nic->dev,
"Could not allocate memory for Tx buffers array\n");
err = ENOMEM;
goto error;
}
/* Now populate maps */
for (i = 0; i < q_len; i++) {
err = bus_dmamap_create(sq->snd_buff_dmat, 0,
&sq->snd_buff[i].dmap);
if (err != 0) {
device_printf(nic->dev,
"Failed to create DMA maps for Tx buffers\n");
goto error;
}
}
NICVF_TX_UNLOCK(sq);
/* Allocate taskqueue */
TASK_INIT(&sq->snd_task, 0, nicvf_snd_task, sq);
sq->snd_taskq = taskqueue_create_fast("nicvf_snd_taskq", M_WAITOK,
taskqueue_thread_enqueue, &sq->snd_taskq);
taskqueue_start_threads(&sq->snd_taskq, 1, PI_NET, "%s: snd_taskq(%d)",
device_get_nameunit(nic->dev), qidx);
return (0);
error:
NICVF_TX_UNLOCK(sq);
return (err);
}
static void
nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq)
{
struct queue_set *qs = nic->qs;
size_t i;
int err;
if (sq == NULL)
return;
if (sq->snd_taskq != NULL) {
/* Remove task */
while (taskqueue_cancel(sq->snd_taskq, &sq->snd_task, NULL) != 0)
taskqueue_drain(sq->snd_taskq, &sq->snd_task);
taskqueue_free(sq->snd_taskq);
sq->snd_taskq = NULL;
}
NICVF_TX_LOCK(sq);
if (sq->snd_buff_dmat != NULL) {
if (sq->snd_buff != NULL) {
for (i = 0; i < qs->sq_len; i++) {
m_freem(sq->snd_buff[i].mbuf);
sq->snd_buff[i].mbuf = NULL;
bus_dmamap_unload(sq->snd_buff_dmat,
sq->snd_buff[i].dmap);
err = bus_dmamap_destroy(sq->snd_buff_dmat,
sq->snd_buff[i].dmap);
/*
* If bus_dmamap_destroy fails it can cause
* random panic later if the tag is also
* destroyed in the process.
*/
KASSERT(err == 0,
("%s: Could not destroy DMA map for SQ",
__func__));
}
}
free(sq->snd_buff, M_NICVF);
err = bus_dma_tag_destroy(sq->snd_buff_dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
}
/* Free private driver ring for this send queue */
if (sq->br != NULL)
drbr_free(sq->br, M_DEVBUF);
if (sq->dmem.base != NULL)
nicvf_free_q_desc_mem(nic, &sq->dmem);
NICVF_TX_UNLOCK(sq);
/* Destroy Tx lock */
mtx_destroy(&sq->mtx);
memset(sq->mtx_name, 0, sizeof(sq->mtx_name));
}
static void
nicvf_reclaim_snd_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
/* Disable send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, 0);
/* Check if SQ is stopped */
if (nicvf_poll_reg(nic, qidx, NIC_QSET_SQ_0_7_STATUS, 21, 1, 0x01))
return;
/* Reset send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, NICVF_SQ_RESET);
}
static void
nicvf_reclaim_rcv_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
union nic_mbx mbx = {};
/* Make sure all packets in the pipeline are written back into mem */
mbx.msg.msg = NIC_MBOX_MSG_RQ_SW_SYNC;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_reclaim_cmp_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
/* Disable timer threshold (doesn't get reset upon CQ reset) */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG2, qidx, 0);
/* Disable completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, 0);
/* Reset completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, NICVF_CQ_RESET);
}
static void
nicvf_reclaim_rbdr(struct nicvf *nic, struct rbdr *rbdr, int qidx)
{
uint64_t tmp, fifo_state;
int timeout = 10;
/* Save head and tail pointers for freeing up buffers */
rbdr->head =
nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_HEAD, qidx) >> 3;
rbdr->tail =
nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_TAIL, qidx) >> 3;
/*
* If RBDR FIFO is in 'FAIL' state then do a reset first
* before reclaiming.
*/
fifo_state = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_STATUS0, qidx);
if (((fifo_state >> 62) & 0x03) == 0x3) {
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG,
qidx, NICVF_RBDR_RESET);
}
/* Disable RBDR */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx, 0);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x00))
return;
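/*
 * Poll until both 32-bit halves of the prefetch status register
 * match, which presumably means that no prefetched descriptors
 * remain outstanding, before issuing the reset.
 */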
while (1) {
tmp = nicvf_queue_reg_read(nic,
NIC_QSET_RBDR_0_1_PREFETCH_STATUS, qidx);
if ((tmp & 0xFFFFFFFF) == ((tmp >> 32) & 0xFFFFFFFF))
break;
DELAY(1000);
timeout--;
if (!timeout) {
device_printf(nic->dev,
"Failed polling on prefetch status\n");
return;
}
}
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx,
NICVF_RBDR_RESET);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x02))
return;
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx, 0x00);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x00))
return;
}
/* Configures receive queue */
static void
nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
int qidx, bool enable)
{
union nic_mbx mbx = {};
struct rcv_queue *rq;
struct rq_cfg rq_cfg;
struct ifnet *ifp;
struct lro_ctrl *lro;
ifp = nic->ifp;
rq = &qs->rq[qidx];
rq->enable = enable;
lro = &rq->lro;
/* Disable receive queue */
nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx, 0);
if (!rq->enable) {
nicvf_reclaim_rcv_queue(nic, qs, qidx);
/* Free LRO memory */
tcp_lro_free(lro);
rq->lro_enabled = FALSE;
return;
}
/* Configure LRO if enabled */
rq->lro_enabled = FALSE;
if ((if_getcapenable(ifp) & IFCAP_LRO) != 0) {
if (tcp_lro_init(lro) != 0) {
device_printf(nic->dev,
"Failed to initialize LRO for RXQ%d\n", qidx);
} else {
rq->lro_enabled = TRUE;
lro->ifp = nic->ifp;
}
}
rq->cq_qs = qs->vnic_id;
rq->cq_idx = qidx;
rq->start_rbdr_qs = qs->vnic_id;
rq->start_qs_rbdr_idx = qs->rbdr_cnt - 1;
rq->cont_rbdr_qs = qs->vnic_id;
rq->cont_qs_rbdr_idx = qs->rbdr_cnt - 1;
/* All writes of RBDR data are to be loaded into the L2 cache as well */
rq->caching = 1;
/* Send a mailbox msg to PF to config RQ */
mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;
mbx.rq.qs_num = qs->vnic_id;
mbx.rq.rq_num = qidx;
mbx.rq.cfg = (rq->caching << 26) | (rq->cq_qs << 19) |
(rq->cq_idx << 16) | (rq->cont_rbdr_qs << 9) |
(rq->cont_qs_rbdr_idx << 8) | (rq->start_rbdr_qs << 1) |
(rq->start_qs_rbdr_idx);
nicvf_send_msg_to_pf(nic, &mbx);
mbx.rq.msg = NIC_MBOX_MSG_RQ_BP_CFG;
mbx.rq.cfg = (1UL << 63) | (1UL << 62) | (qs->vnic_id << 0);
nicvf_send_msg_to_pf(nic, &mbx);
/*
* RQ drop config
* Enable CQ drop to reserve sufficient CQEs for all tx packets
*/
mbx.rq.msg = NIC_MBOX_MSG_RQ_DROP_CFG;
mbx.rq.cfg = (1UL << 62) | (RQ_CQ_DROP << 8);
nicvf_send_msg_to_pf(nic, &mbx);
nicvf_queue_reg_write(nic, NIC_QSET_RQ_GEN_CFG, 0, 0x00);
/* Enable Receive queue */
rq_cfg.ena = 1;
rq_cfg.tcp_ena = 0;
nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx,
*(uint64_t *)&rq_cfg);
}
/* Configures completion queue */
static void
nicvf_cmp_queue_config(struct nicvf *nic, struct queue_set *qs,
int qidx, boolean_t enable)
{
struct cmp_queue *cq;
struct cq_cfg cq_cfg;
cq = &qs->cq[qidx];
cq->enable = enable;
if (!cq->enable) {
nicvf_reclaim_cmp_queue(nic, qs, qidx);
return;
}
/* Reset completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, NICVF_CQ_RESET);
/* Set completion queue base address */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_BASE, qidx,
(uint64_t)(cq->dmem.phys_base));
/* Enable Completion queue */
cq_cfg.ena = 1;
cq_cfg.reset = 0;
cq_cfg.caching = 0;
cq_cfg.qsize = CMP_QSIZE;
cq_cfg.avg_con = 0;
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, *(uint64_t *)&cq_cfg);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_THRESH, qidx, cq->thresh);
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG2, qidx,
nic->cq_coalesce_usecs);
}
/* Configures transmit queue */
static void
nicvf_snd_queue_config(struct nicvf *nic, struct queue_set *qs, int qidx,
boolean_t enable)
{
union nic_mbx mbx = {};
struct snd_queue *sq;
struct sq_cfg sq_cfg;
sq = &qs->sq[qidx];
sq->enable = enable;
if (!sq->enable) {
nicvf_reclaim_snd_queue(nic, qs, qidx);
return;
}
/* Reset send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, NICVF_SQ_RESET);
sq->cq_qs = qs->vnic_id;
sq->cq_idx = qidx;
/* Send a mailbox msg to PF to config SQ */
mbx.sq.msg = NIC_MBOX_MSG_SQ_CFG;
mbx.sq.qs_num = qs->vnic_id;
mbx.sq.sq_num = qidx;
mbx.sq.sqs_mode = nic->sqs_mode;
mbx.sq.cfg = (sq->cq_qs << 3) | sq->cq_idx;
nicvf_send_msg_to_pf(nic, &mbx);
/* Set queue base address */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_BASE, qidx,
(uint64_t)(sq->dmem.phys_base));
/* Enable send queue & set queue size */
sq_cfg.ena = 1;
sq_cfg.reset = 0;
sq_cfg.ldwb = 0;
sq_cfg.qsize = SND_QSIZE;
sq_cfg.tstmp_bgx_intf = 0;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, *(uint64_t *)&sq_cfg);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_THRESH, qidx, sq->thresh);
}
/* Configures receive buffer descriptor ring */
static void
nicvf_rbdr_config(struct nicvf *nic, struct queue_set *qs, int qidx,
boolean_t enable)
{
struct rbdr *rbdr;
struct rbdr_cfg rbdr_cfg;
rbdr = &qs->rbdr[qidx];
nicvf_reclaim_rbdr(nic, rbdr, qidx);
if (!enable)
return;
/* Set descriptor base address */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_BASE, qidx,
(uint64_t)(rbdr->dmem.phys_base));
/* Enable RBDR & set queue size */
/* Buffer size should be in multiples of 128 bytes */
rbdr_cfg.ena = 1;
rbdr_cfg.reset = 0;
rbdr_cfg.ldwb = 0;
rbdr_cfg.qsize = RBDR_SIZE;
rbdr_cfg.avg_con = 0;
rbdr_cfg.lines = rbdr->dma_size / 128;
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx,
*(uint64_t *)&rbdr_cfg);
/* Notify HW */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_DOOR, qidx,
qs->rbdr_len - 1);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_THRESH, qidx,
rbdr->thresh - 1);
}
/* Requests PF to assign and enable Qset */
void
nicvf_qset_config(struct nicvf *nic, boolean_t enable)
{
union nic_mbx mbx = {};
struct queue_set *qs;
struct qs_cfg *qs_cfg;
qs = nic->qs;
if (qs == NULL) {
device_printf(nic->dev,
"Qset is still not allocated, don't init queues\n");
return;
}
qs->enable = enable;
qs->vnic_id = nic->vf_id;
/* Send a mailbox msg to PF to config Qset */
mbx.qs.msg = NIC_MBOX_MSG_QS_CFG;
mbx.qs.num = qs->vnic_id;
mbx.qs.cfg = 0;
qs_cfg = (struct qs_cfg *)&mbx.qs.cfg;
if (qs->enable) {
qs_cfg->ena = 1;
qs_cfg->vnic = qs->vnic_id;
}
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_free_resources(struct nicvf *nic)
{
int qidx;
struct queue_set *qs;
qs = nic->qs;
/*
* Remove QS error task first since it has to be dead
* to safely free completion queue tasks.
*/
if (qs->qs_err_taskq != NULL) {
/* Shut down QS error tasks */
while (taskqueue_cancel(qs->qs_err_taskq,
&qs->qs_err_task, NULL) != 0) {
taskqueue_drain(qs->qs_err_taskq, &qs->qs_err_task);
}
taskqueue_free(qs->qs_err_taskq);
qs->qs_err_taskq = NULL;
}
/* Free receive buffer descriptor ring */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_free_rbdr(nic, &qs->rbdr[qidx]);
/* Free completion queue */
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_free_cmp_queue(nic, &qs->cq[qidx]);
/* Free send queue */
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_free_snd_queue(nic, &qs->sq[qidx]);
}
static int
nicvf_alloc_resources(struct nicvf *nic)
{
struct queue_set *qs = nic->qs;
int qidx;
/* Alloc receive buffer descriptor ring */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++) {
if (nicvf_init_rbdr(nic, &qs->rbdr[qidx], qs->rbdr_len,
DMA_BUFFER_LEN, qidx))
goto alloc_fail;
}
/* Alloc send queue */
for (qidx = 0; qidx < qs->sq_cnt; qidx++) {
if (nicvf_init_snd_queue(nic, &qs->sq[qidx], qs->sq_len, qidx))
goto alloc_fail;
}
/* Alloc completion queue */
for (qidx = 0; qidx < qs->cq_cnt; qidx++) {
if (nicvf_init_cmp_queue(nic, &qs->cq[qidx], qs->cq_len, qidx))
goto alloc_fail;
}
/* Allocate QS error taskqueue */
TASK_INIT(&qs->qs_err_task, 0, nicvf_qs_err_task, nic);
qs->qs_err_taskq = taskqueue_create_fast("nicvf_qs_err_taskq", M_WAITOK,
taskqueue_thread_enqueue, &qs->qs_err_taskq);
taskqueue_start_threads(&qs->qs_err_taskq, 1, PI_NET, "%s: qs_taskq",
device_get_nameunit(nic->dev));
return (0);
alloc_fail:
nicvf_free_resources(nic);
return (ENOMEM);
}
int
nicvf_set_qset_resources(struct nicvf *nic)
{
struct queue_set *qs;
qs = malloc(sizeof(*qs), M_NICVF, (M_ZERO | M_WAITOK));
nic->qs = qs;
/* Set count of each queue */
qs->rbdr_cnt = RBDR_CNT;
qs->rq_cnt = RCV_QUEUE_CNT;
qs->sq_cnt = SND_QUEUE_CNT;
qs->cq_cnt = CMP_QUEUE_CNT;
/* Set queue lengths */
qs->rbdr_len = RCV_BUF_COUNT;
qs->sq_len = SND_QUEUE_LEN;
qs->cq_len = CMP_QUEUE_LEN;
nic->rx_queues = qs->rq_cnt;
nic->tx_queues = qs->sq_cnt;
return (0);
}
int
nicvf_config_data_transfer(struct nicvf *nic, boolean_t enable)
{
boolean_t disable = FALSE;
struct queue_set *qs;
int qidx;
qs = nic->qs;
if (qs == NULL)
return (0);
if (enable) {
if (nicvf_alloc_resources(nic) != 0)
return (ENOMEM);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_snd_queue_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_cmp_queue_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_rbdr_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_rcv_queue_config(nic, qs, qidx, enable);
} else {
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_rcv_queue_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_rbdr_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_snd_queue_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_cmp_queue_config(nic, qs, qidx, disable);
nicvf_free_resources(nic);
}
return (0);
}
/*
* Get a free descriptor from the SQ
* and return its index.
*/
static __inline int
nicvf_get_sq_desc(struct snd_queue *sq, int desc_cnt)
{
int qentry;
qentry = sq->tail;
atomic_subtract_int(&sq->free_cnt, desc_cnt);
sq->tail += desc_cnt;
sq->tail &= (sq->dmem.q_len - 1);
return (qentry);
}
/* Free descriptor back to SQ for future use */
static void
nicvf_put_sq_desc(struct snd_queue *sq, int desc_cnt)
{
atomic_add_int(&sq->free_cnt, desc_cnt);
sq->head += desc_cnt;
sq->head &= (sq->dmem.q_len - 1);
}
static __inline int
nicvf_get_nxt_sqentry(struct snd_queue *sq, int qentry)
{
qentry++;
qentry &= (sq->dmem.q_len - 1);
return (qentry);
}
static void
nicvf_sq_enable(struct nicvf *nic, struct snd_queue *sq, int qidx)
{
uint64_t sq_cfg;
sq_cfg = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_CFG, qidx);
sq_cfg |= NICVF_SQ_EN;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, sq_cfg);
/* Ring doorbell so that H/W restarts processing SQEs */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_DOOR, qidx, 0);
}
static void
nicvf_sq_disable(struct nicvf *nic, int qidx)
{
uint64_t sq_cfg;
sq_cfg = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_CFG, qidx);
sq_cfg &= ~NICVF_SQ_EN;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, sq_cfg);
}
static void
nicvf_sq_free_used_descs(struct nicvf *nic, struct snd_queue *sq, int qidx)
{
- uint64_t head, tail;
+ uint64_t head;
struct snd_buff *snd_buff;
struct sq_hdr_subdesc *hdr;
NICVF_TX_LOCK(sq);
head = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_HEAD, qidx) >> 4;
- tail = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_TAIL, qidx) >> 4;
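/*
 * Advance the software head up to the hardware head, freeing any
 * mbufs still attached to the consumed descriptors.
 */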
while (sq->head != head) {
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, sq->head);
if (hdr->subdesc_type != SQ_DESC_TYPE_HEADER) {
nicvf_put_sq_desc(sq, 1);
continue;
}
snd_buff = &sq->snd_buff[sq->head];
if (snd_buff->mbuf != NULL) {
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
m_freem(snd_buff->mbuf);
sq->snd_buff[sq->head].mbuf = NULL;
}
nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
}
NICVF_TX_UNLOCK(sq);
}
/*
* Add SQ HEADER subdescriptor.
* First subdescriptor for every send descriptor.
*/
static __inline int
nicvf_sq_add_hdr_subdesc(struct snd_queue *sq, int qentry,
int subdesc_cnt, struct mbuf *mbuf, int len)
{
struct nicvf *nic;
struct sq_hdr_subdesc *hdr;
struct ether_vlan_header *eh;
#ifdef INET
struct ip *ip;
struct tcphdr *th;
#endif
uint16_t etype;
int ehdrlen, iphlen, poff, proto;
nic = sq->nic;
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, qentry);
sq->snd_buff[qentry].mbuf = mbuf;
memset(hdr, 0, SND_QUEUE_DESC_SIZE);
hdr->subdesc_type = SQ_DESC_TYPE_HEADER;
/* Enable notification via CQE after processing SQE */
hdr->post_cqe = 1;
/* Number of subdescriptors following this one */
hdr->subdesc_cnt = subdesc_cnt;
hdr->tot_len = len;
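/*
 * Parse the Ethernet, IP and L4 headers below so that the L3/L4
 * offsets and checksum offload flags can be filled in for the
 * hardware.
 */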
eh = mtod(mbuf, struct ether_vlan_header *);
if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
etype = ntohs(eh->evl_proto);
} else {
ehdrlen = ETHER_HDR_LEN;
etype = ntohs(eh->evl_encap_proto);
}
poff = proto = -1;
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
if (mbuf->m_len < ehdrlen + sizeof(struct ip6_hdr)) {
mbuf = m_pullup(mbuf, ehdrlen + sizeof(struct ip6_hdr));
sq->snd_buff[qentry].mbuf = NULL;
if (mbuf == NULL)
return (ENOBUFS);
}
poff = ip6_lasthdr(mbuf, ehdrlen, IPPROTO_IPV6, &proto);
if (poff < 0)
return (ENOBUFS);
poff += ehdrlen;
break;
#endif
#ifdef INET
case ETHERTYPE_IP:
if (mbuf->m_len < ehdrlen + sizeof(struct ip)) {
mbuf = m_pullup(mbuf, ehdrlen + sizeof(struct ip));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
if (mbuf->m_pkthdr.csum_flags & CSUM_IP)
hdr->csum_l3 = 1; /* Enable IP csum calculation */
ip = (struct ip *)(mbuf->m_data + ehdrlen);
iphlen = ip->ip_hl << 2;
poff = ehdrlen + iphlen;
proto = ip->ip_p;
break;
#endif
}
#if defined(INET6) || defined(INET)
if (poff > 0 && mbuf->m_pkthdr.csum_flags != 0) {
switch (proto) {
case IPPROTO_TCP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_TCP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct tcphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct tcphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_TCP;
break;
case IPPROTO_UDP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_UDP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct udphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct udphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_UDP;
break;
case IPPROTO_SCTP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_SCTP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct sctphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct sctphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_SCTP;
break;
default:
break;
}
hdr->l3_offset = ehdrlen;
hdr->l4_offset = poff;
}
if ((mbuf->m_pkthdr.tso_segsz != 0) && nic->hw_tso) {
th = (struct tcphdr *)((caddr_t)(mbuf->m_data + poff));
hdr->tso = 1;
hdr->tso_start = poff + (th->th_off * 4);
hdr->tso_max_paysize = mbuf->m_pkthdr.tso_segsz;
hdr->inner_l3_offset = ehdrlen - 2;
nic->drv_stats.tx_tso++;
}
#endif
return (0);
}
/*
* SQ GATHER subdescriptor
* Must follow HDR descriptor
*/
static inline void nicvf_sq_add_gather_subdesc(struct snd_queue *sq, int qentry,
int size, uint64_t data)
{
struct sq_gather_subdesc *gather;
qentry &= (sq->dmem.q_len - 1);
gather = (struct sq_gather_subdesc *)GET_SQ_DESC(sq, qentry);
memset(gather, 0, SND_QUEUE_DESC_SIZE);
gather->subdesc_type = SQ_DESC_TYPE_GATHER;
gather->ld_type = NIC_SEND_LD_TYPE_E_LDD;
gather->size = size;
gather->addr = data;
}
/* Put an mbuf on an SQ for packet transfer. */
static int
nicvf_tx_mbuf_locked(struct snd_queue *sq, struct mbuf **mbufp)
{
bus_dma_segment_t segs[256];
struct snd_buff *snd_buff;
size_t seg;
int nsegs, qentry;
int subdesc_cnt;
int err;
NICVF_TX_LOCK_ASSERT(sq);
if (sq->free_cnt == 0)
return (ENOBUFS);
snd_buff = &sq->snd_buff[sq->tail];
err = bus_dmamap_load_mbuf_sg(sq->snd_buff_dmat, snd_buff->dmap,
*mbufp, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(err != 0)) {
/* ARM64TODO: Add mbuf defragmenting if we lack maps */
m_freem(*mbufp);
*mbufp = NULL;
return (err);
}
/* Set how many subdescriptors are required: one header plus gather subdescriptors for the DMA segments */
subdesc_cnt = MIN_SQ_DESC_PER_PKT_XMIT + nsegs - 1;
if (subdesc_cnt > sq->free_cnt) {
/* ARM64TODO: Add mbuf defragmentation if we lack descriptors */
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
return (ENOBUFS);
}
qentry = nicvf_get_sq_desc(sq, subdesc_cnt);
/* Add SQ header subdesc */
err = nicvf_sq_add_hdr_subdesc(sq, qentry, subdesc_cnt - 1, *mbufp,
(*mbufp)->m_pkthdr.len);
if (err != 0) {
nicvf_put_sq_desc(sq, subdesc_cnt);
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
if (err == ENOBUFS) {
m_freem(*mbufp);
*mbufp = NULL;
}
return (err);
}
/* Add SQ gather subdescs */
for (seg = 0; seg < nsegs; seg++) {
qentry = nicvf_get_nxt_sqentry(sq, qentry);
nicvf_sq_add_gather_subdesc(sq, qentry, segs[seg].ds_len,
segs[seg].ds_addr);
}
/* make sure all memory stores are done before ringing doorbell */
bus_dmamap_sync(sq->dmem.dmat, sq->dmem.dmap, BUS_DMASYNC_PREWRITE);
dprintf(sq->nic->dev, "%s: sq->idx: %d, subdesc_cnt: %d\n",
__func__, sq->idx, subdesc_cnt);
/* Inform HW to xmit new packet */
nicvf_queue_reg_write(sq->nic, NIC_QSET_SQ_0_7_DOOR,
sq->idx, subdesc_cnt);
return (0);
}
static __inline u_int
frag_num(u_int i)
{
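/*
 * The 16-bit receive buffer lengths appear to be packed four to a
 * 64-bit word, so on big-endian hosts the index within each group
 * of four is reversed.
 */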
#if BYTE_ORDER == BIG_ENDIAN
return ((i & ~3) + 3 - (i & 3));
#else
return (i);
#endif
}
/* Returns MBUF for a received packet */
struct mbuf *
nicvf_get_rcv_mbuf(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
{
int frag;
int payload_len = 0;
struct mbuf *mbuf;
struct mbuf *mbuf_frag;
uint16_t *rb_lens = NULL;
uint64_t *rb_ptrs = NULL;
mbuf = NULL;
rb_lens = (uint16_t *)((uint8_t *)cqe_rx + (3 * sizeof(uint64_t)));
rb_ptrs = (uint64_t *)((uint8_t *)cqe_rx + (6 * sizeof(uint64_t)));
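/*
 * Buffer lengths and pointers follow the fixed part of the CQE:
 * lengths at byte offset 24, buffer pointers at byte offset 48.
 */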
dprintf(nic->dev, "%s rb_cnt %d rb0_ptr %lx rb0_sz %d\n",
__func__, cqe_rx->rb_cnt, cqe_rx->rb0_ptr, cqe_rx->rb0_sz);
for (frag = 0; frag < cqe_rx->rb_cnt; frag++) {
payload_len = rb_lens[frag_num(frag)];
if (frag == 0) {
/* First fragment */
mbuf = nicvf_rb_ptr_to_mbuf(nic,
(*rb_ptrs - cqe_rx->align_pad));
mbuf->m_len = payload_len;
mbuf->m_data += cqe_rx->align_pad;
if_setrcvif(mbuf, nic->ifp);
} else {
/* Add fragments */
mbuf_frag = nicvf_rb_ptr_to_mbuf(nic, *rb_ptrs);
m_append(mbuf, payload_len, mbuf_frag->m_data);
m_freem(mbuf_frag);
}
/* Next buffer pointer */
rb_ptrs++;
}
if (__predict_true(mbuf != NULL)) {
m_fixhdr(mbuf);
mbuf->m_pkthdr.flowid = cqe_rx->rq_idx;
M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE);
if (__predict_true((if_getcapenable(nic->ifp) & IFCAP_RXCSUM) != 0)) {
/*
* HW by default verifies IP & TCP/UDP/SCTP checksums
*/
if (__predict_true(cqe_rx->l3_type == L3TYPE_IPV4)) {
mbuf->m_pkthdr.csum_flags =
(CSUM_IP_CHECKED | CSUM_IP_VALID);
}
switch (cqe_rx->l4_type) {
case L4TYPE_UDP:
case L4TYPE_TCP: /* fall through */
mbuf->m_pkthdr.csum_flags |=
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
mbuf->m_pkthdr.csum_data = 0xffff;
break;
case L4TYPE_SCTP:
mbuf->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
break;
default:
break;
}
}
}
return (mbuf);
}
/* Enable interrupt */
void
nicvf_enable_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val;
reg_val = nicvf_reg_read(nic, NIC_VF_ENA_W1S);
switch (int_type) {
case NICVF_INTR_CQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val |= ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val |= (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val |= (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val |= (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to enable interrupt: unknown type\n");
break;
}
nicvf_reg_write(nic, NIC_VF_ENA_W1S, reg_val);
}
/* Disable interrupt */
void
nicvf_disable_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val = 0;
switch (int_type) {
case NICVF_INTR_CQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val |= ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val |= (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val |= (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val |= (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to disable interrupt: unknown type\n");
break;
}
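/* ENA_W1C is a write-1-to-clear register: only the selected bit is disabled. */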
nicvf_reg_write(nic, NIC_VF_ENA_W1C, reg_val);
}
/* Clear interrupt */
void
nicvf_clear_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val = 0;
switch (int_type) {
case NICVF_INTR_CQ:
reg_val = ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val = ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val = ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val = (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val = (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val = (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to clear interrupt: unknown type\n");
break;
}
nicvf_reg_write(nic, NIC_VF_INT, reg_val);
}
/* Check if interrupt is enabled */
int
nicvf_is_intr_enabled(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val;
uint64_t mask = 0xff;
reg_val = nicvf_reg_read(nic, NIC_VF_ENA_W1S);
switch (int_type) {
case NICVF_INTR_CQ:
mask = ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
mask = ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
mask = ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
mask = NICVF_INTR_PKT_DROP_MASK;
break;
case NICVF_INTR_TCP_TIMER:
mask = NICVF_INTR_TCP_TIMER_MASK;
break;
case NICVF_INTR_MBOX:
mask = NICVF_INTR_MBOX_MASK;
break;
case NICVF_INTR_QS_ERR:
mask = NICVF_INTR_QS_ERR_MASK;
break;
default:
device_printf(nic->dev,
"Failed to check interrupt enable: unknown type\n");
break;
}
return (reg_val & mask);
}
void
nicvf_update_rq_stats(struct nicvf *nic, int rq_idx)
{
struct rcv_queue *rq;
#define GET_RQ_STATS(reg) \
nicvf_reg_read(nic, NIC_QSET_RQ_0_7_STAT_0_1 |\
(rq_idx << NIC_Q_NUM_SHIFT) | (reg << 3))
rq = &nic->qs->rq[rq_idx];
rq->stats.bytes = GET_RQ_STATS(RQ_SQ_STATS_OCTS);
rq->stats.pkts = GET_RQ_STATS(RQ_SQ_STATS_PKTS);
}
void
nicvf_update_sq_stats(struct nicvf *nic, int sq_idx)
{
struct snd_queue *sq;
#define GET_SQ_STATS(reg) \
nicvf_reg_read(nic, NIC_QSET_SQ_0_7_STAT_0_1 |\
(sq_idx << NIC_Q_NUM_SHIFT) | (reg << 3))
sq = &nic->qs->sq[sq_idx];
sq->stats.bytes = GET_SQ_STATS(RQ_SQ_STATS_OCTS);
sq->stats.pkts = GET_SQ_STATS(RQ_SQ_STATS_PKTS);
}
/* Check for errors in the receive cmp.queue entry */
int
nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_rx_t *cqe_rx)
{
struct nicvf_hw_stats *stats = &nic->hw_stats;
struct nicvf_drv_stats *drv_stats = &nic->drv_stats;
if (!cqe_rx->err_level && !cqe_rx->err_opcode) {
drv_stats->rx_frames_ok++;
return (0);
}
switch (cqe_rx->err_opcode) {
case CQ_RX_ERROP_RE_PARTIAL:
stats->rx_bgx_truncated_pkts++;
break;
case CQ_RX_ERROP_RE_JABBER:
stats->rx_jabber_errs++;
break;
case CQ_RX_ERROP_RE_FCS:
stats->rx_fcs_errs++;
break;
case CQ_RX_ERROP_RE_RX_CTL:
stats->rx_bgx_errs++;
break;
case CQ_RX_ERROP_PREL2_ERR:
stats->rx_prel2_errs++;
break;
case CQ_RX_ERROP_L2_MAL:
stats->rx_l2_hdr_malformed++;
break;
case CQ_RX_ERROP_L2_OVERSIZE:
stats->rx_oversize++;
break;
case CQ_RX_ERROP_L2_UNDERSIZE:
stats->rx_undersize++;
break;
case CQ_RX_ERROP_L2_LENMISM:
stats->rx_l2_len_mismatch++;
break;
case CQ_RX_ERROP_L2_PCLP:
stats->rx_l2_pclp++;
break;
case CQ_RX_ERROP_IP_NOT:
stats->rx_ip_ver_errs++;
break;
case CQ_RX_ERROP_IP_CSUM_ERR:
stats->rx_ip_csum_errs++;
break;
case CQ_RX_ERROP_IP_MAL:
stats->rx_ip_hdr_malformed++;
break;
case CQ_RX_ERROP_IP_MALD:
stats->rx_ip_payload_malformed++;
break;
case CQ_RX_ERROP_IP_HOP:
stats->rx_ip_ttl_errs++;
break;
case CQ_RX_ERROP_L3_PCLP:
stats->rx_l3_pclp++;
break;
case CQ_RX_ERROP_L4_MAL:
stats->rx_l4_malformed++;
break;
case CQ_RX_ERROP_L4_CHK:
stats->rx_l4_csum_errs++;
break;
case CQ_RX_ERROP_UDP_LEN:
stats->rx_udp_len_errs++;
break;
case CQ_RX_ERROP_L4_PORT:
stats->rx_l4_port_errs++;
break;
case CQ_RX_ERROP_TCP_FLAG:
stats->rx_tcp_flag_errs++;
break;
case CQ_RX_ERROP_TCP_OFFSET:
stats->rx_tcp_offset_errs++;
break;
case CQ_RX_ERROP_L4_PCLP:
stats->rx_l4_pclp++;
break;
case CQ_RX_ERROP_RBDR_TRUNC:
stats->rx_truncated_pkts++;
break;
}
return (1);
}
/* Check for errors in the send cmp.queue entry */
int
nicvf_check_cqe_tx_errs(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_send_t *cqe_tx)
{
struct cmp_queue_stats *stats = &cq->stats;
switch (cqe_tx->send_status) {
case CQ_TX_ERROP_GOOD:
stats->tx.good++;
return (0);
case CQ_TX_ERROP_DESC_FAULT:
stats->tx.desc_fault++;
break;
case CQ_TX_ERROP_HDR_CONS_ERR:
stats->tx.hdr_cons_err++;
break;
case CQ_TX_ERROP_SUBDC_ERR:
stats->tx.subdesc_err++;
break;
case CQ_TX_ERROP_IMM_SIZE_OFLOW:
stats->tx.imm_size_oflow++;
break;
case CQ_TX_ERROP_DATA_SEQUENCE_ERR:
stats->tx.data_seq_err++;
break;
case CQ_TX_ERROP_MEM_SEQUENCE_ERR:
stats->tx.mem_seq_err++;
break;
case CQ_TX_ERROP_LOCK_VIOL:
stats->tx.lock_viol++;
break;
case CQ_TX_ERROP_DATA_FAULT:
stats->tx.data_fault++;
break;
case CQ_TX_ERROP_TSTMP_CONFLICT:
stats->tx.tstmp_conflict++;
break;
case CQ_TX_ERROP_TSTMP_TIMEOUT:
stats->tx.tstmp_timeout++;
break;
case CQ_TX_ERROP_MEM_FAULT:
stats->tx.mem_fault++;
break;
case CQ_TX_ERROP_CK_OVERLAP:
stats->tx.csum_overlap++;
break;
case CQ_TX_ERROP_CK_OFLOW:
stats->tx.csum_overflow++;
break;
}
return (1);
}
Index: head/sys/fs/cd9660/cd9660_vfsops.c
===================================================================
--- head/sys/fs/cd9660/cd9660_vfsops.c (revision 327172)
+++ head/sys/fs/cd9660/cd9660_vfsops.c (revision 327173)
@@ -1,857 +1,855 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley
* by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
* Support code is derived from software contributed to Berkeley
* by Atsushi Murai (amurai@spec.co.jp).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/cdio.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/iconv.h>
#include <fs/cd9660/iso.h>
#include <fs/cd9660/iso_rrip.h>
#include <fs/cd9660/cd9660_node.h>
#include <fs/cd9660/cd9660_mount.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure");
MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part");
struct iconv_functions *cd9660_iconv = NULL;
static vfs_mount_t cd9660_mount;
static vfs_cmount_t cd9660_cmount;
static vfs_unmount_t cd9660_unmount;
static vfs_root_t cd9660_root;
static vfs_statfs_t cd9660_statfs;
static vfs_vget_t cd9660_vget;
static vfs_fhtovp_t cd9660_fhtovp;
static struct vfsops cd9660_vfsops = {
.vfs_fhtovp = cd9660_fhtovp,
.vfs_mount = cd9660_mount,
.vfs_cmount = cd9660_cmount,
.vfs_root = cd9660_root,
.vfs_statfs = cd9660_statfs,
.vfs_unmount = cd9660_unmount,
.vfs_vget = cd9660_vget,
};
VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY);
MODULE_VERSION(cd9660, 1);
static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino);
static int iso_mountfs(struct vnode *devvp, struct mount *mp);
/*
* VFS Operations.
*/
static int
cd9660_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
struct iso_args args;
struct export_args exp;
int error;
error = copyin(data, &args, sizeof args);
if (error)
return (error);
vfs_oexport_conv(&args.export, &exp);
ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
ma = mount_arg(ma, "export", &exp, sizeof(exp));
ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64);
ma = mount_argsu(ma, "cs_local", args.cs_local, 64);
ma = mount_argf(ma, "ssector", "%u", args.ssector);
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip");
ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens");
ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt");
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet");
ma = mount_argb(ma,
args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet");
ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv");
error = kernel_mount(ma, flags);
return (error);
}
static int
cd9660_mount(struct mount *mp)
{
struct vnode *devvp;
struct thread *td;
char *fspec;
int error;
accmode_t accmode;
struct nameidata ndp;
struct iso_mnt *imp = NULL;
td = curthread;
/*
* Unconditionally mount as read-only.
*/
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
if (error)
return (error);
imp = VFSTOISOFS(mp);
if (mp->mnt_flag & MNT_UPDATE) {
if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
return (0);
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible block device.
*/
NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
if ((error = namei(&ndp)))
return (error);
NDFREE(&ndp, NDF_ONLY_PNBUF);
devvp = ndp.ni_vp;
if (!vn_isdisk(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* Verify that user has necessary permissions on the device,
* or has superuser abilities
*/
accmode = VREAD;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((mp->mnt_flag & MNT_UPDATE) == 0) {
error = iso_mountfs(devvp, mp);
if (error)
vrele(devvp);
} else {
if (devvp != imp->im_devvp)
error = EINVAL; /* needs translation */
vput(devvp);
}
if (error)
return (error);
vfs_mountedfrom(mp, fspec);
return (0);
}
/*
* Common code for mount and mountroot
*/
static int
iso_mountfs(devvp, mp)
struct vnode *devvp;
struct mount *mp;
{
struct iso_mnt *isomp = NULL;
struct buf *bp = NULL;
struct buf *pribp = NULL, *supbp = NULL;
struct cdev *dev;
int error = EINVAL;
int high_sierra = 0;
int iso_bsize;
int iso_blknum;
int joliet_level;
int isverified = 0;
struct iso_volume_descriptor *vdp = NULL;
struct iso_primary_descriptor *pri = NULL;
struct iso_sierra_primary_descriptor *pri_sierra = NULL;
struct iso_supplementary_descriptor *sup = NULL;
struct iso_directory_record *rootp;
int logical_block_size, ssector;
struct g_consumer *cp;
struct bufobj *bo;
char *cs_local, *cs_disk;
dev = devvp->v_rdev;
dev_ref(dev);
g_topology_lock();
error = g_vfs_open(devvp, &cp, "cd9660", 0);
if (error == 0)
g_getattr("MNT::verified", cp, &isverified);
g_topology_unlock();
VOP_UNLOCK(devvp, 0);
if (error)
goto out;
if (devvp->v_rdev->si_iosize_max != 0)
mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
if (mp->mnt_iosize_max > MAXPHYS)
mp->mnt_iosize_max = MAXPHYS;
bo = &devvp->v_bufobj;
/* This is the "logical sector size". The standard says this
* should be 2048 or the physical sector size on the device,
* whichever is greater.
*/
if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) {
error = EINVAL;
goto out;
}
iso_bsize = cp->provider->sectorsize;
joliet_level = 0;
if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector))
ssector = 0;
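/*
 * ISO 9660 volume descriptors start at logical sector 16 of the
 * session; scan forward until the Volume Descriptor Set Terminator.
 */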
for (iso_blknum = 16 + ssector;
iso_blknum < 100 + ssector;
iso_blknum++) {
if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE),
iso_bsize, NOCRED, &bp)) != 0)
goto out;
vdp = (struct iso_volume_descriptor *)bp->b_data;
if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) {
if (bcmp (vdp->id_sierra, ISO_SIERRA_ID,
sizeof vdp->id_sierra) != 0) {
error = EINVAL;
goto out;
} else
high_sierra = 1;
}
switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){
case ISO_VD_PRIMARY:
if (pribp == NULL) {
pribp = bp;
bp = NULL;
pri = (struct iso_primary_descriptor *)vdp;
pri_sierra =
(struct iso_sierra_primary_descriptor *)vdp;
}
break;
case ISO_VD_SUPPLEMENTARY:
if (supbp == NULL) {
supbp = bp;
bp = NULL;
sup = (struct iso_supplementary_descriptor *)vdp;
if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) {
if (bcmp(sup->escape, "%/@", 3) == 0)
joliet_level = 1;
if (bcmp(sup->escape, "%/C", 3) == 0)
joliet_level = 2;
if (bcmp(sup->escape, "%/E", 3) == 0)
joliet_level = 3;
if ((isonum_711 (sup->flags) & 1) &&
!vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0))
joliet_level = 0;
}
}
break;
case ISO_VD_END:
goto vd_end;
default:
break;
}
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
}
vd_end:
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
if (pri == NULL) {
error = EINVAL;
goto out;
}
logical_block_size =
isonum_723 (high_sierra?
pri_sierra->logical_block_size:
pri->logical_block_size);
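/* The logical block size must be a power of two between DEV_BSIZE and MAXBSIZE. */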
if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE
|| (logical_block_size & (logical_block_size - 1)) != 0) {
error = EINVAL;
goto out;
}
rootp = (struct iso_directory_record *)
(high_sierra?
pri_sierra->root_directory_record:
pri->root_directory_record);
isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO);
isomp->im_cp = cp;
isomp->im_bo = bo;
isomp->logical_block_size = logical_block_size;
isomp->volume_space_size =
isonum_733 (high_sierra?
pri_sierra->volume_space_size:
pri->volume_space_size);
isomp->joliet_level = 0;
/*
* Since an ISO9660 multi-session CD can also access previous
* sessions, we have to include them into the space consider-
* ations. This doesn't yield a very accurate number since
* parts of the old sessions might be inaccessible now, but we
* can't do much better. This is also important for the NFS
* filehandle validation.
*/
isomp->volume_space_size += ssector;
bcopy (rootp, isomp->root, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->im_bmask = logical_block_size - 1;
isomp->im_bshift = ffs(logical_block_size) - 1;
pribp->b_flags |= B_AGE;
brelse(pribp);
pribp = NULL;
rootp = NULL;
pri = NULL;
pri_sierra = NULL;
mp->mnt_data = isomp;
mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
mp->mnt_maxsymlinklen = 0;
MNT_ILOCK(mp);
if (isverified)
mp->mnt_flag |= MNT_VERIFIED;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED;
MNT_IUNLOCK(mp);
isomp->im_mountp = mp;
isomp->im_dev = dev;
isomp->im_devvp = devvp;
vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP);
vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS);
vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT);
vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET);
vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV);
/* Check the Rock Ridge Extension support */
if (!(isomp->im_flags & ISOFSMNT_NORRIP)) {
if ((error = bread(isomp->im_devvp, (isomp->root_extent +
isonum_711(((struct iso_directory_record *)isomp->root)->
ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT),
isomp->logical_block_size, NOCRED, &bp)) != 0)
goto out;
rootp = (struct iso_directory_record *)bp->b_data;
if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) {
isomp->im_flags |= ISOFSMNT_NORRIP;
} else {
isomp->im_flags &= ~ISOFSMNT_GENS;
}
/*
* The contents are valid,
* but they will get reread as part of another vnode, so...
*/
bp->b_flags |= B_AGE;
brelse(bp);
bp = NULL;
rootp = NULL;
}
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error);
if (error)
goto out;
cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error);
if (error)
goto out;
cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l);
cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d);
} else {
isomp->im_d2l = NULL;
isomp->im_l2d = NULL;
}
if (high_sierra) {
/* this effectively ignores all the mount flags */
if (bootverbose)
log(LOG_INFO, "cd9660: High Sierra Format\n");
isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA;
} else
switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) {
default:
isomp->iso_ftype = ISO_FTYPE_DEFAULT;
break;
case ISOFSMNT_GENS|ISOFSMNT_NORRIP:
isomp->iso_ftype = ISO_FTYPE_9660;
break;
case 0:
if (bootverbose)
log(LOG_INFO, "cd9660: RockRidge Extension\n");
isomp->iso_ftype = ISO_FTYPE_RRIP;
break;
}
/* Decide whether to use the Joliet descriptor */
if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) {
if (bootverbose)
log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n",
joliet_level);
rootp = (struct iso_directory_record *)
sup->root_directory_record;
bcopy (rootp, isomp->root, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->joliet_level = joliet_level;
supbp->b_flags |= B_AGE;
}
if (supbp) {
brelse(supbp);
supbp = NULL;
sup = NULL;
}
return 0;
out:
if (bp != NULL)
brelse(bp);
if (pribp != NULL)
brelse(pribp);
if (supbp != NULL)
brelse(supbp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (isomp) {
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
}
dev_rel(dev);
return error;
}
/*
* unmount system call
*/
static int
cd9660_unmount(mp, mntflags)
struct mount *mp;
int mntflags;
{
struct iso_mnt *isomp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags, curthread)))
return (error);
isomp = VFSTOISOFS(mp);
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
if (isomp->im_d2l)
cd9660_iconv->close(isomp->im_d2l);
if (isomp->im_l2d)
cd9660_iconv->close(isomp->im_l2d);
}
g_topology_lock();
g_vfs_close(isomp->im_cp);
g_topology_unlock();
vrele(isomp->im_devvp);
dev_rel(isomp->im_dev);
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (error);
}
/*
* Return root of a filesystem
*/
static int
cd9660_root(mp, flags, vpp)
struct mount *mp;
int flags;
struct vnode **vpp;
{
struct iso_mnt *imp = VFSTOISOFS(mp);
struct iso_directory_record *dp =
(struct iso_directory_record *)imp->root;
cd_ino_t ino = isodirino(dp, imp);
/*
* With RRIP we must use the `.' entry of the root directory.
* Simply tell vget, that it's a relocated directory.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
imp->iso_ftype == ISO_FTYPE_RRIP, dp));
}
/*
* Get filesystem statistics.
*/
static int
cd9660_statfs(mp, sbp)
struct mount *mp;
struct statfs *sbp;
{
struct iso_mnt *isomp;
isomp = VFSTOISOFS(mp);
sbp->f_bsize = isomp->logical_block_size;
sbp->f_iosize = sbp->f_bsize; /* XXX */
sbp->f_blocks = isomp->volume_space_size;
sbp->f_bfree = 0; /* total free blocks */
sbp->f_bavail = 0; /* blocks free for non superuser */
sbp->f_files = 0; /* total files */
sbp->f_ffree = 0; /* free file nodes */
return 0;
}
/*
* File handle to vnode
*
* Have to be really careful about stale file handles:
* - check that the inode number is in range
* - call iget() to get the locked inode
* - check for an unallocated inode (i_mode == 0)
* - check that the generation number matches
*/
/* ARGSUSED */
static int
cd9660_fhtovp(mp, fhp, flags, vpp)
struct mount *mp;
struct fid *fhp;
int flags;
struct vnode **vpp;
{
struct ifid ifh;
struct iso_node *ip;
struct vnode *nvp;
int error;
memcpy(&ifh, fhp, sizeof(ifh));
#ifdef ISOFS_DBG
printf("fhtovp: ino %d, start %ld\n",
ifh.ifid_ino, ifh.ifid_start);
#endif
if ((error = VFS_VGET(mp, ifh.ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = VTOI(nvp);
if (ip->inode.iso_mode == 0) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
vnode_create_vobject(*vpp, ip->i_size, curthread);
return (0);
}
/*
* Conform to standard VFS interface; can't vget arbitrary inodes beyond 4GB
* into media with current inode scheme and 32-bit ino_t. This shouldn't be
* needed for anything other than nfsd, and who exports a mounted DVD over NFS?
*/
static int
cd9660_vget(mp, ino, flags, vpp)
struct mount *mp;
ino_t ino;
int flags;
struct vnode **vpp;
{
/*
* XXXX
* It would be nice if we didn't always set the `relocated' flag
* and force the extra read, but I don't want to think about fixing
* that right now.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
#if 0
VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP,
#else
0,
#endif
(struct iso_directory_record *)0));
}
/* Use special comparator for full 64-bit ino comparison. */
static int
cd9660_vfs_hash_cmp(vp, pino)
struct vnode *vp;
void *pino;
{
struct iso_node *ip;
cd_ino_t ino;
ip = VTOI(vp);
ino = *(cd_ino_t *)pino;
return (ip->i_number != ino);
}
int
cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir)
struct mount *mp;
cd_ino_t ino;
int flags;
struct vnode **vpp;
int relocated;
struct iso_directory_record *isodir;
{
struct iso_mnt *imp;
struct iso_node *ip;
struct buf *bp;
struct vnode *vp;
- struct cdev *dev;
int error;
struct thread *td;
td = curthread;
error = vfs_hash_get(mp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
/*
* We must promote to an exclusive lock for vnode creation. This
* can happen if lookup is passed LOCKSHARED.
*/
if ((flags & LK_TYPE_MASK) == LK_SHARED) {
flags &= ~LK_TYPE_MASK;
flags |= LK_EXCLUSIVE;
}
/*
* We do not lock vnode creation as it is believed to be too
* expensive for such a rare case as simultaneous creation of a vnode
* for the same ino by different processes. We just allow them to race
* and check later to decide who wins. Let the race begin!
*/
imp = VFSTOISOFS(mp);
- dev = imp->im_dev;
/* Allocate a new vnode/iso_node. */
if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = malloc(sizeof(struct iso_node), M_ISOFSNODE,
M_WAITOK | M_ZERO);
vp->v_data = ip;
ip->i_vnode = vp;
ip->i_number = ino;
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
error = insmntque(vp, mp);
if (error != 0) {
free(ip, M_ISOFSNODE);
*vpp = NULLVP;
return (error);
}
error = vfs_hash_insert(vp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
if (isodir == NULL) {
int lbn, off;
lbn = lblkno(imp, ino);
if (lbn >= imp->volume_space_size) {
vput(vp);
printf("fhtovp: lbn exceed volume space %d\n", lbn);
return (ESTALE);
}
off = blkoff(imp, ino);
if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
vput(vp);
printf("fhtovp: crosses block boundary %d\n",
off + ISO_DIRECTORY_RECORD_SIZE);
return (ESTALE);
}
error = bread(imp->im_devvp,
lbn << (imp->im_bshift - DEV_BSHIFT),
imp->logical_block_size, NOCRED, &bp);
if (error) {
vput(vp);
brelse(bp);
printf("fhtovp: bread error %d\n",error);
return (error);
}
isodir = (struct iso_directory_record *)(bp->b_data + off);
if (off + isonum_711(isodir->length) >
imp->logical_block_size) {
vput(vp);
brelse(bp);
printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
off +isonum_711(isodir->length), off,
isonum_711(isodir->length));
return (ESTALE);
}
#if 0
if (isonum_733(isodir->extent) +
isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) {
brelse(bp);
printf("fhtovp: file start miss %d vs %d\n",
isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length),
ifhp->ifid_start);
return (ESTALE);
}
#endif
} else
bp = NULL;
ip->i_mnt = imp;
if (relocated) {
/*
* On relocated directories we must
* read the `.' entry out of a dir.
*/
ip->iso_start = ino >> imp->im_bshift;
if (bp != NULL)
brelse(bp);
if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) {
vput(vp);
return (error);
}
isodir = (struct iso_directory_record *)bp->b_data;
}
ip->iso_extent = isonum_733(isodir->extent);
ip->i_size = isonum_733(isodir->size);
ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent;
/*
* Setup time stamp, attribute
*/
vp->v_type = VNON;
switch (imp->iso_ftype) {
default: /* ISO_FTYPE_9660 */
{
struct buf *bp2;
int off;
if ((imp->im_flags & ISOFSMNT_EXTATT)
&& (off = isonum_711(isodir->ext_attr_length)))
cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL,
&bp2);
else
bp2 = NULL;
cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660);
cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660);
if (bp2)
brelse(bp2);
break;
}
case ISO_FTYPE_RRIP:
cd9660_rrip_analyze(isodir, ip, imp);
break;
}
brelse(bp);
/*
* Initialize the associated vnode
*/
switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) {
case VFIFO:
vp->v_op = &cd9660_fifoops;
break;
default:
VN_LOCK_ASHARE(vp);
break;
}
if (ip->iso_extent == imp->root_extent)
vp->v_vflag |= VV_ROOT;
/*
* XXX need generation number?
*/
*vpp = vp;
return (0);
}
Index: head/sys/fs/nfs/nfs_commonkrpc.c
===================================================================
--- head/sys/fs/nfs/nfs_commonkrpc.c (revision 327172)
+++ head/sys/fs/nfs/nfs_commonkrpc.c (revision 327173)
@@ -1,1347 +1,1344 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1991, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Socket operations for use by nfs
*/
#include "opt_kgssapi.h"
#include "opt_nfs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <rpc/rpc.h>
#include <rpc/krpc.h>
#include <kgssapi/krb5/kcrypto.h>
#include <fs/nfs/nfsport.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_nfsclient_nfs23_start_probe_func_t
dtrace_nfscl_nfs234_start_probe;
dtrace_nfsclient_nfs23_done_probe_func_t
dtrace_nfscl_nfs234_done_probe;
/*
* Registered probes by RPC type.
*/
uint32_t nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
#endif
NFSSTATESPINLOCK;
NFSREQSPINLOCK;
NFSDLOCKMUTEX;
NFSCLSTATEMUTEX;
extern struct nfsstatsv1 nfsstatsv1;
extern struct nfsreqhead nfsd_reqq;
extern int nfscl_ticks;
extern void (*ncl_call_invalcaches)(struct vnode *);
extern int nfs_numnfscbd;
extern int nfscl_debuglevel;
SVCPOOL *nfscbd_pool;
static int nfsrv_gsscallbackson = 0;
static int nfs_bufpackets = 4;
static int nfs_reconnects;
static int nfs3_jukebox_delay = 10;
static int nfs_skip_wcc_data_onerr = 1;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
"Buffer reservation size 2 < x < 64");
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
"Number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
"Number of seconds to delay a retry after receiving EJUKEBOX");
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
"Disable weak cache consistency checking when server returns an error");
static void nfs_down(struct nfsmount *, struct thread *, const char *,
int, int);
static void nfs_up(struct nfsmount *, struct thread *, const char *,
int, int);
static int nfs_msg(struct thread *, const char *, const char *, int);
struct nfs_cached_auth {
int ca_refs; /* refcount, including 1 from the cache */
uid_t ca_uid; /* uid that corresponds to this auth */
AUTH *ca_auth; /* RPC auth handle */
};
static int nfsv2_procid[NFS_V3NPROCS] = {
NFSV2PROC_NULL,
NFSV2PROC_GETATTR,
NFSV2PROC_SETATTR,
NFSV2PROC_LOOKUP,
NFSV2PROC_NOOP,
NFSV2PROC_READLINK,
NFSV2PROC_READ,
NFSV2PROC_WRITE,
NFSV2PROC_CREATE,
NFSV2PROC_MKDIR,
NFSV2PROC_SYMLINK,
NFSV2PROC_CREATE,
NFSV2PROC_REMOVE,
NFSV2PROC_RMDIR,
NFSV2PROC_RENAME,
NFSV2PROC_LINK,
NFSV2PROC_READDIR,
NFSV2PROC_NOOP,
NFSV2PROC_STATFS,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
};
/*
* Initialize sockets and congestion for a new NFS connection.
* We do not free the sockaddr if error.
*/
int
newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
struct ucred *cred, NFSPROC_T *p, int callback_retry_mult)
{
int rcvreserve, sndreserve;
int pktscale, pktscalesav;
struct sockaddr *saddr;
struct ucred *origcred;
CLIENT *client;
struct netconfig *nconf;
struct socket *so;
int one = 1, retries, error = 0;
struct thread *td = curthread;
SVCXPRT *xprt;
struct timeval timo;
/*
* We need to establish the socket using the credentials of
* the mountpoint. Some parts of this process (such as
* sobind() and soconnect()) will use the current thread's
* credential instead of the socket credential. To work
* around this, temporarily change the current thread's
* credential to that of the mountpoint.
*
* XXX: It would be better to explicitly pass the correct
* credential to sobind() and soconnect().
*/
origcred = td->td_ucred;
/*
* Use the credential in nr_cred, if not NULL.
*/
if (nrp->nr_cred != NULL)
td->td_ucred = nrp->nr_cred;
else
td->td_ucred = cred;
saddr = nrp->nr_nam;
if (saddr->sa_family == AF_INET)
if (nrp->nr_sotype == SOCK_DGRAM)
nconf = getnetconfigent("udp");
else
nconf = getnetconfigent("tcp");
else if (saddr->sa_family == AF_LOCAL)
nconf = getnetconfigent("local");
else
if (nrp->nr_sotype == SOCK_DGRAM)
nconf = getnetconfigent("udp6");
else
nconf = getnetconfigent("tcp6");
pktscale = nfs_bufpackets;
if (pktscale < 2)
pktscale = 2;
if (pktscale > 64)
pktscale = 64;
pktscalesav = pktscale;
/*
* soreserve() can fail if sb_max is too small, so shrink pktscale
* and try again if there is an error.
* Print a log message suggesting increasing sb_max.
* Creating a socket and doing this is necessary since, if the
* reservation sizes are too large and will make soreserve() fail,
* the connection will work until a large send is attempted and
* then it will loop in the krpc code.
*/
so = NULL;
saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
nrp->nr_soproto, td->td_ucred, td);
if (error) {
td->td_ucred = origcred;
goto out;
}
do {
if (error != 0 && pktscale > 2) {
if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
pktscale == pktscalesav)
printf("Consider increasing kern.ipc.maxsockbuf\n");
pktscale--;
}
if (nrp->nr_sotype == SOCK_DGRAM) {
if (nmp != NULL) {
sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
pktscale;
rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
pktscale;
} else {
sndreserve = rcvreserve = 1024 * pktscale;
}
} else {
if (nrp->nr_sotype != SOCK_STREAM)
panic("nfscon sotype");
if (nmp != NULL) {
sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
sizeof (u_int32_t)) * pktscale;
rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
sizeof (u_int32_t)) * pktscale;
} else {
sndreserve = rcvreserve = 1024 * pktscale;
}
}
error = soreserve(so, sndreserve, rcvreserve);
if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
pktscale <= 2)
printf("Must increase kern.ipc.maxsockbuf or reduce"
" rsize, wsize\n");
} while (error != 0 && pktscale > 2);
soclose(so);
if (error) {
td->td_ucred = origcred;
goto out;
}
client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
nrp->nr_vers, sndreserve, rcvreserve);
CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
if (nmp != NULL) {
if ((nmp->nm_flag & NFSMNT_INT))
CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
if ((nmp->nm_flag & NFSMNT_RESVPORT))
CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
if (NFSHASSOFT(nmp)) {
if (nmp->nm_sotype == SOCK_DGRAM)
/*
* For UDP, the large timeout for a reconnect
* will be set to "nm_retry * nm_timeo / 2", so
* we only want to do 2 reconnect timeout
* retries.
*/
retries = 2;
else
retries = nmp->nm_retry;
} else
retries = INT_MAX;
/* cred == NULL for DS connects. */
if (NFSHASNFSV4N(nmp) && cred != NULL) {
/*
* Make sure the nfscbd_pool doesn't get destroyed
* while doing this.
*/
NFSD_LOCK();
if (nfs_numnfscbd > 0) {
nfs_numnfscbd++;
NFSD_UNLOCK();
xprt = svc_vc_create_backchannel(nfscbd_pool);
CLNT_CONTROL(client, CLSET_BACKCHANNEL, xprt);
NFSD_LOCK();
nfs_numnfscbd--;
if (nfs_numnfscbd == 0)
wakeup(&nfs_numnfscbd);
}
NFSD_UNLOCK();
}
} else {
/*
* Three cases:
* - Null RPC callback to client
* - Non-Null RPC callback to client, wait a little longer
* - upcalls to nfsuserd and gssd (clp == NULL)
*/
if (callback_retry_mult == 0) {
retries = NFSV4_UPCALLRETRY;
CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
} else {
retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
}
}
CLNT_CONTROL(client, CLSET_RETRIES, &retries);
if (nmp != NULL) {
/*
* For UDP, there are 2 timeouts:
* - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
* that does a retransmit of an RPC request using the same
* socket and xid. This is what you normally want to do,
* since NFS servers depend on "same xid" for their
* Duplicate Request Cache.
* - timeout specified in CLNT_CALL_MBUF(), which specifies when
* retransmits on the same socket should fail and a fresh
* socket created. Each of these timeouts counts as one
* CLSET_RETRIES as set above.
* Set the initial retransmit timeout for UDP. This timeout
* doesn't exist for TCP and the following call just fails,
* which is ok.
*/
timo.tv_sec = nmp->nm_timeo / NFS_HZ;
timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
}
mtx_lock(&nrp->nr_mtx);
if (nrp->nr_client != NULL) {
mtx_unlock(&nrp->nr_mtx);
/*
* Someone else already connected.
*/
CLNT_RELEASE(client);
} else {
nrp->nr_client = client;
/*
* Protocols that do not require connections may be optionally
* left unconnected for servers that reply from a port other
* than NFS_PORT.
*/
if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
mtx_unlock(&nrp->nr_mtx);
CLNT_CONTROL(client, CLSET_CONNECT, &one);
} else
mtx_unlock(&nrp->nr_mtx);
}
/* Restore current thread's credentials. */
td->td_ucred = origcred;
out:
NFSEXITCODE(error);
return (error);
}
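/*
 * Illustrative sketch, not part of the source above: the CLSET_RETRY_TIMEOUT
 * value computed in newnfs_connect() is just a tick count (nm_timeo, in
 * NFS_HZ units) converted into a struct timeval.  The standalone helper
 * below shows that arithmetic with the tick rate passed in explicitly;
 * ticks_to_timeval() is a hypothetical name.
 */
#include <stdio.h>
#include <sys/time.h>

static struct timeval
ticks_to_timeval(int ticks, int hz)
{
	struct timeval tv;

	tv.tv_sec = ticks / hz;				/* whole seconds */
	tv.tv_usec = (ticks % hz) * 1000000 / hz;	/* remainder as usec */
	return (tv);
}

int
main(void)
{
	/* 25 ticks at 10 ticks per second is 2.5 seconds. */
	struct timeval tv = ticks_to_timeval(25, 10);

	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return (0);
}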
/*
* NFS disconnect. Clean up and unlink.
*/
void
newnfs_disconnect(struct nfssockreq *nrp)
{
CLIENT *client;
mtx_lock(&nrp->nr_mtx);
if (nrp->nr_client != NULL) {
client = nrp->nr_client;
nrp->nr_client = NULL;
mtx_unlock(&nrp->nr_mtx);
rpc_gss_secpurge_call(client);
CLNT_CLOSE(client);
CLNT_RELEASE(client);
} else {
mtx_unlock(&nrp->nr_mtx);
}
}
static AUTH *
nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
char *srv_principal, gss_OID mech_oid, struct ucred *cred)
{
rpc_gss_service_t svc;
AUTH *auth;
switch (secflavour) {
case RPCSEC_GSS_KRB5:
case RPCSEC_GSS_KRB5I:
case RPCSEC_GSS_KRB5P:
if (!mech_oid) {
if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
return (NULL);
}
if (secflavour == RPCSEC_GSS_KRB5)
svc = rpc_gss_svc_none;
else if (secflavour == RPCSEC_GSS_KRB5I)
svc = rpc_gss_svc_integrity;
else
svc = rpc_gss_svc_privacy;
if (clnt_principal == NULL)
auth = rpc_gss_secfind_call(nrp->nr_client, cred,
srv_principal, mech_oid, svc);
else {
auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
clnt_principal, srv_principal, "kerberosv5",
svc, NULL, NULL, NULL);
return (auth);
}
if (auth != NULL)
return (auth);
/* fallthrough */
case AUTH_SYS:
default:
return (authunix_create(cred));
}
}
/*
* Callback from the RPC code to generate up/down notifications.
*/
struct nfs_feedback_arg {
struct nfsmount *nf_mount;
int nf_lastmsg; /* last tprintf */
int nf_tprintfmsg;
struct thread *nf_td;
};
static void
nfs_feedback(int type, int proc, void *arg)
{
struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
struct nfsmount *nmp = nf->nf_mount;
time_t now;
switch (type) {
case FEEDBACK_REXMIT2:
case FEEDBACK_RECONNECT:
now = NFSD_MONOSEC;
if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
nfs_down(nmp, nf->nf_td,
"not responding", 0, NFSSTA_TIMEO);
nf->nf_tprintfmsg = TRUE;
nf->nf_lastmsg = now;
}
break;
case FEEDBACK_OK:
nfs_up(nf->nf_mount, nf->nf_td,
"is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
break;
}
}
/*
* newnfs_request - goes something like this
* - does the rpc by calling the krpc layer
* - break down rpc header and return with nfs reply
* nb: always frees up nd_mreq mbuf list
*/
int
newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
{
uint32_t retseq, retval, slotseq, *tl;
time_t waituntil;
int i = 0, j = 0, opcnt, set_sigset = 0, slot;
- int trycnt, error = 0, usegssname = 0, secflavour = AUTH_SYS;
+ int error = 0, usegssname = 0, secflavour = AUTH_SYS;
int freeslot, maxslot, reterr, slotpos, timeo;
u_int16_t procnum;
u_int trylater_delay = 1;
struct nfs_feedback_arg nf;
struct timeval timo;
AUTH *auth;
struct rpc_callextra ext;
enum clnt_stat stat;
struct nfsreq *rep = NULL;
char *srv_principal = NULL, *clnt_principal = NULL;
sigset_t oldset;
struct ucred *authcred;
struct nfsclsession *sep;
uint8_t sessionid[NFSX_V4SESSIONID];
sep = dssep;
if (xidp != NULL)
*xidp = 0;
/* Reject requests while attempting a forced unmount. */
if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
m_freem(nd->nd_mreq);
return (ESTALE);
}
/*
* Set authcred, which is used to acquire RPC credentials to
* the cred argument, by default. The crhold() should not be
* necessary, but will ensure that some future code change
* doesn't result in the credential being free'd prematurely.
*/
authcred = crhold(cred);
/* For client side interruptible mounts, mask off the signals. */
if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
newnfs_set_sigmask(td, &oldset);
set_sigset = 1;
}
/*
* XXX if not already connected call nfs_connect now. Longer
* term, change nfs_mount to call nfs_connect unconditionally
* and let clnt_reconnect_create handle reconnects.
*/
if (nrp->nr_client == NULL)
newnfs_connect(nmp, nrp, cred, td, 0);
/*
* For a client side mount, nmp is != NULL and clp == NULL. For
* server calls (callbacks or upcalls), nmp == NULL.
*/
if (clp != NULL) {
NFSLOCKSTATE();
if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
secflavour = RPCSEC_GSS_KRB5;
if (nd->nd_procnum != NFSPROC_NULL) {
if (clp->lc_flags & LCL_GSSINTEGRITY)
secflavour = RPCSEC_GSS_KRB5I;
else if (clp->lc_flags & LCL_GSSPRIVACY)
secflavour = RPCSEC_GSS_KRB5P;
}
}
NFSUNLOCKSTATE();
} else if (nmp != NULL && NFSHASKERB(nmp) &&
nd->nd_procnum != NFSPROC_NULL) {
if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
nd->nd_flag |= ND_USEGSSNAME;
if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
/*
* If there is a client side host based credential,
* use that, otherwise use the system uid, if set.
* The system uid is in the nmp->nm_sockreq.nr_cred
* credentials.
*/
if (nmp->nm_krbnamelen > 0) {
usegssname = 1;
clnt_principal = nmp->nm_krbname;
} else if (nmp->nm_uid != (uid_t)-1) {
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
} else if (nmp->nm_krbnamelen == 0 &&
nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
/*
* If there is no host based principal name and
* the system uid is set and this is root, use the
* system uid, since root won't have user
* credentials in a credentials cache file.
* The system uid is in the nmp->nm_sockreq.nr_cred
* credentials.
*/
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
if (NFSHASINTEGRITY(nmp))
secflavour = RPCSEC_GSS_KRB5I;
else if (NFSHASPRIVACY(nmp))
secflavour = RPCSEC_GSS_KRB5P;
else
secflavour = RPCSEC_GSS_KRB5;
srv_principal = NFSMNT_SRVKRBNAME(nmp);
} else if (nmp != NULL && !NFSHASKERB(nmp) &&
nd->nd_procnum != NFSPROC_NULL &&
(nd->nd_flag & ND_USEGSSNAME) != 0) {
/*
* Use the uid that did the mount when the RPC is doing
* NFSv4 system operations, as indicated by the
* ND_USEGSSNAME flag, for the AUTH_SYS case.
* The credentials in nm_sockreq.nr_cred were used for the
* mount.
*/
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
if (nmp != NULL) {
bzero(&nf, sizeof(struct nfs_feedback_arg));
nf.nf_mount = nmp;
nf.nf_td = td;
nf.nf_lastmsg = NFSD_MONOSEC -
((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
}
if (nd->nd_procnum == NFSPROC_NULL)
auth = authnone_create();
else if (usegssname) {
/*
* For this case, the authenticator is held in the
* nfssockreq structure, so don't release the reference count
* held on it. --> Don't AUTH_DESTROY() it in this function.
*/
if (nrp->nr_auth == NULL)
nrp->nr_auth = nfs_getauth(nrp, secflavour,
clnt_principal, srv_principal, NULL, authcred);
else
rpc_gss_refresh_auth_call(nrp->nr_auth);
auth = nrp->nr_auth;
} else
auth = nfs_getauth(nrp, secflavour, NULL,
srv_principal, NULL, authcred);
crfree(authcred);
if (auth == NULL) {
m_freem(nd->nd_mreq);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (EACCES);
}
bzero(&ext, sizeof(ext));
ext.rc_auth = auth;
if (nmp != NULL) {
ext.rc_feedback = nfs_feedback;
ext.rc_feedback_arg = &nf;
}
procnum = nd->nd_procnum;
if ((nd->nd_flag & ND_NFSV4) &&
nd->nd_procnum != NFSPROC_NULL &&
nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
procnum = NFSV4PROC_COMPOUND;
if (nmp != NULL) {
NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
/* Map the procnum to the old NFSv2 one, as required. */
if ((nd->nd_flag & ND_NFSV2) != 0) {
if (nd->nd_procnum < NFS_V3NPROCS)
procnum = nfsv2_procid[nd->nd_procnum];
else
procnum = NFSV2PROC_NOOP;
}
/*
* Now only used for the R_DONTRECOVER case, but until that is
* supported within the krpc code, I need to keep a queue of
* outstanding RPCs for nfsv4 client requests.
*/
if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq),
M_NFSDREQ, M_WAITOK);
#ifdef KDTRACE_HOOKS
if (dtrace_nfscl_nfs234_start_probe != NULL) {
uint32_t probe_id;
int probe_procnum;
if (nd->nd_flag & ND_NFSV4) {
probe_id =
nfscl_nfs4_start_probes[nd->nd_procnum];
probe_procnum = nd->nd_procnum;
} else if (nd->nd_flag & ND_NFSV3) {
probe_id = nfscl_nfs3_start_probes[procnum];
probe_procnum = procnum;
} else {
probe_id =
nfscl_nfs2_start_probes[nd->nd_procnum];
probe_procnum = procnum;
}
if (probe_id != 0)
(dtrace_nfscl_nfs234_start_probe)
(probe_id, vp, nd->nd_mreq, cred,
probe_procnum);
}
#endif
}
- trycnt = 0;
freeslot = -1; /* Set to slot that needs to be free'd */
tryagain:
slot = -1; /* Slot that needs a sequence# increment. */
/*
* This timeout specifies when a new socket should be created,
* along with new xid values. For UDP, this should be done
* infrequently, since retransmits of RPC requests should normally
* use the same xid.
*/
if (nmp == NULL) {
timo.tv_usec = 0;
if (clp == NULL)
timo.tv_sec = NFSV4_UPCALLTIMEO;
else
timo.tv_sec = NFSV4_CALLBACKTIMEO;
} else {
if (nrp->nr_sotype != SOCK_DGRAM) {
timo.tv_usec = 0;
if ((nmp->nm_flag & NFSMNT_NFSV4))
timo.tv_sec = INT_MAX;
else
timo.tv_sec = NFS_TCPTIMEO;
} else {
if (NFSHASSOFT(nmp)) {
/*
* CLSET_RETRIES is set to 2, so this should be
* half of the total timeout required.
*/
timeo = nmp->nm_retry * nmp->nm_timeo / 2;
if (timeo < 1)
timeo = 1;
timo.tv_sec = timeo / NFS_HZ;
timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
NFS_HZ;
} else {
/* For UDP hard mounts, use a large value. */
timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
timo.tv_usec = 0;
}
}
if (rep != NULL) {
rep->r_flags = 0;
rep->r_nmp = nmp;
/*
* Chain request into list of outstanding requests.
*/
NFSLOCKREQ();
TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
NFSUNLOCKREQ();
}
}
nd->nd_mrep = NULL;
if (clp != NULL && sep != NULL)
stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
else
stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
nd->nd_mreq, &nd->nd_mrep, timo);
if (rep != NULL) {
/*
* RPC done, unlink the request.
*/
NFSLOCKREQ();
TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
NFSUNLOCKREQ();
}
/*
* If there was a successful reply and a tprintf msg,
* tprintf a response.
*/
if (stat == RPC_SUCCESS) {
error = 0;
} else if (stat == RPC_TIMEDOUT) {
NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
error = ETIMEDOUT;
} else if (stat == RPC_VERSMISMATCH) {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EOPNOTSUPP;
} else if (stat == RPC_PROGVERSMISMATCH) {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EPROTONOSUPPORT;
} else if (stat == RPC_INTR) {
error = EINTR;
} else {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EACCES;
}
if (error) {
m_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (error);
}
KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
/*
* Search for any mbufs that are not a multiple of 4 bytes long
* or with m_data not longword aligned.
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
newnfs_realign(&nd->nd_mrep, M_WAITOK);
nd->nd_md = nd->nd_mrep;
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
nd->nd_repstat = 0;
if (nd->nd_procnum != NFSPROC_NULL &&
nd->nd_procnum != NFSV4PROC_CBNULL) {
/* If sep == NULL, set it to the default in nmp. */
if (sep == NULL && nmp != NULL)
sep = nfsmnt_mdssession(nmp);
/*
* and now the actual NFS xdr.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
if (nd->nd_repstat >= 10000)
NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
(int)nd->nd_repstat);
/*
* Get rid of the tag, return count and SEQUENCE result for
* NFSv4.
*/
if ((nd->nd_flag & ND_NFSV4) != 0) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
opcnt = fxdr_unsigned(int, *tl++);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
if (j >= 10000)
NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
/*
* If the first op is Sequence, free up the slot.
*/
if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
(clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0))
NFSCL_DEBUG(1, "failed seq=%d\n", j);
if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
(clp != NULL && i == NFSV4OP_CBSEQUENCE && j == 0)
) {
if (i == NFSV4OP_SEQUENCE)
NFSM_DISSECT(tl, uint32_t *,
NFSX_V4SESSIONID +
5 * NFSX_UNSIGNED);
else
NFSM_DISSECT(tl, uint32_t *,
NFSX_V4SESSIONID +
4 * NFSX_UNSIGNED);
mtx_lock(&sep->nfsess_mtx);
if (bcmp(tl, sep->nfsess_sessionid,
NFSX_V4SESSIONID) == 0) {
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
retseq = fxdr_unsigned(uint32_t, *tl++);
slot = fxdr_unsigned(int, *tl++);
freeslot = slot;
if (retseq != sep->nfsess_slotseq[slot])
printf("retseq diff 0x%x\n",
retseq);
retval = fxdr_unsigned(uint32_t, *++tl);
if ((retval + 1) < sep->nfsess_foreslots
)
sep->nfsess_foreslots = (retval
+ 1);
else if ((retval + 1) >
sep->nfsess_foreslots)
sep->nfsess_foreslots = (retval
< 64) ? (retval + 1) : 64;
}
mtx_unlock(&sep->nfsess_mtx);
/* Grab the op and status for the next one. */
if (opcnt > 1) {
NFSM_DISSECT(tl, uint32_t *,
2 * NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
}
}
}
if (nd->nd_repstat != 0) {
if (nd->nd_repstat == NFSERR_BADSESSION &&
nmp != NULL && dssep == NULL) {
/*
* If this is a client side MDS RPC, mark
* the MDS session defunct and initiate
* recovery, as required.
* The nfsess_defunct field is protected by
* the NFSLOCKMNT()/nm_mtx lock and not the
* nfsess_mtx lock to simplify its handling,
* for the MDS session. This lock is also
* sufficient for nfsess_sessionid, since it
* never changes in the structure.
*/
NFSCL_DEBUG(1, "Got badsession\n");
NFSLOCKCLSTATE();
NFSLOCKMNT(nmp);
sep = NFSMNT_MDSSESSION(nmp);
if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
NFSX_V4SESSIONID) == 0) {
/* Initiate recovery. */
sep->nfsess_defunct = 1;
NFSCL_DEBUG(1, "Marked defunct\n");
if (nmp->nm_clp != NULL) {
nmp->nm_clp->nfsc_flags |=
NFSCLFLAGS_RECOVER;
wakeup(nmp->nm_clp);
}
}
NFSUNLOCKCLSTATE();
/*
* Sleep for up to 1sec waiting for a new
* session.
*/
mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
"nfsbadsess", hz);
/*
* Get the session again, in case a new one
* has been created during the sleep.
*/
sep = NFSMNT_MDSSESSION(nmp);
NFSUNLOCKMNT(nmp);
if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
reterr = nfsv4_sequencelookup(nmp, sep,
&slotpos, &maxslot, &slotseq,
sessionid);
if (reterr == 0) {
/* Fill in new session info. */
NFSCL_DEBUG(1,
"Filling in new sequence\n");
tl = nd->nd_sequence;
bcopy(sessionid, tl,
NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID /
NFSX_UNSIGNED;
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl = txdr_unsigned(maxslot);
}
if (reterr == NFSERR_BADSESSION ||
reterr == 0) {
NFSCL_DEBUG(1,
"Badsession looping\n");
m_freem(nd->nd_mrep);
nd->nd_mrep = NULL;
goto tryagain;
}
nd->nd_repstat = reterr;
NFSCL_DEBUG(1, "Got err=%d\n", reterr);
}
}
if (((nd->nd_repstat == NFSERR_DELAY ||
nd->nd_repstat == NFSERR_GRACE) &&
(nd->nd_flag & ND_NFSV4) &&
nd->nd_procnum != NFSPROC_DELEGRETURN &&
nd->nd_procnum != NFSPROC_SETATTR &&
nd->nd_procnum != NFSPROC_READ &&
nd->nd_procnum != NFSPROC_READDS &&
nd->nd_procnum != NFSPROC_WRITE &&
nd->nd_procnum != NFSPROC_WRITEDS &&
nd->nd_procnum != NFSPROC_OPEN &&
nd->nd_procnum != NFSPROC_CREATE &&
nd->nd_procnum != NFSPROC_OPENCONFIRM &&
nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
nd->nd_procnum != NFSPROC_CLOSE &&
nd->nd_procnum != NFSPROC_LOCK &&
nd->nd_procnum != NFSPROC_LOCKU) ||
(nd->nd_repstat == NFSERR_DELAY &&
(nd->nd_flag & ND_NFSV4) == 0) ||
nd->nd_repstat == NFSERR_RESOURCE) {
if (trylater_delay > NFS_TRYLATERDEL)
trylater_delay = NFS_TRYLATERDEL;
waituntil = NFSD_MONOSEC + trylater_delay;
while (NFSD_MONOSEC < waituntil)
(void) nfs_catnap(PZERO, 0, "nfstry");
trylater_delay *= 2;
if (slot != -1) {
mtx_lock(&sep->nfsess_mtx);
sep->nfsess_slotseq[slot]++;
*nd->nd_slotseq = txdr_unsigned(
sep->nfsess_slotseq[slot]);
mtx_unlock(&sep->nfsess_mtx);
}
m_freem(nd->nd_mrep);
nd->nd_mrep = NULL;
goto tryagain;
}
/*
* If the File Handle was stale, invalidate the
* lookup cache, just in case.
* (vp != NULL implies a client side call)
*/
if (nd->nd_repstat == ESTALE && vp != NULL) {
cache_purge(vp);
if (ncl_call_invalcaches != NULL)
(*ncl_call_invalcaches)(vp);
}
}
if ((nd->nd_flag & ND_NFSV4) != 0) {
/* Free the slot, as required. */
if (freeslot != -1)
nfsv4_freeslot(sep, freeslot);
/*
* If this op is Putfh, throw its results away.
*/
if (j >= 10000)
NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
if (j >= 10000)
NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
j);
/*
* All Compounds that do an Op that must
* be in sequence consist of NFSV4OP_PUTFH
* followed by one of these. As such, we
* can determine if the seqid# should be
* incremented, here.
*/
if ((i == NFSV4OP_OPEN ||
i == NFSV4OP_OPENCONFIRM ||
i == NFSV4OP_OPENDOWNGRADE ||
i == NFSV4OP_CLOSE ||
i == NFSV4OP_LOCK ||
i == NFSV4OP_LOCKU) &&
(j == 0 ||
(j != NFSERR_STALECLIENTID &&
j != NFSERR_STALESTATEID &&
j != NFSERR_BADSTATEID &&
j != NFSERR_BADSEQID &&
j != NFSERR_BADXDR &&
j != NFSERR_RESOURCE &&
j != NFSERR_NOFILEHANDLE)))
nd->nd_flag |= ND_INCRSEQID;
}
/*
* If this op's status is non-zero, mark
* that there is no more data to process.
* The exception is Setattr, which always has xdr
* when it has failed.
*/
if (j != 0 && i != NFSV4OP_SETATTR)
nd->nd_flag |= ND_NOMOREDATA;
/*
* If R_DONTRECOVER is set, replace the stale error
* reply, so that recovery isn't initiated.
*/
if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
nd->nd_repstat == NFSERR_BADSESSION ||
nd->nd_repstat == NFSERR_STALESTATEID) &&
rep != NULL && (rep->r_flags & R_DONTRECOVER))
nd->nd_repstat = NFSERR_STALEDONTRECOVER;
}
}
#ifdef KDTRACE_HOOKS
if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
uint32_t probe_id;
int probe_procnum;
if (nd->nd_flag & ND_NFSV4) {
probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
probe_procnum = nd->nd_procnum;
} else if (nd->nd_flag & ND_NFSV3) {
probe_id = nfscl_nfs3_done_probes[procnum];
probe_procnum = procnum;
} else {
probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
probe_procnum = procnum;
}
if (probe_id != 0)
(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
nd->nd_mreq, cred, probe_procnum, 0);
}
#endif
m_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (0);
nfsmout:
mbuf_freem(nd->nd_mrep);
mbuf_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (error);
}
/*
* Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
* wait for all requests to complete. This is used by forced unmounts
* to terminate any outstanding RPCs.
*/
int
newnfs_nmcancelreqs(struct nfsmount *nmp)
{
struct nfsclds *dsp;
struct __rpc_client *cl;
if (nmp->nm_sockreq.nr_client != NULL)
CLNT_CLOSE(nmp->nm_sockreq.nr_client);
lookformore:
NFSLOCKMNT(nmp);
TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
NFSLOCKDS(dsp);
if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
(dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
dsp->nfsclds_sockp != NULL &&
dsp->nfsclds_sockp->nr_client != NULL) {
dsp->nfsclds_flags |= NFSCLDS_CLOSED;
cl = dsp->nfsclds_sockp->nr_client;
NFSUNLOCKDS(dsp);
NFSUNLOCKMNT(nmp);
CLNT_CLOSE(cl);
goto lookformore;
}
NFSUNLOCKDS(dsp);
}
NFSUNLOCKMNT(nmp);
return (0);
}
/*
* Any signal that can interrupt an NFS operation in an intr mount
* should be added to this set. SIGSTOP and SIGKILL cannot be masked.
*/
int newnfs_sig_set[] = {
SIGINT,
SIGTERM,
SIGHUP,
SIGKILL,
SIGQUIT
};
/*
* Check to see if one of the signals in our subset is pending on
* the process (in an intr mount).
*/
static int
nfs_sig_pending(sigset_t set)
{
int i;
for (i = 0 ; i < nitems(newnfs_sig_set); i++)
if (SIGISMEMBER(set, newnfs_sig_set[i]))
return (1);
return (0);
}
/*
* The set/restore sigmask functions are used to (temporarily) overwrite
* the thread td_sigmask during an RPC call (for example). These are also
* used in other places in the NFS client that might tsleep().
*/
void
newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
{
sigset_t newset;
int i;
struct proc *p;
SIGFILLSET(newset);
if (td == NULL)
td = curthread; /* XXX */
p = td->td_proc;
/* Remove the NFS set of signals from newset */
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
/*
* But make sure we leave the ones already masked
* by the process, i.e. remove the signal from the
* temporary signalmask only if it wasn't already
* in p_sigmask.
*/
if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
!SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
SIGDELSET(newset, newnfs_sig_set[i]);
}
mtx_unlock(&p->p_sigacts->ps_mtx);
kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
SIGPROCMASK_PROC_LOCKED);
PROC_UNLOCK(p);
}
void
newnfs_restore_sigmask(struct thread *td, sigset_t *set)
{
if (td == NULL)
td = curthread; /* XXX */
kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
}
/*
* NFS wrapper to msleep() that shoves a new p_sigmask and restores the
* old one after msleep() returns.
*/
int
newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
{
sigset_t oldset;
int error;
- struct proc *p;
-
+
if ((priority & PCATCH) == 0)
return msleep(ident, mtx, priority, wmesg, timo);
if (td == NULL)
td = curthread; /* XXX */
newnfs_set_sigmask(td, &oldset);
error = msleep(ident, mtx, priority, wmesg, timo);
newnfs_restore_sigmask(td, &oldset);
- p = td->td_proc;
return (error);
}
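/*
 * Illustrative sketch, not part of the source above: newnfs_msleep() wraps
 * a sleep in the same "block everything except the interruptible set, wait,
 * then restore" pattern that newnfs_set_sigmask()/newnfs_restore_sigmask()
 * implement.  The userland analogue below uses sigprocmask(2); the helper
 * name blocking_call_with_intr() is hypothetical.
 */
#include <signal.h>
#include <stddef.h>
#include <unistd.h>

static const int intr_sigs[] = { SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT };

static void
blocking_call_with_intr(void)
{
	sigset_t newset, oldset;
	size_t i;

	/* Block every signal except those allowed to interrupt the wait. */
	sigfillset(&newset);
	for (i = 0; i < sizeof(intr_sigs) / sizeof(intr_sigs[0]); i++)
		sigdelset(&newset, intr_sigs[i]);
	sigprocmask(SIG_SETMASK, &newset, &oldset);

	sleep(1);	/* stand-in for the interruptible msleep() */

	/* Restore the caller's original mask. */
	sigprocmask(SIG_SETMASK, &oldset, NULL);
}

int
main(void)
{
	blocking_call_with_intr();
	return (0);
}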
/*
* Test for a termination condition pending on the process.
* This is used for NFSMNT_INT mounts.
*/
int
newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
{
struct proc *p;
sigset_t tmpset;
/* Terminate all requests while attempting a forced unmount. */
if (NFSCL_FORCEDISM(nmp->nm_mountp))
return (EIO);
if (!(nmp->nm_flag & NFSMNT_INT))
return (0);
if (td == NULL)
return (0);
p = td->td_proc;
PROC_LOCK(p);
tmpset = p->p_siglist;
SIGSETOR(tmpset, td->td_siglist);
SIGSETNAND(tmpset, td->td_sigmask);
mtx_lock(&p->p_sigacts->ps_mtx);
SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
mtx_unlock(&p->p_sigacts->ps_mtx);
if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
&& nfs_sig_pending(tmpset)) {
PROC_UNLOCK(p);
return (EINTR);
}
PROC_UNLOCK(p);
return (0);
}
static int
nfs_msg(struct thread *td, const char *server, const char *msg, int error)
{
struct proc *p;
p = td ? td->td_proc : NULL;
if (error) {
tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
server, msg, error);
} else {
tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
}
return (0);
}
static void
nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
int error, int flags)
{
if (nmp == NULL)
return;
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
nmp->nm_state |= NFSSTA_TIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 0);
} else
mtx_unlock(&nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
nmp->nm_state |= NFSSTA_LOCKTIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 0);
} else
mtx_unlock(&nmp->nm_mtx);
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
}
static void
nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
int flags, int tprintfmsg)
{
if (nmp == NULL)
return;
if (tprintfmsg) {
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
}
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
nmp->nm_state &= ~NFSSTA_TIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 1);
} else
mtx_unlock(&nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 1);
} else
mtx_unlock(&nmp->nm_mtx);
}
Index: head/sys/fs/nfs/nfs_commonsubs.c
===================================================================
--- head/sys/fs/nfs/nfs_commonsubs.c (revision 327172)
+++ head/sys/fs/nfs/nfs_commonsubs.c (revision 327173)
@@ -1,4242 +1,4241 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* These functions support the macros and help fiddle mbuf chains for
* the nfs op functions. They do things like create the rpc header and
* copy data between mbuf chains and uio lists.
*/
#ifndef APPLEKEXT
#include "opt_inet6.h"
#include <fs/nfs/nfsport.h>
#include <security/mac/mac_framework.h>
/*
* Data items converted to xdr at startup, since they are constant
* This is kinda hokey, but may save a little time doing byte swaps
*/
u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
/* And other global data */
nfstype nfsv34_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFSOCK,
NFFIFO, NFNON };
enum vtype newnv2tov_type[8] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON };
enum vtype nv34tov_type[8]={ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO };
struct timeval nfsboottime; /* Copy boottime once, so it never changes */
int nfscl_ticks;
int nfsrv_useacl = 1;
struct nfssockreq nfsrv_nfsuserdsock;
int nfsrv_nfsuserd = 0;
struct nfsreqhead nfsd_reqq;
uid_t nfsrv_defaultuid = UID_NOBODY;
gid_t nfsrv_defaultgid = GID_NOGROUP;
int nfsrv_lease = NFSRV_LEASE;
int ncl_mbuf_mlen = MLEN;
int nfsd_enable_stringtouid = 0;
static int nfs_enable_uidtostring = 0;
NFSNAMEIDMUTEX;
NFSSOCKMUTEX;
extern int nfsrv_lughashsize;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, enable_uidtostring, CTLFLAG_RW,
&nfs_enable_uidtostring, 0, "Make nfs always send numeric owner_names");
/*
* This array of structures indicates, for V4:
* retfh - which of 3 types of calling args are used
* 0 - doesn't change cfh or use a sfh
* 1 - replaces cfh with a new one (unless it returns an error status)
* 2 - uses cfh and sfh
* needscfh - if the op wants a cfh and premtime
* 0 - doesn't use a cfh
* 1 - uses a cfh, but doesn't want pre-op attributes
* 2 - uses a cfh and wants pre-op attributes
* savereply - indicates a non-idempotent Op
* 0 - not non-idempotent
* 1 - non-idempotent
* Ops that are ordered via seqid# are handled separately from these
* non-idempotent Ops.
* Define it here, since it is used by both the client and server.
*/
struct nfsv4_opflag nfsv4_opflag[NFSV41_NOPS] = {
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Access */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Close */
{ 0, 2, 0, 1, LK_EXCLUSIVE, 1, 1 }, /* Commit */
{ 1, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Create */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegpurge */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegreturn */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Getattr */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* GetFH */
{ 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Link */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Lock */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockT */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockU */
{ 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookup */
{ 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookupp */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* NVerify */
{ 1, 1, 0, 1, LK_EXCLUSIVE, 1, 0 }, /* Open */
{ 1, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenAttr */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenConfirm */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenDowngrade */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutFH */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutPubFH */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutRootFH */
{ 0, 1, 0, 0, LK_SHARED, 1, 0 }, /* Read */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Readdir */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* ReadLink */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Remove */
{ 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Rename */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Renew */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* RestoreFH */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SaveFH */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SecInfo */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Setattr */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientIDConfirm */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Verify */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Write */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* ReleaseLockOwner */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Backchannel Ctrl */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Bind Conn to Sess */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Exchange ID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Create Session */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy Session */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Free StateID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Dir Deleg */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device Info */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device List */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Layout Commit */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Layout Get */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Layout Return */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Secinfo No name */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Sequence */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Set SSV */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Test StateID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Want Delegation */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy ClientID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Reclaim Complete */
};
#endif /* !APPLEKEXT */
static int ncl_mbuf_mhlen = MHLEN;
static int nfsrv_usercnt = 0;
static int nfsrv_dnsnamelen;
static u_char *nfsrv_dnsname = NULL;
static int nfsrv_usermax = 999999999;
struct nfsrv_lughash {
struct mtx mtx;
struct nfsuserhashhead lughead;
};
static struct nfsrv_lughash *nfsuserhash;
static struct nfsrv_lughash *nfsusernamehash;
static struct nfsrv_lughash *nfsgrouphash;
static struct nfsrv_lughash *nfsgroupnamehash;
/*
* This static array indicates whether or not the RPC generates a large
* reply. This is used by nfs_reply() to decide whether or not an mbuf
* cluster should be allocated. (If a cluster is required by an RPC
* marked 0 in this array, the code will still work, just not quite as
* efficiently.)
*/
int nfs_bigreply[NFSV41_NPROCS] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 };
/* local functions */
static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep);
static void nfsv4_wanted(struct nfsv4lock *lp);
static int nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len);
static int nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name,
NFSPROC_T *p);
static void nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser);
static int nfsrv_getrefstr(struct nfsrv_descript *, u_char **, u_char **,
int *, int *);
static void nfsrv_refstrbigenough(int, u_char **, u_char **, int *);
#ifndef APPLE
/*
* copies mbuf chain to the uio scatter/gather list
*/
int
nfsm_mbufuio(struct nfsrv_descript *nd, struct uio *uiop, int siz)
{
char *mbufcp, *uiocp;
int xfer, left, len;
mbuf_t mp;
long uiosiz, rem;
int error = 0;
mp = nd->nd_md;
mbufcp = nd->nd_dpos;
len = NFSMTOD(mp, caddr_t) + mbuf_len(mp) - mbufcp;
rem = NFSM_RNDUP(siz) - siz;
while (siz > 0) {
if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) {
error = EBADRPC;
goto out;
}
left = uiop->uio_iov->iov_len;
uiocp = uiop->uio_iov->iov_base;
if (left > siz)
left = siz;
uiosiz = left;
while (left > 0) {
while (len == 0) {
mp = mbuf_next(mp);
if (mp == NULL) {
error = EBADRPC;
goto out;
}
mbufcp = NFSMTOD(mp, caddr_t);
len = mbuf_len(mp);
KASSERT(len >= 0,
("len %d, corrupted mbuf?", len));
}
xfer = (left > len) ? len : left;
#ifdef notdef
/* Not Yet.. */
if (uiop->uio_iov->iov_op != NULL)
(*(uiop->uio_iov->iov_op))
(mbufcp, uiocp, xfer);
else
#endif
if (uiop->uio_segflg == UIO_SYSSPACE)
NFSBCOPY(mbufcp, uiocp, xfer);
else
copyout(mbufcp, CAST_USER_ADDR_T(uiocp), xfer);
left -= xfer;
len -= xfer;
mbufcp += xfer;
uiocp += xfer;
uiop->uio_offset += xfer;
uiop->uio_resid -= xfer;
}
if (uiop->uio_iov->iov_len <= siz) {
uiop->uio_iovcnt--;
uiop->uio_iov++;
} else {
uiop->uio_iov->iov_base = (void *)
((char *)uiop->uio_iov->iov_base + uiosiz);
uiop->uio_iov->iov_len -= uiosiz;
}
siz -= uiosiz;
}
nd->nd_dpos = mbufcp;
nd->nd_md = mp;
if (rem > 0) {
if (len < rem)
error = nfsm_advance(nd, rem, len);
else
nd->nd_dpos += rem;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
#endif /* !APPLE */
/*
* Help break down an mbuf chain by making the first siz bytes contiguous,
* pointed to by the returned val.
* This is used by the macro NFSM_DISSECT for tough
* cases.
*/
APPLESTATIC void *
nfsm_dissct(struct nfsrv_descript *nd, int siz, int how)
{
mbuf_t mp2;
int siz2, xfer;
caddr_t p;
int left;
caddr_t retp;
retp = NULL;
left = NFSMTOD(nd->nd_md, caddr_t) + mbuf_len(nd->nd_md) - nd->nd_dpos;
while (left == 0) {
nd->nd_md = mbuf_next(nd->nd_md);
if (nd->nd_md == NULL)
return (retp);
left = mbuf_len(nd->nd_md);
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
}
if (left >= siz) {
retp = nd->nd_dpos;
nd->nd_dpos += siz;
} else if (mbuf_next(nd->nd_md) == NULL) {
return (retp);
} else if (siz > ncl_mbuf_mhlen) {
panic("nfs S too big");
} else {
MGET(mp2, MT_DATA, how);
if (mp2 == NULL)
return (NULL);
mbuf_setnext(mp2, mbuf_next(nd->nd_md));
mbuf_setnext(nd->nd_md, mp2);
mbuf_setlen(nd->nd_md, mbuf_len(nd->nd_md) - left);
nd->nd_md = mp2;
retp = p = NFSMTOD(mp2, caddr_t);
NFSBCOPY(nd->nd_dpos, p, left); /* Copy what was left */
siz2 = siz - left;
p += left;
mp2 = mbuf_next(mp2);
/* Loop around copying up the siz2 bytes */
while (siz2 > 0) {
if (mp2 == NULL)
return (NULL);
xfer = (siz2 > mbuf_len(mp2)) ? mbuf_len(mp2) : siz2;
if (xfer > 0) {
NFSBCOPY(NFSMTOD(mp2, caddr_t), p, xfer);
NFSM_DATAP(mp2, xfer);
mbuf_setlen(mp2, mbuf_len(mp2) - xfer);
p += xfer;
siz2 -= xfer;
}
if (siz2 > 0)
mp2 = mbuf_next(mp2);
}
mbuf_setlen(nd->nd_md, siz);
nd->nd_md = mp2;
nd->nd_dpos = NFSMTOD(mp2, caddr_t);
}
return (retp);
}
/*
* Advance the position in the mbuf chain.
* If offs == 0, this is a no-op, but it is simpler to just return from
* here than check for offs > 0 for all calls to nfsm_advance.
* If left == -1, it should be calculated here.
*/
APPLESTATIC int
nfsm_advance(struct nfsrv_descript *nd, int offs, int left)
{
int error = 0;
if (offs == 0)
goto out;
/*
* A negative offs should be considered a serious problem.
*/
if (offs < 0)
panic("nfsrv_advance");
/*
* If left == -1, calculate it here.
*/
if (left == -1)
left = NFSMTOD(nd->nd_md, caddr_t) + mbuf_len(nd->nd_md) -
nd->nd_dpos;
/*
* Loop around, advancing over the mbuf data.
*/
while (offs > left) {
offs -= left;
nd->nd_md = mbuf_next(nd->nd_md);
if (nd->nd_md == NULL) {
error = EBADRPC;
goto out;
}
left = mbuf_len(nd->nd_md);
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
}
nd->nd_dpos += offs;
out:
NFSEXITCODE(error);
return (error);
}
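/*
 * Illustrative sketch, not part of the source above: the core of
 * nfsm_advance() is moving a (buffer, offset) cursor forward across a
 * singly linked chain of buffers.  The minimal standalone version below
 * uses a hypothetical struct chainbuf in place of an mbuf.
 */
#include <stddef.h>

struct chainbuf {
	struct chainbuf	*next;
	char		*data;
	size_t		 len;
};

struct cursor {
	struct chainbuf	*buf;
	size_t		 off;	/* offset into buf->data */
};

/* Advance by nbytes; returns 0 on success, -1 if the chain runs out. */
static int
cursor_advance(struct cursor *c, size_t nbytes)
{
	size_t left = c->buf->len - c->off;

	while (nbytes > left) {
		nbytes -= left;
		c->buf = c->buf->next;
		if (c->buf == NULL)
			return (-1);	/* ran off the end, like EBADRPC */
		c->off = 0;
		left = c->buf->len;
	}
	c->off += nbytes;
	return (0);
}

int
main(void)
{
	char a[] = "abcd", b[] = "efgh";
	struct chainbuf b2 = { NULL, b, 4 };
	struct chainbuf b1 = { &b2, a, 4 };
	struct cursor c = { &b1, 0 };

	/* Skipping 6 bytes lands at offset 2 of the second buffer. */
	return (cursor_advance(&c, 6) == 0 && c.buf == &b2 && c.off == 2 ?
	    0 : 1);
}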
/*
* Copy a string into mbuf(s).
* Return the number of bytes output, including XDR overheads.
*/
APPLESTATIC int
nfsm_strtom(struct nfsrv_descript *nd, const char *cp, int siz)
{
mbuf_t m2;
int xfer, left;
mbuf_t m1;
int rem, bytesize;
u_int32_t *tl;
char *cp2;
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(siz);
rem = NFSM_RNDUP(siz) - siz;
bytesize = NFSX_UNSIGNED + siz + rem;
m2 = nd->nd_mb;
cp2 = nd->nd_bpos;
left = M_TRAILINGSPACE(m2);
/*
* Loop around copying the string to mbuf(s).
*/
while (siz > 0) {
if (left == 0) {
if (siz > ncl_mbuf_mlen)
NFSMCLGET(m1, M_WAITOK);
else
NFSMGET(m1);
mbuf_setlen(m1, 0);
mbuf_setnext(m2, m1);
m2 = m1;
cp2 = NFSMTOD(m2, caddr_t);
left = M_TRAILINGSPACE(m2);
}
if (left >= siz)
xfer = siz;
else
xfer = left;
NFSBCOPY(cp, cp2, xfer);
cp += xfer;
mbuf_setlen(m2, mbuf_len(m2) + xfer);
siz -= xfer;
left -= xfer;
if (siz == 0 && rem) {
if (left < rem)
panic("nfsm_strtom");
NFSBZERO(cp2 + xfer, rem);
mbuf_setlen(m2, mbuf_len(m2) + rem);
}
}
nd->nd_mb = m2;
nd->nd_bpos = NFSMTOD(m2, caddr_t) + mbuf_len(m2);
return (bytesize);
}
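/*
 * Illustrative sketch, not part of the source above: nfsm_strtom() emits
 * the XDR opaque layout, a 4-byte big-endian length word, the data, and
 * zero padding up to the next 4-byte boundary (rem = NFSM_RNDUP(siz) - siz).
 * The standalone encoder below writes that layout into a flat buffer;
 * xdr_put_opaque() and XDR_RNDUP() are hypothetical names.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define	XDR_RNDUP(n)	(((n) + 3) & ~3u)

/* Returns the number of bytes written: length word + data + padding. */
static size_t
xdr_put_opaque(uint8_t *dst, const void *src, uint32_t len)
{
	uint32_t belen = htonl(len);
	size_t pad = XDR_RNDUP(len) - len;

	memcpy(dst, &belen, 4);			/* length word */
	memcpy(dst + 4, src, len);		/* payload */
	memset(dst + 4 + len, 0, pad);		/* zero fill to the boundary */
	return (4 + len + pad);
}

int
main(void)
{
	uint8_t buf[32];

	/* "abc" encodes as 4 (length) + 3 (data) + 1 (pad) = 8 bytes. */
	return (xdr_put_opaque(buf, "abc", 3) == 8 ? 0 : 1);
}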
/*
* Called once to initialize data structures...
*/
APPLESTATIC void
newnfs_init(void)
{
static int nfs_inited = 0;
if (nfs_inited)
return;
nfs_inited = 1;
newnfs_true = txdr_unsigned(TRUE);
newnfs_false = txdr_unsigned(FALSE);
newnfs_xdrneg1 = txdr_unsigned(-1);
nfscl_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
if (nfscl_ticks < 1)
nfscl_ticks = 1;
NFSSETBOOTTIME(nfsboottime);
/*
* Initialize reply list and start timer
*/
TAILQ_INIT(&nfsd_reqq);
NFS_TIMERINIT;
}
/*
* Put a file handle in an mbuf list.
* If the size argument == 0, just use the default size.
* set_true == 1 if there should be a newnfs_true prepended on the file handle.
* Return the number of bytes output, including XDR overhead.
*/
APPLESTATIC int
nfsm_fhtom(struct nfsrv_descript *nd, u_int8_t *fhp, int size, int set_true)
{
u_int32_t *tl;
u_int8_t *cp;
- int fullsiz, rem, bytesize = 0;
+ int fullsiz, bytesize = 0;
if (size == 0)
size = NFSX_MYFH;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
if (size > NFSX_V2FH)
panic("fh size > NFSX_V2FH for NFSv2");
NFSM_BUILD(cp, u_int8_t *, NFSX_V2FH);
NFSBCOPY(fhp, cp, size);
if (size < NFSX_V2FH)
NFSBZERO(cp + size, NFSX_V2FH - size);
bytesize = NFSX_V2FH;
break;
case ND_NFSV3:
case ND_NFSV4:
fullsiz = NFSM_RNDUP(size);
- rem = fullsiz - size;
if (set_true) {
bytesize = 2 * NFSX_UNSIGNED + fullsiz;
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
} else {
bytesize = NFSX_UNSIGNED + fullsiz;
}
(void) nfsm_strtom(nd, fhp, size);
break;
}
return (bytesize);
}
/*
* This function compares two net addresses by family and returns TRUE
* if they are the same host.
* If there is any doubt, return FALSE.
* The AF_INET family is handled as a special case so that address mbufs
* don't need to be saved to store "struct in_addr", which is only 4 bytes.
*/
APPLESTATIC int
nfsaddr_match(int family, union nethostaddr *haddr, NFSSOCKADDR_T nam)
{
struct sockaddr_in *inetaddr;
switch (family) {
case AF_INET:
inetaddr = NFSSOCKADDR(nam, struct sockaddr_in *);
if (inetaddr->sin_family == AF_INET &&
inetaddr->sin_addr.s_addr == haddr->had_inet.s_addr)
return (1);
break;
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *inetaddr6;
inetaddr6 = NFSSOCKADDR(nam, struct sockaddr_in6 *);
/* XXX - should test sin6_scope_id ? */
if (inetaddr6->sin6_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&inetaddr6->sin6_addr,
&haddr->had_inet6))
return (1);
}
break;
#endif
}
return (0);
}
/*
* Similar to the above, but takes two NFSSOCKADDR_T args.
*/
APPLESTATIC int
nfsaddr2_match(NFSSOCKADDR_T nam1, NFSSOCKADDR_T nam2)
{
struct sockaddr_in *addr1, *addr2;
struct sockaddr *inaddr;
inaddr = NFSSOCKADDR(nam1, struct sockaddr *);
switch (inaddr->sa_family) {
case AF_INET:
addr1 = NFSSOCKADDR(nam1, struct sockaddr_in *);
addr2 = NFSSOCKADDR(nam2, struct sockaddr_in *);
if (addr2->sin_family == AF_INET &&
addr1->sin_addr.s_addr == addr2->sin_addr.s_addr)
return (1);
break;
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *inet6addr1, *inet6addr2;
inet6addr1 = NFSSOCKADDR(nam1, struct sockaddr_in6 *);
inet6addr2 = NFSSOCKADDR(nam2, struct sockaddr_in6 *);
/* XXX - should test sin6_scope_id ? */
if (inet6addr2->sin6_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&inet6addr1->sin6_addr,
&inet6addr2->sin6_addr))
return (1);
}
break;
#endif
}
return (0);
}
/*
* Trim the stuff already dissected off the mbuf list.
*/
APPLESTATIC void
newnfs_trimleading(nd)
struct nfsrv_descript *nd;
{
mbuf_t m, n;
int offs;
/*
* First, free up leading mbufs.
*/
if (nd->nd_mrep != nd->nd_md) {
m = nd->nd_mrep;
while (mbuf_next(m) != nd->nd_md) {
if (mbuf_next(m) == NULL)
panic("nfsm trim leading");
m = mbuf_next(m);
}
mbuf_setnext(m, NULL);
mbuf_freem(nd->nd_mrep);
}
m = nd->nd_md;
/*
* Now, adjust this mbuf, based on nd_dpos.
*/
offs = nd->nd_dpos - NFSMTOD(m, caddr_t);
if (offs == mbuf_len(m)) {
n = m;
m = mbuf_next(m);
if (m == NULL)
panic("nfsm trim leading2");
mbuf_setnext(n, NULL);
mbuf_freem(n);
} else if (offs > 0) {
mbuf_setlen(m, mbuf_len(m) - offs);
NFSM_DATAP(m, offs);
} else if (offs < 0)
panic("nfsm trimleading offs");
nd->nd_mrep = m;
nd->nd_md = m;
nd->nd_dpos = NFSMTOD(m, caddr_t);
}
/*
* Trim trailing data off the mbuf list being built.
*/
APPLESTATIC void
newnfs_trimtrailing(nd, mb, bpos)
struct nfsrv_descript *nd;
mbuf_t mb;
caddr_t bpos;
{
if (mbuf_next(mb)) {
mbuf_freem(mbuf_next(mb));
mbuf_setnext(mb, NULL);
}
mbuf_setlen(mb, bpos - NFSMTOD(mb, caddr_t));
nd->nd_mb = mb;
nd->nd_bpos = bpos;
}
/*
* Dissect a file handle on the client.
*/
APPLESTATIC int
nfsm_getfh(struct nfsrv_descript *nd, struct nfsfh **nfhpp)
{
u_int32_t *tl;
struct nfsfh *nfhp;
int error, len;
*nfhpp = NULL;
if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if ((len = fxdr_unsigned(int, *tl)) <= 0 ||
len > NFSX_FHMAX) {
error = EBADRPC;
goto nfsmout;
}
} else
len = NFSX_V2FH;
MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) + len,
M_NFSFH, M_WAITOK);
error = nfsrv_mtostr(nd, nfhp->nfh_fh, len);
if (error) {
FREE((caddr_t)nfhp, M_NFSFH);
goto nfsmout;
}
nfhp->nfh_len = len;
*nfhpp = nfhp;
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Break down the nfsv4 acl.
* If the aclp == NULL or won't fit in an acl, just discard the acl info.
*/
APPLESTATIC int
nfsrv_dissectacl(struct nfsrv_descript *nd, NFSACL_T *aclp, int *aclerrp,
int *aclsizep, __unused NFSPROC_T *p)
{
u_int32_t *tl;
int i, aclsize;
int acecnt, error = 0, aceerr = 0, acesize;
*aclerrp = 0;
if (aclp)
aclp->acl_cnt = 0;
/*
* Parse out the ace entries and expect them to conform to
* what can be supported by R/W/X bits.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
aclsize = NFSX_UNSIGNED;
acecnt = fxdr_unsigned(int, *tl);
if (acecnt > ACL_MAX_ENTRIES)
aceerr = NFSERR_ATTRNOTSUPP;
if (nfsrv_useacl == 0)
aceerr = NFSERR_ATTRNOTSUPP;
for (i = 0; i < acecnt; i++) {
if (aclp && !aceerr)
error = nfsrv_dissectace(nd, &aclp->acl_entry[i],
&aceerr, &acesize, p);
else
error = nfsrv_skipace(nd, &acesize);
if (error)
goto nfsmout;
aclsize += acesize;
}
if (aclp && !aceerr)
aclp->acl_cnt = acecnt;
if (aceerr)
*aclerrp = aceerr;
if (aclsizep)
*aclsizep = aclsize;
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Skip over an NFSv4 ace entry. Just dissect the xdr and discard it.
*/
static int
nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep)
{
u_int32_t *tl;
int error, len = 0;
NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
len = fxdr_unsigned(int, *(tl + 3));
error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
nfsmout:
*acesizep = NFSM_RNDUP(len) + (4 * NFSX_UNSIGNED);
NFSEXITCODE2(error, nd);
return (error);
}
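/*
 * The four words dissected above are assumed to be the fixed portion of an
 * nfsace4 (type, flag, access mask and the length of the "who" string);
 * the opaque "who" data, rounded up to NFSM_RNDUP(len) bytes, is skipped.
 */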
/*
* Get attribute bits from an mbuf list.
* Returns EBADRPC for a parsing error, 0 otherwise.
* If retnotsupp is non-NULL, it is set to NFSERR_ATTRNOTSUPP when a
* bitmap word beyond the supported ones is non-zero.
*/
APPLESTATIC int
nfsrv_getattrbits(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp, int *cntp,
int *retnotsupp)
{
u_int32_t *tl;
int cnt, i, outcnt;
int error = 0;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
cnt = fxdr_unsigned(int, *tl);
if (cnt < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (cnt > NFSATTRBIT_MAXWORDS)
outcnt = NFSATTRBIT_MAXWORDS;
else
outcnt = cnt;
NFSZERO_ATTRBIT(attrbitp);
if (outcnt > 0) {
NFSM_DISSECT(tl, u_int32_t *, outcnt * NFSX_UNSIGNED);
for (i = 0; i < outcnt; i++)
attrbitp->bits[i] = fxdr_unsigned(u_int32_t, *tl++);
}
for (i = 0; i < (cnt - outcnt); i++) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (retnotsupp != NULL && *tl != 0)
*retnotsupp = NFSERR_ATTRNOTSUPP;
}
if (cntp)
*cntp = NFSX_UNSIGNED + (cnt * NFSX_UNSIGNED);
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
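/*
 * Sketch of the wire form consumed above: a word count followed by that
 * many 32 bit bitmap words. Only the first NFSATTRBIT_MAXWORDS words are
 * kept; any extra words are dissected and, if non-zero, reported through
 * *retnotsupp.
 */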
/*
* Get the attributes for V4.
* If the compare flag is true, test for any attribute changes,
* otherwise return the attribute values.
* These attributes cover fields in "struct vattr", "struct statfs",
* "struct nfsfsinfo", the file handle and the lease duration.
* When comparing, *retcmpp is left as 0 if all attributes are the same
* and set to a non-zero error (e.g. NFSERR_NOTSAME) otherwise.
* Returns EBADRPC if it can't be parsed, 0 otherwise.
*/
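/*
 * Sketch of the wire form walked below: an attribute bitmap (dissected by
 * nfsrv_getattrbits()), one word giving the byte length of the attribute
 * data, and then the attribute values packed in bit order.
 */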
APPLESTATIC int
nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
struct nfsvattr *nap, struct nfsfh **nfhpp, fhandle_t *fhp, int fhsize,
struct nfsv3_pathconf *pc, struct statfs *sbp, struct nfsstatfs *sfp,
struct nfsfsinfo *fsp, NFSACL_T *aclp, int compare, int *retcmpp,
u_int32_t *leasep, u_int32_t *rderrp, NFSPROC_T *p, struct ucred *cred)
{
u_int32_t *tl;
int i = 0, j, k, l = 0, m, bitpos, attrsum = 0;
int error, tfhsize, aceerr, attrsize, cnt, retnotsup;
u_char *cp, *cp2, namestr[NFSV4_SMALLSTR + 1];
nfsattrbit_t attrbits, retattrbits, checkattrbits;
struct nfsfh *tnfhp;
struct nfsreferral *refp;
u_quad_t tquad;
nfsquad_t tnfsquad;
struct timespec temptime;
uid_t uid;
gid_t gid;
u_int32_t freenum = 0, tuint;
u_int64_t uquad = 0, thyp, thyp2;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
#endif
CTASSERT(sizeof(ino_t) == sizeof(uint64_t));
if (compare) {
retnotsup = 0;
error = nfsrv_getattrbits(nd, &attrbits, NULL, &retnotsup);
} else {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
}
if (error)
goto nfsmout;
if (compare) {
*retcmpp = retnotsup;
} else {
/*
* Just set default values to some of the important ones.
*/
if (nap != NULL) {
nap->na_type = VREG;
nap->na_mode = 0;
nap->na_rdev = (NFSDEV_T)0;
nap->na_mtime.tv_sec = 0;
nap->na_mtime.tv_nsec = 0;
nap->na_gen = 0;
nap->na_flags = 0;
nap->na_blocksize = NFS_FABLKSIZE;
}
if (sbp != NULL) {
sbp->f_bsize = NFS_FABLKSIZE;
sbp->f_blocks = 0;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
}
if (fsp != NULL) {
fsp->fs_rtmax = 8192;
fsp->fs_rtpref = 8192;
fsp->fs_maxname = NFS_MAXNAMLEN;
fsp->fs_wtmax = 8192;
fsp->fs_wtpref = 8192;
fsp->fs_wtmult = NFS_FABLKSIZE;
fsp->fs_dtpref = 8192;
fsp->fs_maxfilesize = 0xffffffffffffffffull;
fsp->fs_timedelta.tv_sec = 0;
fsp->fs_timedelta.tv_nsec = 1;
fsp->fs_properties = (NFSV3_FSFLINK | NFSV3_FSFSYMLINK |
NFSV3_FSFHOMOGENEOUS | NFSV3_FSFCANSETTIME);
}
if (pc != NULL) {
pc->pc_linkmax = NFS_LINK_MAX;
pc->pc_namemax = NAME_MAX;
pc->pc_notrunc = 0;
pc->pc_chownrestricted = 0;
pc->pc_caseinsensitive = 0;
pc->pc_casepreserving = 1;
}
if (sfp != NULL) {
sfp->sf_ffiles = UINT64_MAX;
sfp->sf_tfiles = UINT64_MAX;
sfp->sf_afiles = UINT64_MAX;
sfp->sf_fbytes = UINT64_MAX;
sfp->sf_tbytes = UINT64_MAX;
sfp->sf_abytes = UINT64_MAX;
}
}
/*
* Loop around getting the attributes.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(&attrbits, bitpos))
switch (bitpos) {
case NFSATTRBIT_SUPPORTEDATTRS:
retnotsup = 0;
if (compare || nap == NULL)
error = nfsrv_getattrbits(nd, &retattrbits,
&cnt, &retnotsup);
else
error = nfsrv_getattrbits(nd, &nap->na_suppattr,
&cnt, &retnotsup);
if (error)
goto nfsmout;
if (compare && !(*retcmpp)) {
NFSSETSUPP_ATTRBIT(&checkattrbits);
/* Some filesystems do not support NFSv4 ACLs. */
if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0) {
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL);
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT);
}
if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits)
|| retnotsup)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += cnt;
break;
case NFSATTRBIT_TYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_type != nfsv34tov_type(*tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_type = nfsv34tov_type(*tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FHEXPIRETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (fxdr_unsigned(int, *tl) !=
NFSV4FHTYPE_PERSISTENT)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHANGE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_filerev != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_filerev = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_size != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_size = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_LINKSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFLINK) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFLINK;
else
fsp->fs_properties &= ~NFSV3_FSFLINK;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_SYMLINKSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFSYMLINK) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFSYMLINK;
else
fsp->fs_properties &= ~NFSV3_FSFSYMLINK;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (*tl != newnfs_false)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FSID:
NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
thyp = fxdr_hyper(tl);
tl += 2;
thyp2 = fxdr_hyper(tl);
if (compare) {
if (*retcmpp == 0) {
if (thyp != (u_int64_t)
vfs_statfs(vnode_mount(vp))->f_fsid.val[0] ||
thyp2 != (u_int64_t)
vfs_statfs(vnode_mount(vp))->f_fsid.val[1])
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_filesid[0] = thyp;
nap->na_filesid[1] = thyp2;
}
attrsum += (4 * NFSX_UNSIGNED);
break;
case NFSATTRBIT_UNIQUEHANDLES:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_LEASETIME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (fxdr_unsigned(int, *tl) != nfsrv_lease &&
!(*retcmpp))
*retcmpp = NFSERR_NOTSAME;
} else if (leasep != NULL) {
*leasep = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_RDATTRERROR:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp))
*retcmpp = NFSERR_INVAL;
} else if (rderrp != NULL) {
*rderrp = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_ACL:
if (compare) {
if (!(*retcmpp)) {
if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) {
NFSACL_T *naclp;
naclp = acl_alloc(M_WAITOK);
error = nfsrv_dissectacl(nd, naclp, &aceerr,
&cnt, p);
if (error) {
acl_free(naclp);
goto nfsmout;
}
if (aceerr || aclp == NULL ||
nfsrv_compareacl(aclp, naclp))
*retcmpp = NFSERR_NOTSAME;
acl_free(naclp);
} else {
error = nfsrv_dissectacl(nd, NULL, &aceerr,
&cnt, p);
*retcmpp = NFSERR_ATTRNOTSUPP;
}
}
} else {
if (vp != NULL && aclp != NULL)
error = nfsrv_dissectacl(nd, aclp, &aceerr,
&cnt, p);
else
error = nfsrv_dissectacl(nd, NULL, &aceerr,
&cnt, p);
if (error)
goto nfsmout;
}
attrsum += cnt;
break;
case NFSATTRBIT_ACLSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) {
if (fxdr_unsigned(u_int32_t, *tl) !=
NFSV4ACE_SUPTYPES)
*retcmpp = NFSERR_NOTSAME;
} else {
*retcmpp = NFSERR_ATTRNOTSUPP;
}
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CANSETTIME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFCANSETTIME) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFCANSETTIME;
else
fsp->fs_properties &= ~NFSV3_FSFCANSETTIME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEINSENSITIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_false)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_caseinsensitive =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEPRESERVING:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_casepreserving =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHOWNRESTRICTED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_chownrestricted =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FILEHANDLE:
error = nfsm_getfh(nd, &tnfhp);
if (error)
goto nfsmout;
tfhsize = tnfhp->nfh_len;
if (compare) {
if (!(*retcmpp) &&
!NFSRV_CMPFH(tnfhp->nfh_fh, tfhsize,
fhp, fhsize))
*retcmpp = NFSERR_NOTSAME;
FREE((caddr_t)tnfhp, M_NFSFH);
} else if (nfhpp != NULL) {
*nfhpp = tnfhp;
} else {
FREE((caddr_t)tnfhp, M_NFSFH);
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(tfhsize));
break;
case NFSATTRBIT_FILEID:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_fileid != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL)
nap->na_fileid = thyp;
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESAVAIL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_afiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_afiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESFREE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_ffiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_ffiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESTOTAL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_tfiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_tfiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FSLOCATIONS:
error = nfsrv_getrefstr(nd, &cp, &cp2, &l, &m);
if (error)
goto nfsmout;
attrsum += l;
if (compare && !(*retcmpp)) {
refp = nfsv4root_getreferral(vp, NULL, 0);
if (refp != NULL) {
if (cp == NULL || cp2 == NULL ||
strcmp(cp, "/") ||
strcmp(cp2, refp->nfr_srvlist))
*retcmpp = NFSERR_NOTSAME;
} else if (m == 0) {
*retcmpp = NFSERR_NOTSAME;
}
}
if (cp != NULL)
free(cp, M_NFSSTRING);
if (cp2 != NULL)
free(cp2, M_NFSSTRING);
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HOMOGENEOUS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties &
NFSV3_FSFHOMOGENEOUS) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFHOMOGENEOUS;
else
fsp->fs_properties &= ~NFSV3_FSFHOMOGENEOUS;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXFILESIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
tnfsquad.qval = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
tquad = NFSRV_MAXFILESIZE;
if (tquad != tnfsquad.qval)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_maxfilesize = tnfsquad.qval;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXLINK:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fxdr_unsigned(int, *tl) != NFS_LINK_MAX)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXNAME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_maxname !=
fxdr_unsigned(u_int32_t, *tl))
*retcmpp = NFSERR_NOTSAME;
}
} else {
tuint = fxdr_unsigned(u_int32_t, *tl);
/*
* Some Linux NFSv4 servers report this
* as 0 or 4 billion, so I'll set it to
* NFS_MAXNAMLEN. If a server actually creates
* a name longer than NFS_MAXNAMLEN, it will
* get an error back.
*/
if (tuint == 0 || tuint > NFS_MAXNAMLEN)
tuint = NFS_MAXNAMLEN;
if (fsp != NULL)
fsp->fs_maxname = tuint;
if (pc != NULL)
pc->pc_namemax = tuint;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXREAD:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_rtmax != fxdr_unsigned(u_int32_t,
*(tl + 1)) || *tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_rtmax = fxdr_unsigned(u_int32_t, *++tl);
fsp->fs_rtpref = fsp->fs_rtmax;
fsp->fs_dtpref = fsp->fs_rtpref;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXWRITE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_wtmax != fxdr_unsigned(u_int32_t,
*(tl + 1)) || *tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_wtmax = fxdr_unsigned(int, *++tl);
fsp->fs_wtpref = fsp->fs_wtmax;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_mode != nfstov_mode(*tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_mode = nfstov_mode(*tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NOTRUNC:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NUMLINKS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
tuint = fxdr_unsigned(u_int32_t, *tl);
if (compare) {
if (!(*retcmpp)) {
if ((u_int32_t)nap->na_nlink != tuint)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_nlink = tuint;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (compare) {
if (!(*retcmpp)) {
if (nfsv4_strtouid(nd, cp, j, &uid, p) ||
nap->na_uid != uid)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
if (nfsv4_strtouid(nd, cp, j, &uid, p))
nap->na_uid = nfsrv_defaultuid;
else
nap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (compare) {
if (!(*retcmpp)) {
if (nfsv4_strtogid(nd, cp, j, &gid, p) ||
nap->na_gid != gid)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
if (nfsv4_strtogid(nd, cp, j, &gid, p))
nap->na_gid = nfsrv_defaultgid;
else
nap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_QUOTAHARD:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = sbp->f_bfree;
else
freenum = sbp->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bhardlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTASOFT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = sbp->f_bfree;
else
freenum = sbp->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bsoftlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTAUSED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
freenum = 0;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = dqb.dqb_curblocks;
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_RAWDEV:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4SPECDATA);
j = fxdr_unsigned(int, *tl++);
k = fxdr_unsigned(int, *tl);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_rdev != NFSMAKEDEV(j, k))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_rdev = NFSMAKEDEV(j, k);
}
attrsum += NFSX_V4SPECDATA;
break;
case NFSATTRBIT_SPACEAVAIL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_abytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_abytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEFREE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_fbytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_fbytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACETOTAL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_tbytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_tbytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEUSED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if ((u_int64_t)nap->na_bytes != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_bytes = thyp;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_atime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_atime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
i = fxdr_unsigned(int, *tl);
if (i == NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
attrsum += NFSX_V4TIME;
}
if (compare && !(*retcmpp))
*retcmpp = NFSERR_INVAL;
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEDELTA:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (fsp != NULL) {
if (compare) {
if (!(*retcmpp)) {
if ((u_int32_t)fsp->fs_timedelta.tv_sec !=
fxdr_unsigned(u_int32_t, *(tl + 1)) ||
(u_int32_t)fsp->fs_timedelta.tv_nsec !=
(fxdr_unsigned(u_int32_t, *(tl + 2)) %
1000000000) ||
*tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else {
fxdr_nfsv4time(tl, &fsp->fs_timedelta);
}
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMETADATA:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_ctime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_ctime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFY:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_mtime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_mtime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
i = fxdr_unsigned(int, *tl);
if (i == NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
attrsum += NFSX_V4TIME;
}
if (compare && !(*retcmpp))
*retcmpp = NFSERR_INVAL;
break;
case NFSATTRBIT_MOUNTEDONFILEID:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if (!vp || !nfsrv_atroot(vp, &thyp2))
thyp2 = nap->na_fileid;
if (thyp2 != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL)
nap->na_mntonfileno = thyp;
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SUPPATTREXCLCREAT:
retnotsup = 0;
error = nfsrv_getattrbits(nd, &retattrbits,
&cnt, &retnotsup);
if (error)
goto nfsmout;
if (compare && !(*retcmpp)) {
NFSSETSUPP_ATTRBIT(&checkattrbits);
NFSCLRNOTSETABLE_ATTRBIT(&checkattrbits);
NFSCLRBIT_ATTRBIT(&checkattrbits,
NFSATTRBIT_TIMEACCESSSET);
if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits)
|| retnotsup)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += cnt;
break;
default:
printf("EEK! nfsv4_loadattr unknown attr=%d\n",
bitpos);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
/*
* and get out of the loop, since we can't parse
* the unknown attribute data.
*/
bitpos = NFSATTRBIT_MAX;
break;
}
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Implement sleep locks for newnfs. The nfslock_usecnt allows for a
* shared lock and the NFSXXX_LOCK flag permits an exclusive lock.
* The first argument is a pointer to an nfsv4lock structure.
* The second argument is 1 iff a blocking lock is wanted.
* If this argument is 0, the call waits until no thread either wants or
* holds an exclusive lock.
* It returns 1 if the lock was acquired, 0 otherwise.
* If several processes call this function concurrently wanting the exclusive
* lock, one will get the lock and the rest will return without getting the
* lock. (If the caller must have the lock, it simply calls this function in a
* loop until the function returns 1 to indicate the lock was acquired.)
* Any usecnt must be decremented by calling nfsv4_relref() before
* calling nfsv4_lock(). It was done this way, so nfsv4_lock() could
* be called in a loop.
* If isleptp is not NULL, it is set to indicate whether the call slept.
* If mp is not NULL, a forced dismount is checked for.
*/
APPLESTATIC int
nfsv4_lock(struct nfsv4lock *lp, int iwantlock, int *isleptp,
void *mutex, struct mount *mp)
{
if (isleptp)
*isleptp = 0;
/*
* If a lock is wanted, loop around until the lock is acquired by
* someone and then released. If I want the lock, try to acquire it.
* For a lock to be issued, no lock must be in force and the usecnt
* must be zero.
*/
if (iwantlock) {
if (!(lp->nfslock_lock & NFSV4LOCK_LOCK) &&
lp->nfslock_usecnt == 0) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
lp->nfslock_lock |= NFSV4LOCK_LOCK;
return (1);
}
lp->nfslock_lock |= NFSV4LOCK_LOCKWANTED;
}
while (lp->nfslock_lock & (NFSV4LOCK_LOCK | NFSV4LOCK_LOCKWANTED)) {
if (mp != NULL && NFSCL_FORCEDISM(mp)) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
return (0);
}
lp->nfslock_lock |= NFSV4LOCK_WANTED;
if (isleptp)
*isleptp = 1;
(void) nfsmsleep(&lp->nfslock_lock, mutex,
PZERO - 1, "nfsv4lck", NULL);
if (iwantlock && !(lp->nfslock_lock & NFSV4LOCK_LOCK) &&
lp->nfslock_usecnt == 0) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
lp->nfslock_lock |= NFSV4LOCK_LOCK;
return (1);
}
}
return (0);
}
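/*
 * A minimal usage sketch for the exclusive lock, following the rules in the
 * comment above (drop any reference with nfsv4_relref() first, then retry
 * until the function returns 1). The names lck, lck_mutex and igotlock are
 * placeholders, not taken from this file:
 *
 *	nfsv4_relref(&lck);
 *	do {
 *		igotlock = nfsv4_lock(&lck, 1, NULL, &lck_mutex, NULL);
 *	} while (!igotlock);
 */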
/*
* Release the lock acquired by nfsv4_lock().
* The second argument is set to 1 to indicate the nfslock_usecnt should be
* incremented, as well.
*/
APPLESTATIC void
nfsv4_unlock(struct nfsv4lock *lp, int incref)
{
lp->nfslock_lock &= ~NFSV4LOCK_LOCK;
if (incref)
lp->nfslock_usecnt++;
nfsv4_wanted(lp);
}
/*
* Release a reference cnt.
*/
APPLESTATIC void
nfsv4_relref(struct nfsv4lock *lp)
{
if (lp->nfslock_usecnt <= 0)
panic("nfsv4root ref cnt");
lp->nfslock_usecnt--;
if (lp->nfslock_usecnt == 0)
nfsv4_wanted(lp);
}
/*
* Get a reference cnt.
* This function will wait for any exclusive lock to be released, but will
* not wait for threads that want the exclusive lock. If priority needs
* to be given to threads that need the exclusive lock, a call to nfsv4_lock()
* with the 2nd argument == 0 should be done before calling nfsv4_getref().
* If the mp argument is not NULL, check for NFSCL_FORCEDISM() being set and
* return without getting a refcnt for that case.
*/
APPLESTATIC void
nfsv4_getref(struct nfsv4lock *lp, int *isleptp, void *mutex,
struct mount *mp)
{
if (isleptp)
*isleptp = 0;
/*
* Wait for any exclusive lock that is held to be released.
*/
while (lp->nfslock_lock & NFSV4LOCK_LOCK) {
if (mp != NULL && NFSCL_FORCEDISM(mp))
return;
lp->nfslock_lock |= NFSV4LOCK_WANTED;
if (isleptp)
*isleptp = 1;
(void) nfsmsleep(&lp->nfslock_lock, mutex,
PZERO - 1, "nfsv4gr", NULL);
}
if (mp != NULL && NFSCL_FORCEDISM(mp))
return;
lp->nfslock_usecnt++;
}
/*
* Get a reference as above, but return failure instead of sleeping if
* an exclusive lock is held.
*/
APPLESTATIC int
nfsv4_getref_nonblock(struct nfsv4lock *lp)
{
if ((lp->nfslock_lock & NFSV4LOCK_LOCK) != 0)
return (0);
lp->nfslock_usecnt++;
return (1);
}
/*
* Test for a lock. Return 1 if locked, 0 otherwise.
*/
APPLESTATIC int
nfsv4_testlock(struct nfsv4lock *lp)
{
if ((lp->nfslock_lock & NFSV4LOCK_LOCK) == 0 &&
lp->nfslock_usecnt == 0)
return (0);
return (1);
}
/*
* Wake up anyone sleeping, waiting for this lock.
*/
static void
nfsv4_wanted(struct nfsv4lock *lp)
{
if (lp->nfslock_lock & NFSV4LOCK_WANTED) {
lp->nfslock_lock &= ~NFSV4LOCK_WANTED;
wakeup((caddr_t)&lp->nfslock_lock);
}
}
/*
* Copy a string from an mbuf list into a character array.
* Return EBADRPC if there is an mbuf error,
* 0 otherwise.
*/
APPLESTATIC int
nfsrv_mtostr(struct nfsrv_descript *nd, char *str, int siz)
{
char *cp;
int xfer, len;
mbuf_t mp;
int rem, error = 0;
mp = nd->nd_md;
cp = nd->nd_dpos;
len = NFSMTOD(mp, caddr_t) + mbuf_len(mp) - cp;
rem = NFSM_RNDUP(siz) - siz;
while (siz > 0) {
if (len > siz)
xfer = siz;
else
xfer = len;
NFSBCOPY(cp, str, xfer);
str += xfer;
siz -= xfer;
if (siz > 0) {
mp = mbuf_next(mp);
if (mp == NULL) {
error = EBADRPC;
goto out;
}
cp = NFSMTOD(mp, caddr_t);
len = mbuf_len(mp);
} else {
cp += xfer;
len -= xfer;
}
}
*str = '\0';
nd->nd_dpos = cp;
nd->nd_md = mp;
if (rem > 0) {
if (len < rem)
error = nfsm_advance(nd, rem, len);
else
nd->nd_dpos += rem;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Fill in the attributes as marked by the bitmap (V4).
*/
APPLESTATIC int
nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror,
nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram,
int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int bitpos, retnum = 0;
u_int32_t *tl;
int siz, prefixnum, error;
u_char *cp, namestr[NFSV4_SMALLSTR];
nfsattrbit_t attrbits, retbits;
nfsattrbit_t *retbitp = &retbits;
u_int32_t freenum, *retnump;
u_int64_t uquad;
struct statfs *fs;
struct nfsfsinfo fsinf;
struct timespec temptime;
NFSACL_T *aclp, *naclp = NULL;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
#endif
/*
* First, set the bits that can be filled and get fsinfo.
*/
NFSSET_ATTRBIT(retbitp, attrbitp);
/*
* If both p and cred are NULL, it is a client side setattr call.
* If both p and cred are not NULL, it is a server side reply call.
* If p is not NULL and cred is NULL, it is a client side callback
* reply call.
*/
if (p == NULL && cred == NULL) {
NFSCLRNOTSETABLE_ATTRBIT(retbitp);
aclp = saclp;
} else {
NFSCLRNOTFILLABLE_ATTRBIT(retbitp);
naclp = acl_alloc(M_WAITOK);
aclp = naclp;
}
nfsvno_getfs(&fsinf, isdgram);
#ifndef APPLE
/*
* Get the VFS_STATFS(), since some attributes need them.
*/
fs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
if (NFSISSETSTATFS_ATTRBIT(retbitp)) {
error = VFS_STATFS(mp, fs);
if (error != 0) {
if (reterr) {
nd->nd_repstat = NFSERR_ACCES;
free(fs, M_STATFS);
return (0);
}
NFSCLRSTATFS_ATTRBIT(retbitp);
}
}
#endif
/*
* And the NFSv4 ACL...
*/
if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT) &&
(nfsrv_useacl == 0 || ((cred != NULL || p != NULL) &&
supports_nfsv4acls == 0))) {
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT);
}
if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACL)) {
if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL) &&
supports_nfsv4acls == 0)) {
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL);
} else if (naclp != NULL) {
if (NFSVOPLOCK(vp, LK_SHARED) == 0) {
error = VOP_ACCESSX(vp, VREAD_ACL, cred, p);
if (error == 0)
error = VOP_GETACL(vp, ACL_TYPE_NFS4,
naclp, cred, p);
NFSVOPUNLOCK(vp, 0);
} else
error = NFSERR_PERM;
if (error != 0) {
if (reterr) {
nd->nd_repstat = NFSERR_ACCES;
free(fs, M_STATFS);
return (0);
}
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL);
}
}
}
/*
* Put out the attribute bitmap for the ones being filled in
* and get the field for the number of attributes returned.
*/
prefixnum = nfsrv_putattrbit(nd, retbitp);
NFSM_BUILD(retnump, u_int32_t *, NFSX_UNSIGNED);
prefixnum += NFSX_UNSIGNED;
/*
* Now, loop around filling in the attributes for each bit set.
*/
for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (NFSISSET_ATTRBIT(retbitp, bitpos)) {
switch (bitpos) {
case NFSATTRBIT_SUPPORTEDATTRS:
NFSSETSUPP_ATTRBIT(&attrbits);
if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL)
&& supports_nfsv4acls == 0)) {
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT);
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL);
}
retnum += nfsrv_putattrbit(nd, &attrbits);
break;
case NFSATTRBIT_TYPE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = vtonfsv34_type(vap->va_type);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FHEXPIRETYPE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4FHTYPE_PERSISTENT);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHANGE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_filerev, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SIZE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_size, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_LINKSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_LINK)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_SYMLINKSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_SYMLINK)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FSID:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4FSID);
*tl++ = 0;
*tl++ = txdr_unsigned(mp->mnt_stat.f_fsid.val[0]);
*tl++ = 0;
*tl = txdr_unsigned(mp->mnt_stat.f_fsid.val[1]);
retnum += NFSX_V4FSID;
break;
case NFSATTRBIT_UNIQUEHANDLES:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_LEASETIME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(nfsrv_lease);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_RDATTRERROR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(rderror);
retnum += NFSX_UNSIGNED;
break;
/*
* Recommended Attributes. (Only the supported ones.)
*/
case NFSATTRBIT_ACL:
retnum += nfsrv_buildacl(nd, aclp, vnode_vtype(vp), p);
break;
case NFSATTRBIT_ACLSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4ACE_SUPTYPES);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CANSETTIME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_CANSETTIME)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEINSENSITIVE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEPRESERVING:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHOWNRESTRICTED:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FILEHANDLE:
retnum += nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0);
break;
case NFSATTRBIT_FILEID:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = vap->va_fileid;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESAVAIL:
/*
* Check quota and use min(quota, f_ffree).
*/
freenum = fs->f_ffree;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_isoftlimit-dqb.dqb_curinodes,
freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(freenum);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESFREE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fs->f_ffree);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESTOTAL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fs->f_files);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FSLOCATIONS:
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = 0;
retnum += 2 * NFSX_UNSIGNED;
break;
case NFSATTRBIT_HOMOGENEOUS:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXFILESIZE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = NFSRV_MAXFILESIZE;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXLINK:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFS_LINK_MAX);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXNAME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFS_MAXNAMLEN);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXREAD:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fsinf.fs_rtmax);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXWRITE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fsinf.fs_wtmax);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MODE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = vtonfsv34_mode(vap->va_mode);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NOTRUNC:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NUMLINKS:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(vap->va_nlink);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
cp = namestr;
nfsv4_uidtostr(vap->va_uid, &cp, &siz, p);
retnum += nfsm_strtom(nd, cp, siz);
if (cp != namestr)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_OWNERGROUP:
cp = namestr;
nfsv4_gidtostr(vap->va_gid, &cp, &siz, p);
retnum += nfsm_strtom(nd, cp, siz);
if (cp != namestr)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_QUOTAHARD:
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = fs->f_bfree;
else
freenum = fs->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bhardlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTASOFT:
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = fs->f_bfree;
else
freenum = fs->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bsoftlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTAUSED:
freenum = 0;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = dqb.dqb_curblocks;
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_RAWDEV:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SPECDATA);
*tl++ = txdr_unsigned(NFSMAJOR(vap->va_rdev));
*tl = txdr_unsigned(NFSMINOR(vap->va_rdev));
retnum += NFSX_V4SPECDATA;
break;
case NFSATTRBIT_SPACEAVAIL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
uquad = (u_int64_t)fs->f_bfree;
else
uquad = (u_int64_t)fs->f_bavail;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEFREE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)fs->f_bfree;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACETOTAL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)fs->f_blocks;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEUSED:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_bytes, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_TIMEACCESS:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_atime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEACCESSSET:
if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME);
*tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT);
txdr_nfsv4time(&vap->va_atime, tl);
retnum += NFSX_V4SETTIME;
} else {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER);
retnum += NFSX_UNSIGNED;
}
break;
case NFSATTRBIT_TIMEDELTA:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
temptime.tv_sec = 0;
temptime.tv_nsec = 1000000000 / hz;
txdr_nfsv4time(&temptime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMETADATA:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_ctime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFY:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_mtime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME);
*tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT);
txdr_nfsv4time(&vap->va_mtime, tl);
retnum += NFSX_V4SETTIME;
} else {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER);
retnum += NFSX_UNSIGNED;
}
break;
case NFSATTRBIT_MOUNTEDONFILEID:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
if (at_root != 0)
uquad = mounted_on_fileno;
else
uquad = vap->va_fileid;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SUPPATTREXCLCREAT:
NFSSETSUPP_ATTRBIT(&attrbits);
NFSCLRNOTSETABLE_ATTRBIT(&attrbits);
NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET);
retnum += nfsrv_putattrbit(nd, &attrbits);
break;
default:
printf("EEK! Bad V4 attribute bitpos=%d\n", bitpos);
}
}
}
if (naclp != NULL)
acl_free(naclp);
free(fs, M_STATFS);
*retnump = txdr_unsigned(retnum);
return (retnum + prefixnum);
}
/*
* Put the attribute bits onto an mbuf list.
* Return the number of bytes of output generated.
*/
APPLESTATIC int
nfsrv_putattrbit(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp)
{
u_int32_t *tl;
int cnt, i, bytesize;
for (cnt = NFSATTRBIT_MAXWORDS; cnt > 0; cnt--)
if (attrbitp->bits[cnt - 1])
break;
bytesize = (cnt + 1) * NFSX_UNSIGNED;
NFSM_BUILD(tl, u_int32_t *, bytesize);
*tl++ = txdr_unsigned(cnt);
for (i = 0; i < cnt; i++)
*tl++ = txdr_unsigned(attrbitp->bits[i]);
return (bytesize);
}
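/*
 * Example of the encoding above: if only bits[1] is non-zero, cnt ends up
 * as 2 and three words go out, the count (2), bits[0] and bits[1], so
 * bytesize is 3 * NFSX_UNSIGNED.
 */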
/*
* Convert a uid to a string.
* If the lookup fails, just output the digits.
* uid - the user id
* cpp - points to a buffer of size NFSV4_SMALLSTR
* (malloc a larger one, as required)
* retlenp - pointer to length to be returned
*/
APPLESTATIC void
nfsv4_uidtostr(uid_t uid, u_char **cpp, int *retlenp, NFSPROC_T *p)
{
int i;
struct nfsusrgrp *usrp;
u_char *cp = *cpp;
uid_t tmp;
int cnt, hasampersand, len = NFSV4_SMALLSTR, ret;
struct nfsrv_lughash *hp;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) {
/*
* Always map nfsrv_defaultuid to "nobody".
*/
if (uid == nfsrv_defaultuid) {
i = nfsrv_dnsnamelen + 7;
if (i > len) {
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY("nobody@", cp, 7);
cp += 7;
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
return;
}
hasampersand = 0;
hp = NFSUSERHASH(uid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_uid == uid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
/*
* If the name doesn't already have an '@'
* in it, append @domainname to it.
*/
for (i = 0; i < usrp->lug_namelen; i++) {
if (usrp->lug_name[i] == '@') {
hasampersand = 1;
break;
}
}
if (hasampersand)
i = usrp->lug_namelen;
else
i = usrp->lug_namelen +
nfsrv_dnsnamelen + 1;
if (i > len) {
mtx_unlock(&hp->mtx);
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen);
if (!hasampersand) {
cp += usrp->lug_namelen;
*cp++ = '@';
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
}
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0,
NULL, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
/*
* No match, just return a string of digits.
*/
tmp = uid;
i = 0;
while (tmp || i == 0) {
tmp /= 10;
i++;
}
len = (i > len) ? len : i;
*retlenp = len;
cp += (len - 1);
tmp = uid;
for (i = 0; i < len; i++) {
*cp-- = '0' + (tmp % 10);
tmp /= 10;
}
return;
}
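/*
 * Note on the result above: the buffer at *cpp receives the user name
 * (with "@domain" appended when the cached name has no '@' of its own),
 * "nobody@domain" or a plain string of decimal digits; the length comes
 * back through *retlenp and no NUL terminator is appended.
 */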
/*
* Get a credential for the uid with the server's group list.
* If none is found, just return the credential passed in.
*/
struct ucred *
nfsrv_getgrpscred(struct ucred *oldcred)
{
struct nfsusrgrp *usrp;
struct ucred *newcred;
int cnt, ret;
uid_t uid;
struct nfsrv_lughash *hp;
cnt = 0;
uid = oldcred->cr_uid;
tryagain:
if (nfsrv_dnsnamelen > 0) {
hp = NFSUSERHASH(uid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_uid == uid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
if (usrp->lug_cred != NULL) {
newcred = crhold(usrp->lug_cred);
crfree(oldcred);
} else
newcred = oldcred;
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return (newcred);
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0,
NULL, curthread);
if (ret == 0 && cnt < 2)
goto tryagain;
}
return (oldcred);
}
/*
* Convert a string to a uid.
* If no conversion is possible return NFSERR_BADOWNER, otherwise
* return 0.
* If this is called from a client side mount using AUTH_SYS and the
* string is made up entirely of digits, just convert the string to
* a number.
*/
APPLESTATIC int
nfsv4_strtouid(struct nfsrv_descript *nd, u_char *str, int len, uid_t *uidp,
NFSPROC_T *p)
{
int i;
char *cp, *endstr, *str0;
struct nfsusrgrp *usrp;
int cnt, ret;
int error = 0;
uid_t tuid;
struct nfsrv_lughash *hp, *hp2;
if (len == 0) {
error = NFSERR_BADOWNER;
goto out;
}
/* If a string of digits and an AUTH_SYS mount, just convert it. */
str0 = str;
tuid = (uid_t)strtoul(str0, &endstr, 10);
if ((endstr - str0) == len) {
/* A numeric string. */
if ((nd->nd_flag & ND_KERBV) == 0 &&
((nd->nd_flag & ND_NFSCL) != 0 ||
nfsd_enable_stringtouid != 0))
*uidp = tuid;
else
error = NFSERR_BADOWNER;
goto out;
}
/*
* Look for an '@'.
*/
cp = strchr(str0, '@');
if (cp != NULL)
i = (int)(cp++ - str0);
else
i = len;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0) {
/*
* If an '@' is found and the domain name matches, search for
* the name with dns stripped off.
* Mixed case alphabetics will match for the domain name, but
* all upper case will not.
*/
if (cnt == 0 && i < len && i > 0 &&
(len - 1 - i) == nfsrv_dnsnamelen &&
!nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) {
len -= (nfsrv_dnsnamelen + 1);
*(cp - 1) = '\0';
}
/*
* Check for the special case of "nobody".
*/
if (len == 6 && !NFSBCMP(str, "nobody", 6)) {
*uidp = nfsrv_defaultuid;
error = 0;
goto out;
}
hp = NFSUSERNAMEHASH(str, len);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) {
if (usrp->lug_namelen == len &&
!NFSBCMP(usrp->lug_name, str, len)) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
hp2 = NFSUSERHASH(usrp->lug_uid);
mtx_lock(&hp2->mtx);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp2->lughead, usrp,
lug_numhash);
*uidp = usrp->lug_uid;
mtx_unlock(&hp2->mtx);
mtx_unlock(&hp->mtx);
error = 0;
goto out;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUSER, (uid_t)0, (gid_t)0,
str, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
error = NFSERR_BADOWNER;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Convert a gid to a string.
* gid - the group id
* cpp - points to a buffer of size NFSV4_SMALLSTR
* (malloc a larger one, as required)
* retlenp - pointer to length to be returned
*/
APPLESTATIC void
nfsv4_gidtostr(gid_t gid, u_char **cpp, int *retlenp, NFSPROC_T *p)
{
int i;
struct nfsusrgrp *usrp;
u_char *cp = *cpp;
gid_t tmp;
int cnt, hasampersand, len = NFSV4_SMALLSTR, ret;
struct nfsrv_lughash *hp;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) {
/*
* Always map nfsrv_defaultgid to "nogroup".
*/
if (gid == nfsrv_defaultgid) {
i = nfsrv_dnsnamelen + 8;
if (i > len) {
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY("nogroup@", cp, 8);
cp += 8;
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
return;
}
hasampersand = 0;
hp = NFSGROUPHASH(gid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_gid == gid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
/*
* If the name doesn't already have an '@'
* in it, append @domainname to it.
*/
for (i = 0; i < usrp->lug_namelen; i++) {
if (usrp->lug_name[i] == '@') {
hasampersand = 1;
break;
}
}
if (hasampersand)
i = usrp->lug_namelen;
else
i = usrp->lug_namelen +
nfsrv_dnsnamelen + 1;
if (i > len) {
mtx_unlock(&hp->mtx);
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen);
if (!hasampersand) {
cp += usrp->lug_namelen;
*cp++ = '@';
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
}
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETGID, (uid_t)0, gid,
NULL, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
/*
* No match, just return a string of digits.
*/
tmp = gid;
i = 0;
while (tmp || i == 0) {
tmp /= 10;
i++;
}
len = (i > len) ? len : i;
*retlenp = len;
cp += (len - 1);
tmp = gid;
for (i = 0; i < len; i++) {
*cp-- = '0' + (tmp % 10);
tmp /= 10;
}
return;
}
/*
* Convert a string to a gid.
* If no conversion is possible return NFSERR_BADOWNER, otherwise
* return 0.
* If this is called from a client side mount using AUTH_SYS and the
* string is made up entirely of digits, just convert the string to
* a number.
*/
APPLESTATIC int
nfsv4_strtogid(struct nfsrv_descript *nd, u_char *str, int len, gid_t *gidp,
NFSPROC_T *p)
{
int i;
char *cp, *endstr, *str0;
struct nfsusrgrp *usrp;
int cnt, ret;
int error = 0;
gid_t tgid;
struct nfsrv_lughash *hp, *hp2;
if (len == 0) {
error = NFSERR_BADOWNER;
goto out;
}
/* If a string of digits and an AUTH_SYS mount, just convert it. */
str0 = str;
tgid = (gid_t)strtoul(str0, &endstr, 10);
if ((endstr - str0) == len) {
/* A numeric string. */
if ((nd->nd_flag & ND_KERBV) == 0 &&
((nd->nd_flag & ND_NFSCL) != 0 ||
nfsd_enable_stringtouid != 0))
*gidp = tgid;
else
error = NFSERR_BADOWNER;
goto out;
}
/*
* Look for an '@'.
*/
cp = strchr(str0, '@');
if (cp != NULL)
i = (int)(cp++ - str0);
else
i = len;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0) {
/*
* If an '@' is found and the dns name matches, search for the
* name with the dns stripped off.
*/
if (cnt == 0 && i < len && i > 0 &&
(len - 1 - i) == nfsrv_dnsnamelen &&
!nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) {
len -= (nfsrv_dnsnamelen + 1);
*(cp - 1) = '\0';
}
/*
* Check for the special case of "nogroup".
*/
if (len == 7 && !NFSBCMP(str, "nogroup", 7)) {
*gidp = nfsrv_defaultgid;
error = 0;
goto out;
}
hp = NFSGROUPNAMEHASH(str, len);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) {
if (usrp->lug_namelen == len &&
!NFSBCMP(usrp->lug_name, str, len)) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
hp2 = NFSGROUPHASH(usrp->lug_gid);
mtx_lock(&hp2->mtx);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp2->lughead, usrp,
lug_numhash);
*gidp = usrp->lug_gid;
mtx_unlock(&hp2->mtx);
mtx_unlock(&hp->mtx);
error = 0;
goto out;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETGROUP, (uid_t)0, (gid_t)0,
str, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
error = NFSERR_BADOWNER;
out:
NFSEXITCODE(error);
return (error);
}
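
The strtoul()/end-pointer test above is how an all-digits owner string is recognized before any cache or upcall work is done: the conversion consumed the whole string iff the end pointer advanced exactly len characters. A stand-alone illustration (is_numeric is a made-up helper, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Return 1 iff the first len characters of str are all decimal digits. */
static int
is_numeric(const char *str, int len, unsigned long *valp)
{
	char *endstr;

	*valp = strtoul(str, &endstr, 10);
	return ((endstr - str) == len);
}

int
main(void)
{
	unsigned long v;

	printf("%d\n", is_numeric("1001", 4, &v));			/* 1 */
	printf("%d\n", is_numeric("staff@example.org", 17, &v));	/* 0 */
	return (0);
}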
/*
* Cmp len chars, allowing mixed case in the first argument to match lower
* case in the second, but not if the first argument is all upper case.
* Return 0 for a match, 1 otherwise.
*/
static int
nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len)
{
int i;
u_char tmp;
int fndlower = 0;
for (i = 0; i < len; i++) {
if (*cp >= 'A' && *cp <= 'Z') {
tmp = *cp++ + ('a' - 'A');
} else {
tmp = *cp++;
if (tmp >= 'a' && tmp <= 'z')
fndlower = 1;
}
if (tmp != *cp2++)
return (1);
}
if (fndlower)
return (0);
else
return (1);
}
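
A self-contained copy of the comparison above shows why a mixed-case domain such as "Example.ORG" matches "example.org" while an all-upper-case candidate is rejected (cmpmixedcase here is an illustrative duplicate, not the kernel symbol):

#include <stdio.h>

/*
 * Compare len chars: upper case in s1 matches lower case in s2, but an
 * all-upper-case s1 is rejected.  Returns 0 on match, 1 otherwise.
 */
static int
cmpmixedcase(const unsigned char *s1, const unsigned char *s2, int len)
{
	int i, fndlower = 0;
	unsigned char tmp;

	for (i = 0; i < len; i++) {
		if (*s1 >= 'A' && *s1 <= 'Z') {
			tmp = *s1++ + ('a' - 'A');	/* fold to lower */
		} else {
			tmp = *s1++;
			if (tmp >= 'a' && tmp <= 'z')
				fndlower = 1;
		}
		if (tmp != *s2++)
			return (1);
	}
	return (fndlower ? 0 : 1);
}

int
main(void)
{
	printf("%d\n", cmpmixedcase((const unsigned char *)"Example.ORG",
	    (const unsigned char *)"example.org", 11));	/* 0: match */
	printf("%d\n", cmpmixedcase((const unsigned char *)"EXAMPLE.ORG",
	    (const unsigned char *)"example.org", 11));	/* 1: all upper */
	return (0);
}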
/*
* Set the port for the nfsuserd.
*/
APPLESTATIC int
nfsrv_nfsuserdport(struct sockaddr *sad, u_short port, NFSPROC_T *p)
{
struct nfssockreq *rp;
struct sockaddr_in *ad;
int error;
NFSLOCKNAMEID();
if (nfsrv_nfsuserd) {
NFSUNLOCKNAMEID();
error = EPERM;
NFSSOCKADDRFREE(sad);
goto out;
}
nfsrv_nfsuserd = 1;
NFSUNLOCKNAMEID();
/*
* Set up the socket record and connect.
*/
rp = &nfsrv_nfsuserdsock;
rp->nr_client = NULL;
rp->nr_cred = NULL;
rp->nr_lock = (NFSR_RESERVEDPORT | NFSR_LOCALHOST);
if (sad != NULL) {
/* Use the AF_LOCAL socket address passed in. */
rp->nr_sotype = SOCK_STREAM;
rp->nr_soproto = 0;
rp->nr_nam = sad;
} else {
/* Use the port# for a UDP socket (old nfsuserd). */
rp->nr_sotype = SOCK_DGRAM;
rp->nr_soproto = IPPROTO_UDP;
NFSSOCKADDRALLOC(rp->nr_nam);
NFSSOCKADDRSIZE(rp->nr_nam, sizeof (struct sockaddr_in));
ad = NFSSOCKADDR(rp->nr_nam, struct sockaddr_in *);
ad->sin_family = AF_INET;
ad->sin_addr.s_addr = htonl((u_int32_t)0x7f000001);
ad->sin_port = port;
}
rp->nr_prog = RPCPROG_NFSUSERD;
rp->nr_vers = RPCNFSUSERD_VERS;
error = newnfs_connect(NULL, rp, NFSPROCCRED(p), p, 0);
if (error) {
NFSSOCKADDRFREE(rp->nr_nam);
nfsrv_nfsuserd = 0;
}
out:
NFSEXITCODE(error);
return (error);
}
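
In the UDP fallback above the daemon is always reached on loopback; 0x7f000001 is 127.0.0.1 in host byte order, converted with htonl() before being stored. A hedged user-space sketch of filling in an equivalent sockaddr_in (the port 765 is only an example value):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
	struct sockaddr_in ad;
	char buf[INET_ADDRSTRLEN];

	memset(&ad, 0, sizeof(ad));
	ad.sin_family = AF_INET;
	ad.sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */
	ad.sin_port = htons(765);		/* example port only */
	printf("%s:%u\n", inet_ntop(AF_INET, &ad.sin_addr, buf, sizeof(buf)),
	    ntohs(ad.sin_port));
	return (0);
}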
/*
* Delete the nfsuserd port.
*/
APPLESTATIC void
nfsrv_nfsuserddelport(void)
{
NFSLOCKNAMEID();
if (nfsrv_nfsuserd == 0) {
NFSUNLOCKNAMEID();
return;
}
nfsrv_nfsuserd = 0;
NFSUNLOCKNAMEID();
newnfs_disconnect(&nfsrv_nfsuserdsock);
NFSSOCKADDRFREE(nfsrv_nfsuserdsock.nr_nam);
}
/*
* Do upcalls to the nfsuserd, for cache misses of the owner/ownergroup
* name<-->id cache.
* Returns 0 upon success, non-zero otherwise.
*/
static int
nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name, NFSPROC_T *p)
{
u_int32_t *tl;
struct nfsrv_descript *nd;
int len;
struct nfsrv_descript nfsd;
struct ucred *cred;
int error;
NFSLOCKNAMEID();
if (nfsrv_nfsuserd == 0) {
NFSUNLOCKNAMEID();
error = EPERM;
goto out;
}
NFSUNLOCKNAMEID();
nd = &nfsd;
cred = newnfs_getcred();
nd->nd_flag = ND_GSSINITREPLY;
nfsrvd_rephead(nd);
nd->nd_procnum = procnum;
if (procnum == RPCNFSUSERD_GETUID || procnum == RPCNFSUSERD_GETGID) {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (procnum == RPCNFSUSERD_GETUID)
*tl = txdr_unsigned(uid);
else
*tl = txdr_unsigned(gid);
} else {
len = strlen(name);
(void) nfsm_strtom(nd, name, len);
}
error = newnfs_request(nd, NULL, NULL, &nfsrv_nfsuserdsock, NULL, NULL,
cred, RPCPROG_NFSUSERD, RPCNFSUSERD_VERS, NULL, 0, NULL, NULL);
NFSFREECRED(cred);
if (!error) {
mbuf_freem(nd->nd_mrep);
error = nd->nd_repstat;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function is called from the nfssvc(2) system call, to update the
* kernel user/group name list(s) for the V4 owner and ownergroup attributes.
*/
APPLESTATIC int
nfssvc_idname(struct nfsd_idargs *nidp)
{
struct nfsusrgrp *nusrp, *usrp, *newusrp;
struct nfsrv_lughash *hp_name, *hp_idnum, *thp;
int i, group_locked, groupname_locked, user_locked, username_locked;
int error = 0;
u_char *cp;
gid_t *grps;
struct ucred *cr;
static int onethread = 0;
static time_t lasttime = 0;
if (nidp->nid_namelen <= 0 || nidp->nid_namelen > MAXHOSTNAMELEN) {
error = EINVAL;
goto out;
}
if (nidp->nid_flag & NFSID_INITIALIZE) {
cp = malloc(nidp->nid_namelen + 1, M_NFSSTRING, M_WAITOK);
error = copyin(CAST_USER_ADDR_T(nidp->nid_name), cp,
nidp->nid_namelen);
if (error != 0) {
free(cp, M_NFSSTRING);
goto out;
}
if (atomic_cmpset_acq_int(&nfsrv_dnsnamelen, 0, 0) == 0) {
/*
* Free up all the old stuff and reinitialize hash
* lists. All mutexes for both lists must be locked,
* with the user/group name ones before the uid/gid
* ones, to avoid a LOR.
*/
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsuserhash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_FOREACH_SAFE(usrp,
&nfsuserhash[i].lughead, lug_numhash, nusrp)
nfsrv_removeuser(usrp, 1);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsuserhash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsusernamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgrouphash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_FOREACH_SAFE(usrp,
&nfsgrouphash[i].lughead, lug_numhash,
nusrp)
nfsrv_removeuser(usrp, 0);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgrouphash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgroupnamehash[i].mtx);
free(nfsrv_dnsname, M_NFSSTRING);
nfsrv_dnsname = NULL;
}
if (nfsuserhash == NULL) {
/* Allocate the hash tables. */
nfsuserhash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsuserhash[i].mtx, "nfsuidhash",
NULL, MTX_DEF | MTX_DUPOK);
nfsusernamehash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsusernamehash[i].mtx,
"nfsusrhash", NULL, MTX_DEF |
MTX_DUPOK);
nfsgrouphash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsgrouphash[i].mtx, "nfsgidhash",
NULL, MTX_DEF | MTX_DUPOK);
nfsgroupnamehash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsgroupnamehash[i].mtx,
"nfsgrphash", NULL, MTX_DEF | MTX_DUPOK);
}
/* (Re)initialize the list heads. */
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsuserhash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsusernamehash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsgrouphash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsgroupnamehash[i].lughead);
/*
* Put name in "DNS" string.
*/
nfsrv_dnsname = cp;
nfsrv_defaultuid = nidp->nid_uid;
nfsrv_defaultgid = nidp->nid_gid;
nfsrv_usercnt = 0;
nfsrv_usermax = nidp->nid_usermax;
atomic_store_rel_int(&nfsrv_dnsnamelen, nidp->nid_namelen);
goto out;
}
/*
* malloc the new one now, so any potential sleep occurs before
* manipulation of the lists.
*/
newusrp = malloc(sizeof(struct nfsusrgrp) + nidp->nid_namelen,
M_NFSUSERGROUP, M_WAITOK | M_ZERO);
error = copyin(CAST_USER_ADDR_T(nidp->nid_name), newusrp->lug_name,
nidp->nid_namelen);
if (error == 0 && nidp->nid_ngroup > 0 &&
(nidp->nid_flag & NFSID_ADDUID) != 0) {
grps = malloc(sizeof(gid_t) * nidp->nid_ngroup, M_TEMP,
M_WAITOK);
error = copyin(CAST_USER_ADDR_T(nidp->nid_grps), grps,
sizeof(gid_t) * nidp->nid_ngroup);
if (error == 0) {
/*
* Create a credential just like svc_getcred(),
* but using the group list provided.
*/
cr = crget();
cr->cr_uid = cr->cr_ruid = cr->cr_svuid = nidp->nid_uid;
crsetgroups(cr, nidp->nid_ngroup, grps);
cr->cr_rgid = cr->cr_svgid = cr->cr_groups[0];
cr->cr_prison = &prison0;
prison_hold(cr->cr_prison);
#ifdef MAC
mac_cred_associate_nfsd(cr);
#endif
newusrp->lug_cred = cr;
}
free(grps, M_TEMP);
}
if (error) {
free(newusrp, M_NFSUSERGROUP);
goto out;
}
newusrp->lug_namelen = nidp->nid_namelen;
/*
* The lock order is username[0]->[nfsrv_lughashsize - 1] followed
* by uid[0]->[nfsrv_lughashsize - 1], with the same for group.
* The flags user_locked, username_locked, group_locked and
* groupname_locked are set to indicate all of those hash lists are
* locked. hp_name != NULL and hp_idnum != NULL indicate that
* the mutex for that single hash chain is locked.
*/
user_locked = username_locked = group_locked = groupname_locked = 0;
hp_name = hp_idnum = NULL;
/*
* Delete old entries, as required.
*/
if (nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID)) {
/* Must lock all username hash lists first, to avoid a LOR. */
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
username_locked = 1;
hp_idnum = NFSUSERHASH(nidp->nid_uid);
mtx_lock(&hp_idnum->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash,
nusrp) {
if (usrp->lug_uid == nidp->nid_uid)
nfsrv_removeuser(usrp, 1);
}
} else if (nidp->nid_flag & (NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) {
hp_name = NFSUSERNAMEHASH(newusrp->lug_name,
newusrp->lug_namelen);
mtx_lock(&hp_name->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash,
nusrp) {
if (usrp->lug_namelen == newusrp->lug_namelen &&
!NFSBCMP(usrp->lug_name, newusrp->lug_name,
usrp->lug_namelen)) {
thp = NFSUSERHASH(usrp->lug_uid);
mtx_lock(&thp->mtx);
nfsrv_removeuser(usrp, 1);
mtx_unlock(&thp->mtx);
}
}
hp_idnum = NFSUSERHASH(nidp->nid_uid);
mtx_lock(&hp_idnum->mtx);
} else if (nidp->nid_flag & (NFSID_DELGID | NFSID_ADDGID)) {
/* Must lock all groupname hash lists first, to avoid a LOR. */
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
groupname_locked = 1;
hp_idnum = NFSGROUPHASH(nidp->nid_gid);
mtx_lock(&hp_idnum->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash,
nusrp) {
if (usrp->lug_gid == nidp->nid_gid)
nfsrv_removeuser(usrp, 0);
}
} else if (nidp->nid_flag & (NFSID_DELGROUPNAME | NFSID_ADDGROUPNAME)) {
hp_name = NFSGROUPNAMEHASH(newusrp->lug_name,
newusrp->lug_namelen);
mtx_lock(&hp_name->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash,
nusrp) {
if (usrp->lug_namelen == newusrp->lug_namelen &&
!NFSBCMP(usrp->lug_name, newusrp->lug_name,
usrp->lug_namelen)) {
thp = NFSGROUPHASH(usrp->lug_gid);
mtx_lock(&thp->mtx);
nfsrv_removeuser(usrp, 0);
mtx_unlock(&thp->mtx);
}
}
hp_idnum = NFSGROUPHASH(nidp->nid_gid);
mtx_lock(&hp_idnum->mtx);
}
/*
* Now, we can add the new one.
*/
if (nidp->nid_usertimeout)
newusrp->lug_expiry = NFSD_MONOSEC + nidp->nid_usertimeout;
else
newusrp->lug_expiry = NFSD_MONOSEC + 5;
if (nidp->nid_flag & (NFSID_ADDUID | NFSID_ADDUSERNAME)) {
newusrp->lug_uid = nidp->nid_uid;
thp = NFSUSERHASH(newusrp->lug_uid);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash);
thp = NFSUSERNAMEHASH(newusrp->lug_name, newusrp->lug_namelen);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash);
atomic_add_int(&nfsrv_usercnt, 1);
} else if (nidp->nid_flag & (NFSID_ADDGID | NFSID_ADDGROUPNAME)) {
newusrp->lug_gid = nidp->nid_gid;
thp = NFSGROUPHASH(newusrp->lug_gid);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash);
thp = NFSGROUPNAMEHASH(newusrp->lug_name, newusrp->lug_namelen);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash);
atomic_add_int(&nfsrv_usercnt, 1);
} else {
if (newusrp->lug_cred != NULL)
crfree(newusrp->lug_cred);
free(newusrp, M_NFSUSERGROUP);
}
/*
* Once per second, allow one thread to trim the cache.
*/
if (lasttime < NFSD_MONOSEC &&
atomic_cmpset_acq_int(&onethread, 0, 1) != 0) {
/*
* First, unlock the single mutexes, so that all entries
* can be locked and any LOR is avoided.
*/
if (hp_name != NULL) {
mtx_unlock(&hp_name->mtx);
hp_name = NULL;
}
if (hp_idnum != NULL) {
mtx_unlock(&hp_idnum->mtx);
hp_idnum = NULL;
}
if ((nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID |
NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) != 0) {
if (username_locked == 0) {
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
username_locked = 1;
}
KASSERT(user_locked == 0,
("nfssvc_idname: user_locked"));
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsuserhash[i].mtx);
user_locked = 1;
for (i = 0; i < nfsrv_lughashsize; i++) {
TAILQ_FOREACH_SAFE(usrp,
&nfsuserhash[i].lughead, lug_numhash,
nusrp)
if (usrp->lug_expiry < NFSD_MONOSEC)
nfsrv_removeuser(usrp, 1);
}
for (i = 0; i < nfsrv_lughashsize; i++) {
/*
* Trim the cache using an approximate LRU
* algorithm. This code deletes the least
* recently used entry on each hash list.
*/
if (nfsrv_usercnt <= nfsrv_usermax)
break;
usrp = TAILQ_FIRST(&nfsuserhash[i].lughead);
if (usrp != NULL)
nfsrv_removeuser(usrp, 1);
}
} else {
if (groupname_locked == 0) {
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
groupname_locked = 1;
}
KASSERT(group_locked == 0,
("nfssvc_idname: group_locked"));
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgrouphash[i].mtx);
group_locked = 1;
for (i = 0; i < nfsrv_lughashsize; i++) {
TAILQ_FOREACH_SAFE(usrp,
&nfsgrouphash[i].lughead, lug_numhash,
nusrp)
if (usrp->lug_expiry < NFSD_MONOSEC)
nfsrv_removeuser(usrp, 0);
}
for (i = 0; i < nfsrv_lughashsize; i++) {
/*
* Trim the cache using an approximate LRU
* algorithm. This code deletes the least
* recently used entry on each hash list.
*/
if (nfsrv_usercnt <= nfsrv_usermax)
break;
usrp = TAILQ_FIRST(&nfsgrouphash[i].lughead);
if (usrp != NULL)
nfsrv_removeuser(usrp, 0);
}
}
lasttime = NFSD_MONOSEC;
atomic_store_rel_int(&onethread, 0);
}
/* Now, unlock all locked mutexes. */
if (hp_idnum != NULL)
mtx_unlock(&hp_idnum->mtx);
if (hp_name != NULL)
mtx_unlock(&hp_name->mtx);
if (user_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsuserhash[i].mtx);
if (username_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsusernamehash[i].mtx);
if (group_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgrouphash[i].mtx);
if (groupname_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgroupnamehash[i].mtx);
out:
NFSEXITCODE(error);
return (error);
}
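
The trim passes above depend on a convention used throughout this cache: a lookup hit moves the entry to the tail of its TAILQ, so TAILQ_FIRST() is always the (approximately) least recently used entry and trimming simply pops the head. A small user-space sketch of that idiom with <sys/queue.h> (struct entry and its fields are invented for the demo):

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	int id;
};
TAILQ_HEAD(entryhead, entry);

int
main(void)
{
	struct entryhead head = TAILQ_HEAD_INITIALIZER(head);
	struct entry *e;
	int i;

	for (i = 0; i < 4; i++) {
		e = malloc(sizeof(*e));
		e->id = i;
		TAILQ_INSERT_TAIL(&head, e, link);
	}
	/* A lookup hit on entry 0 moves it to the tail... */
	e = TAILQ_FIRST(&head);
	TAILQ_REMOVE(&head, e, link);
	TAILQ_INSERT_TAIL(&head, e, link);
	/* ...so trimming the head now discards entry 1, the LRU. */
	e = TAILQ_FIRST(&head);
	printf("trimming entry %d\n", e->id);
	TAILQ_REMOVE(&head, e, link);
	free(e);
	while ((e = TAILQ_FIRST(&head)) != NULL) {	/* clean up the rest */
		TAILQ_REMOVE(&head, e, link);
		free(e);
	}
	return (0);
}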
/*
* Remove a user/group name element.
*/
static void
nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser)
{
struct nfsrv_lughash *hp;
if (isuser != 0) {
hp = NFSUSERHASH(usrp->lug_uid);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp = NFSUSERNAMEHASH(usrp->lug_name, usrp->lug_namelen);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash);
} else {
hp = NFSGROUPHASH(usrp->lug_gid);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp = NFSGROUPNAMEHASH(usrp->lug_name, usrp->lug_namelen);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash);
}
atomic_add_int(&nfsrv_usercnt, -1);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
/*
* Free up all the allocations related to the name<-->id cache.
* This function should only be called when the nfsuserd daemon isn't
* running, since it doesn't do any locking.
* This function is meant to be used when the nfscommon module is unloaded.
*/
APPLESTATIC void
nfsrv_cleanusergroup(void)
{
struct nfsrv_lughash *hp, *hp2;
struct nfsusrgrp *nusrp, *usrp;
int i;
if (nfsuserhash == NULL)
return;
for (i = 0; i < nfsrv_lughashsize; i++) {
hp = &nfsuserhash[i];
TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) {
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp2 = NFSUSERNAMEHASH(usrp->lug_name,
usrp->lug_namelen);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
hp = &nfsgrouphash[i];
TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) {
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp2 = NFSGROUPNAMEHASH(usrp->lug_name,
usrp->lug_namelen);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
mtx_destroy(&nfsuserhash[i].mtx);
mtx_destroy(&nfsusernamehash[i].mtx);
mtx_destroy(&nfsgroupnamehash[i].mtx);
mtx_destroy(&nfsgrouphash[i].mtx);
}
free(nfsuserhash, M_NFSUSERGROUP);
free(nfsusernamehash, M_NFSUSERGROUP);
free(nfsgrouphash, M_NFSUSERGROUP);
free(nfsgroupnamehash, M_NFSUSERGROUP);
free(nfsrv_dnsname, M_NFSSTRING);
}
/*
* This function scans a byte string and checks for UTF-8 compliance.
* It returns 0 if it conforms and NFSERR_INVAL if not.
*/
APPLESTATIC int
nfsrv_checkutf8(u_int8_t *cp, int len)
{
u_int32_t val = 0x0;
int cnt = 0, gotd = 0, shift = 0;
u_int8_t byte;
static int utf8_shift[5] = { 7, 11, 16, 21, 26 };
int error = 0;
/*
* Here are what the variables are used for:
* val - the calculated value of a multibyte char, used to check
* that it was coded with the correct range
* cnt - the number of 10xxxxxx bytes to follow
* gotd - set for a char of Dxxx, so D800<->DFFF can be checked for
* shift - lower order bits of range (ie. "val >> shift" should
* not be 0, in other words, dividing by the lower bound
* of the range should get a non-zero value)
* byte - used to calculate cnt
*/
while (len > 0) {
if (cnt > 0) {
/* This handles the 10xxxxxx bytes */
if ((*cp & 0xc0) != 0x80 ||
(gotd && (*cp & 0x20))) {
error = NFSERR_INVAL;
goto out;
}
gotd = 0;
val <<= 6;
val |= (*cp & 0x3f);
cnt--;
if (cnt == 0 && (val >> shift) == 0x0) {
error = NFSERR_INVAL;
goto out;
}
} else if (*cp & 0x80) {
/* first byte of multi byte char */
byte = *cp;
while ((byte & 0x40) && cnt < 6) {
cnt++;
byte <<= 1;
}
if (cnt == 0 || cnt == 6) {
error = NFSERR_INVAL;
goto out;
}
val = (*cp & (0x3f >> cnt));
shift = utf8_shift[cnt - 1];
if (cnt == 2 && val == 0xd)
/* Check for the 0xd800-0xdfff case */
gotd = 1;
}
cp++;
len--;
}
if (cnt > 0)
error = NFSERR_INVAL;
out:
NFSEXITCODE(error);
return (error);
}
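
The leading-byte case above derives the number of expected 10xxxxxx continuation bytes by counting the high-order one bits that follow the first. A stand-alone helper showing just that step (utf8_contbytes is an invented name, not a routine from this file):

#include <stdio.h>

/*
 * For a UTF-8 leading byte, return the number of 10xxxxxx continuation
 * bytes expected (0 for ASCII, -1 for an invalid leading byte).
 */
static int
utf8_contbytes(unsigned char byte)
{
	int cnt = 0;

	if ((byte & 0x80) == 0)
		return (0);			/* plain ASCII */
	while ((byte & 0x40) && cnt < 6) {
		cnt++;
		byte <<= 1;
	}
	if (cnt == 0 || cnt == 6)
		return (-1);			/* bare continuation or too long */
	return (cnt);
}

int
main(void)
{
	printf("%d\n", utf8_contbytes(0x41));	/* 'A' -> 0 */
	printf("%d\n", utf8_contbytes(0xc3));	/* 2-byte lead -> 1 */
	printf("%d\n", utf8_contbytes(0xe2));	/* 3-byte lead -> 2 */
	printf("%d\n", utf8_contbytes(0x80));	/* bare continuation -> -1 */
	return (0);
}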
/*
* Parse the xdr for an NFSv4 FsLocations attribute. Return two malloc'd
* strings, one with the root path in it and the other with the list of
* locations. The list is in the same format as is found in nfr_refs.
* It is a "," separated list of entries, where each of them is of the
* form <server>:<rootpath>. For example
* "nfsv4-test:/sub2,nfsv4-test2:/user/mnt,nfsv4-test2:/user/mnt2"
* The nilp argument is set to 1 for the special case of a null fs_root
* and an empty server list.
* It returns NFSERR_BADXDR, if the xdr can't be parsed and returns the
* number of xdr bytes parsed in sump.
*/
static int
nfsrv_getrefstr(struct nfsrv_descript *nd, u_char **fsrootp, u_char **srvp,
int *sump, int *nilp)
{
u_int32_t *tl;
u_char *cp = NULL, *cp2 = NULL, *cp3, *str;
int i, j, len, stringlen, cnt, slen, siz, xdrsum, error = 0, nsrv;
struct list {
SLIST_ENTRY(list) next;
int len;
u_char host[1];
} *lsp, *nlsp;
SLIST_HEAD(, list) head;
*fsrootp = NULL;
*srvp = NULL;
*nilp = 0;
/*
* Get the fs_root path and check for the special case of null path
* and 0 length server list.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len < 0 || len > 10240) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (len == 0) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl != 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
*nilp = 1;
*sump = 2 * NFSX_UNSIGNED;
error = 0;
goto nfsmout;
}
cp = malloc(len + 1, M_NFSSTRING, M_WAITOK);
error = nfsrv_mtostr(nd, cp, len);
if (!error) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
cnt = fxdr_unsigned(int, *tl);
if (cnt <= 0)
error = NFSERR_BADXDR;
}
if (error)
goto nfsmout;
/*
* Now, loop through the location list and make up the srvlist.
*/
xdrsum = (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len);
cp2 = cp3 = malloc(1024, M_NFSSTRING, M_WAITOK);
slen = 1024;
siz = 0;
for (i = 0; i < cnt; i++) {
SLIST_INIT(&head);
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nsrv = fxdr_unsigned(int, *tl);
if (nsrv <= 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
/*
* Handle the first server by putting it in the srvstr.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
nfsrv_refstrbigenough(siz + len + 3, &cp2, &cp3, &slen);
if (cp3 != cp2) {
*cp3++ = ',';
siz++;
}
error = nfsrv_mtostr(nd, cp3, len);
if (error)
goto nfsmout;
cp3 += len;
*cp3++ = ':';
siz += (len + 1);
xdrsum += (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len);
for (j = 1; j < nsrv; j++) {
/*
* Yuck, put them in an slist and process them later.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
lsp = (struct list *)malloc(sizeof (struct list)
+ len, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, lsp->host, len);
if (error)
goto nfsmout;
xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len);
lsp->len = len;
SLIST_INSERT_HEAD(&head, lsp, next);
}
/*
* Finally, we can get the path.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
nfsrv_refstrbigenough(siz + len + 1, &cp2, &cp3, &slen);
error = nfsrv_mtostr(nd, cp3, len);
if (error)
goto nfsmout;
xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len);
str = cp3;
stringlen = len;
cp3 += len;
siz += len;
SLIST_FOREACH_SAFE(lsp, &head, next, nlsp) {
nfsrv_refstrbigenough(siz + lsp->len + stringlen + 3,
&cp2, &cp3, &slen);
*cp3++ = ',';
NFSBCOPY(lsp->host, cp3, lsp->len);
cp3 += lsp->len;
*cp3++ = ':';
NFSBCOPY(str, cp3, stringlen);
cp3 += stringlen;
*cp3 = '\0';
siz += (lsp->len + stringlen + 2);
free((caddr_t)lsp, M_TEMP);
}
}
*fsrootp = cp;
*srvp = cp2;
*sump = xdrsum;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (cp != NULL)
free(cp, M_NFSSTRING);
if (cp2 != NULL)
free(cp2, M_NFSSTRING);
NFSEXITCODE2(error, nd);
return (error);
}
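
The srvlist string assembled above is the comma-separated "<server>:<rootpath>" list described in the comment before nfsrv_getrefstr(). A quick user-space sketch of walking such a string, purely to illustrate the format (none of this is code from this file):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char list[] = "nfsv4-test:/sub2,nfsv4-test2:/user/mnt";
	char *entry, *colon, *last;

	for (entry = strtok_r(list, ",", &last); entry != NULL;
	    entry = strtok_r(NULL, ",", &last)) {
		colon = strchr(entry, ':');
		if (colon == NULL)
			continue;		/* malformed entry */
		*colon = '\0';
		printf("server=%s rootpath=%s\n", entry, colon + 1);
	}
	return (0);
}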
/*
* Make the malloc'd space large enough. This is a pain, but the xdr
* doesn't set an upper bound on the size, so...
*/
static void
nfsrv_refstrbigenough(int siz, u_char **cpp, u_char **cpp2, int *slenp)
{
u_char *cp;
int i;
if (siz <= *slenp)
return;
cp = malloc(siz + 1024, M_NFSSTRING, M_WAITOK);
NFSBCOPY(*cpp, cp, *slenp);
free(*cpp, M_NFSSTRING);
i = *cpp2 - *cpp;
*cpp = cp;
*cpp2 = cp + i;
*slenp = siz + 1024;
}
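
The helper above must keep the caller's current write position valid across the reallocation, which it does by preserving the pointer's offset into the old buffer. A minimal user-space sketch of the same pattern (the 1024-byte slack mirrors the code above; everything else is invented for the demo):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
bigenough(int siz, char **cpp, char **cpp2, int *slenp)
{
	char *cp;
	int i;

	if (siz <= *slenp)
		return;
	cp = malloc(siz + 1024);
	memcpy(cp, *cpp, *slenp);
	i = *cpp2 - *cpp;		/* offset of the write position */
	free(*cpp);
	*cpp = cp;
	*cpp2 = cp + i;			/* rebase it into the new buffer */
	*slenp = siz + 1024;
}

int
main(void)
{
	int slen = 8;
	char *buf = malloc(slen), *pos;

	strcpy(buf, "abc");
	pos = buf + 3;			/* next write goes here */
	bigenough(100, &buf, &pos, &slen);
	strcpy(pos, "def");		/* still appends after "abc" */
	printf("%s (len %d)\n", buf, slen);
	free(buf);
	return (0);
}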
/*
* Initialize the reply header data structures.
*/
APPLESTATIC void
nfsrvd_rephead(struct nfsrv_descript *nd)
{
mbuf_t mreq;
/*
* If this is a big reply, use a cluster.
*/
if ((nd->nd_flag & ND_GSSINITREPLY) == 0 &&
nfs_bigreply[nd->nd_procnum]) {
NFSMCLGET(mreq, M_WAITOK);
nd->nd_mreq = mreq;
nd->nd_mb = mreq;
} else {
NFSMGET(mreq);
nd->nd_mreq = mreq;
nd->nd_mb = mreq;
}
nd->nd_bpos = NFSMTOD(mreq, caddr_t);
mbuf_setlen(mreq, 0);
if ((nd->nd_flag & ND_GSSINITREPLY) == 0)
NFSM_BUILD(nd->nd_errp, int *, NFSX_UNSIGNED);
}
/*
* Lock a socket against others.
* Currently used to serialize connect/disconnect attempts.
*/
int
newnfs_sndlock(int *flagp)
{
struct timespec ts;
NFSLOCKSOCK();
while (*flagp & NFSR_SNDLOCK) {
*flagp |= NFSR_WANTSND;
ts.tv_sec = 0;
ts.tv_nsec = 0;
(void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR,
PZERO - 1, "nfsndlck", &ts);
}
*flagp |= NFSR_SNDLOCK;
NFSUNLOCKSOCK();
return (0);
}
/*
* Unlock the stream socket for others.
*/
void
newnfs_sndunlock(int *flagp)
{
NFSLOCKSOCK();
if ((*flagp & NFSR_SNDLOCK) == 0)
panic("nfs sndunlock");
*flagp &= ~NFSR_SNDLOCK;
if (*flagp & NFSR_WANTSND) {
*flagp &= ~NFSR_WANTSND;
wakeup((caddr_t)flagp);
}
NFSUNLOCKSOCK();
}
APPLESTATIC int
nfsv4_getipaddr(struct nfsrv_descript *nd, struct sockaddr_in *sin,
struct sockaddr_in6 *sin6, sa_family_t *saf, int *isudp)
{
struct in_addr saddr;
uint32_t portnum, *tl;
int i, j, k;
sa_family_t af = AF_UNSPEC;
char addr[64], protocol[5], *cp;
int cantparse = 0, error = 0;
uint16_t portv;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i >= 3 && i <= 4) {
error = nfsrv_mtostr(nd, protocol, i);
if (error)
goto nfsmout;
if (strcmp(protocol, "tcp") == 0) {
af = AF_INET;
*isudp = 0;
} else if (strcmp(protocol, "udp") == 0) {
af = AF_INET;
*isudp = 1;
} else if (strcmp(protocol, "tcp6") == 0) {
af = AF_INET6;
*isudp = 0;
} else if (strcmp(protocol, "udp6") == 0) {
af = AF_INET6;
*isudp = 1;
} else
cantparse = 1;
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
} else if (cantparse == 0 && i >= 11 && i < 64) {
/*
* The shortest address is 11 chars and the longest is < 64.
*/
error = nfsrv_mtostr(nd, addr, i);
if (error)
goto nfsmout;
/* Find the port# at the end and extract that. */
i = strlen(addr);
k = 0;
cp = &addr[i - 1];
/* Count back two '.'s from end to get port# field. */
for (j = 0; j < i; j++) {
if (*cp == '.') {
k++;
if (k == 2)
break;
}
cp--;
}
if (k == 2) {
/*
* The NFSv4 port# is appended as .N.N, where N is
* a decimal # in the range 0-255, just like an inet4
* address. Cheat and use inet_aton(), which will
* return a Class A address and then shift the high
* order 8 bits over to convert it to the port#.
*/
*cp++ = '\0';
if (inet_aton(cp, &saddr) == 1) {
portnum = ntohl(saddr.s_addr);
portv = (uint16_t)((portnum >> 16) |
(portnum & 0xff));
} else
cantparse = 1;
} else
cantparse = 1;
if (cantparse == 0) {
if (af == AF_INET) {
if (inet_pton(af, addr, &sin->sin_addr) == 1) {
sin->sin_len = sizeof(*sin);
sin->sin_family = AF_INET;
sin->sin_port = htons(portv);
*saf = af;
return (0);
}
} else {
if (inet_pton(af, addr, &sin6->sin6_addr)
== 1) {
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = htons(portv);
*saf = af;
return (0);
}
}
}
} else {
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
error = EPERM;
nfsmout:
return (error);
}
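
The trailing ".p1.p2" of an NFSv4 universal address encodes the port as p1 * 256 + p2; the inet_aton() shuffle above computes exactly that value. A small user-space check of the arithmetic (the address below is a documentation example, not anything from this file):

#include <stdio.h>

int
main(void)
{
	const char *uaddr = "192.0.2.7.8.1";	/* 8 * 256 + 1 = port 2049 */
	unsigned int a, b, c, d, p1, p2;

	if (sscanf(uaddr, "%u.%u.%u.%u.%u.%u", &a, &b, &c, &d, &p1, &p2) == 6)
		printf("addr=%u.%u.%u.%u port=%u\n", a, b, c, d,
		    p1 * 256 + p2);
	return (0);
}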
/*
* Handle an NFSv4.1 Sequence request for the session.
* If reply != NULL, use it to return the cached reply, as required.
* The client gets a cached reply via this call for callbacks, whereas the
* server gets a cached reply via the nfsv4_seqsess_cacherep() call.
*/
int
nfsv4_seqsession(uint32_t seqid, uint32_t slotid, uint32_t highslot,
struct nfsslot *slots, struct mbuf **reply, uint16_t maxslot)
{
int error;
error = 0;
if (reply != NULL)
*reply = NULL;
if (slotid > maxslot)
return (NFSERR_BADSLOT);
if (seqid == slots[slotid].nfssl_seq) {
/* A retry. */
if (slots[slotid].nfssl_inprog != 0)
error = NFSERR_DELAY;
else if (slots[slotid].nfssl_reply != NULL) {
if (reply != NULL) {
*reply = slots[slotid].nfssl_reply;
slots[slotid].nfssl_reply = NULL;
}
slots[slotid].nfssl_inprog = 1;
error = NFSERR_REPLYFROMCACHE;
} else
/* No reply cached, so just do it. */
slots[slotid].nfssl_inprog = 1;
} else if ((slots[slotid].nfssl_seq + 1) == seqid) {
if (slots[slotid].nfssl_reply != NULL)
m_freem(slots[slotid].nfssl_reply);
slots[slotid].nfssl_reply = NULL;
slots[slotid].nfssl_inprog = 1;
slots[slotid].nfssl_seq++;
} else
error = NFSERR_SEQMISORDERED;
return (error);
}
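
The slot logic above reduces to three cases on the incoming sequence number relative to the one stored in the slot: equal means a retry served from the reply cache, one greater means a fresh request, anything else is misordered. A compact stand-alone restatement (the enum and function names are invented for the sketch):

#include <stdio.h>
#include <stdint.h>

enum seqres { NEW_REQUEST, RETRY, MISORDERED };

static enum seqres
seq_check(uint32_t slot_seq, uint32_t seqid)
{
	if (seqid == slot_seq)
		return (RETRY);		/* same seq#: replay, use cached reply */
	if (seqid == slot_seq + 1)
		return (NEW_REQUEST);	/* next seq#: execute and cache */
	return (MISORDERED);		/* anything else: NFSERR_SEQMISORDERED */
}

int
main(void)
{
	printf("%d %d %d\n", seq_check(7, 8), seq_check(7, 7), seq_check(7, 9));
	return (0);
}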
/*
* Cache this reply for the slot.
* Use the "rep" argument to return the cached reply if repstat is set to
* NFSERR_REPLYFROMCACHE. The client never sets repstat to this value.
*/
void
nfsv4_seqsess_cacherep(uint32_t slotid, struct nfsslot *slots, int repstat,
struct mbuf **rep)
{
if (repstat == NFSERR_REPLYFROMCACHE) {
*rep = slots[slotid].nfssl_reply;
slots[slotid].nfssl_reply = NULL;
} else {
if (slots[slotid].nfssl_reply != NULL)
m_freem(slots[slotid].nfssl_reply);
slots[slotid].nfssl_reply = *rep;
}
slots[slotid].nfssl_inprog = 0;
}
/*
* Generate the xdr for an NFSv4.1 Sequence Operation.
*/
APPLESTATIC void
nfsv4_setsequence(struct nfsmount *nmp, struct nfsrv_descript *nd,
struct nfsclsession *sep, int dont_replycache)
{
uint32_t *tl, slotseq = 0;
int error, maxslot, slotpos;
uint8_t sessionid[NFSX_V4SESSIONID];
error = nfsv4_sequencelookup(nmp, sep, &slotpos, &maxslot, &slotseq,
sessionid);
/* Build the Sequence arguments. */
NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED);
nd->nd_sequence = tl;
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
nd->nd_slotseq = tl;
if (error == 0) {
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl++ = txdr_unsigned(maxslot);
if (dont_replycache == 0)
*tl = newnfs_true;
else
*tl = newnfs_false;
} else {
/*
* There are two possible errors, and for both the rest of the
* Sequence arguments can just be zeros.
* NFSERR_BADSESSION: This bad session should just generate
* the same error again when the RPC is retried.
* ESTALE: A forced dismount is in progress and will cause the
* RPC to fail later.
*/
*tl++ = 0;
*tl++ = 0;
*tl++ = 0;
*tl = 0;
}
nd->nd_flag |= ND_HASSEQUENCE;
}
int
nfsv4_sequencelookup(struct nfsmount *nmp, struct nfsclsession *sep,
int *slotposp, int *maxslotp, uint32_t *slotseqp, uint8_t *sessionid)
{
int i, maxslot, slotpos;
uint64_t bitval;
/* Find an unused slot. */
slotpos = -1;
maxslot = -1;
mtx_lock(&sep->nfsess_mtx);
do {
if (nmp != NULL && sep->nfsess_defunct != 0) {
/* Just return the bad session. */
bcopy(sep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID);
mtx_unlock(&sep->nfsess_mtx);
return (NFSERR_BADSESSION);
}
bitval = 1;
for (i = 0; i < sep->nfsess_foreslots; i++) {
if ((bitval & sep->nfsess_slots) == 0) {
slotpos = i;
sep->nfsess_slots |= bitval;
sep->nfsess_slotseq[i]++;
*slotseqp = sep->nfsess_slotseq[i];
break;
}
bitval <<= 1;
}
if (slotpos == -1) {
/*
* If a forced dismount is in progress, just return.
* This RPC attempt will fail when it calls
* newnfs_request().
*/
if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
mtx_unlock(&sep->nfsess_mtx);
return (ESTALE);
}
/* Wake up once/sec, to check for a forced dismount. */
(void)mtx_sleep(&sep->nfsess_slots, &sep->nfsess_mtx,
PZERO, "nfsclseq", hz);
}
} while (slotpos == -1);
/* Now, find the highest slot in use. (nfsess_slots is 64 bits) */
bitval = 1;
for (i = 0; i < 64; i++) {
if ((bitval & sep->nfsess_slots) != 0)
maxslot = i;
bitval <<= 1;
}
bcopy(sep->nfsess_sessionid, sessionid, NFSX_V4SESSIONID);
mtx_unlock(&sep->nfsess_mtx);
*slotposp = slotpos;
*maxslotp = maxslot;
return (0);
}
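
Slot ownership is tracked as bits in a 64-bit word: the first clear bit among the fore-channel slots is claimed, and the highest set bit becomes the highest-in-use slot reported to the server. A lock-free user-space sketch of just that bookkeeping (alloc_slot is an invented name; the real routine also sleeps and handles forced dismounts):

#include <stdio.h>
#include <stdint.h>

static int
alloc_slot(uint64_t *slots, int foreslots, int *maxslotp)
{
	uint64_t bitval = 1;
	int i, slotpos = -1, maxslot = -1;

	for (i = 0; i < foreslots; i++) {
		if ((*slots & bitval) == 0) {
			slotpos = i;		/* claim the first free slot */
			*slots |= bitval;
			break;
		}
		bitval <<= 1;
	}
	bitval = 1;
	for (i = 0; i < 64; i++) {		/* highest slot in use */
		if ((*slots & bitval) != 0)
			maxslot = i;
		bitval <<= 1;
	}
	*maxslotp = maxslot;
	return (slotpos);			/* -1 means all slots busy */
}

int
main(void)
{
	uint64_t slots = 0x5;			/* slots 0 and 2 already busy */
	int maxslot, pos = alloc_slot(&slots, 16, &maxslot);

	printf("got slot %d, maxslot %d\n", pos, maxslot);	/* 1, 2 */
	return (0);
}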
/*
* Free a session slot.
*/
APPLESTATIC void
nfsv4_freeslot(struct nfsclsession *sep, int slot)
{
uint64_t bitval;
bitval = 1;
if (slot > 0)
bitval <<= slot;
mtx_lock(&sep->nfsess_mtx);
if ((bitval & sep->nfsess_slots) == 0)
printf("freeing free slot!!\n");
sep->nfsess_slots &= ~bitval;
wakeup(&sep->nfsess_slots);
mtx_unlock(&sep->nfsess_mtx);
}
Index: head/sys/fs/nfsclient/nfs_clstate.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clstate.c (revision 327172)
+++ head/sys/fs/nfsclient/nfs_clstate.c (revision 327173)
@@ -1,5358 +1,5353 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Rick Macklem, University of Guelph
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* These functions implement the client side state handling for NFSv4.
* NFSv4 state handling:
* - A lockowner is used to determine lock contention, so it
* corresponds directly to a Posix pid. (1 to 1 mapping)
* - The correct granularity of an OpenOwner is not nearly so
* obvious. An OpenOwner does the following:
* - provides a serial sequencing of Open/Close/Lock-with-new-lockowner
* - is used to check for Open/Share contention (not applicable to
* this client, since all Opens are Deny_None)
* As such, I considered both extremes.
* 1 OpenOwner per ClientID - Simple to manage, but fully serializes
* all Open, Close and Lock (with a new lockowner) Ops.
* 1 OpenOwner for each Open - This one results in an OpenConfirm for
* every Open, for most servers.
* So, I chose to use the same mapping as I did for LockOwners.
* The main concern here is that you can end up with multiple Opens
* for the same File Handle, but on different OpenOwners (opens
* inherited from parents, grandparents...) and you do not know
* which of these the vnodeop close applies to. This is handled by
* delaying the Close Op(s) until all of the Opens have been closed.
* (It is not yet obvious if this is the correct granularity.)
* - How the code handles serialization:
* - For the ClientId, it uses an exclusive lock while getting its
* SetClientId and during recovery. Otherwise, it uses a shared
* lock via a reference count.
* - For the rest of the data structures, it uses an SMP mutex
* (once the nfs client is SMP safe) and doesn't sleep while
* manipulating the linked lists.
* - The serialization of Open/Close/Lock/LockU falls out in the
* "wash", since OpenOwners and LockOwners are both mapped from
* Posix pid. In other words, there is only one Posix pid using
* any given owner, so that owner is serialized. (If you change
* the granularity of the OpenOwner, then code must be added to
* serialize Ops on the OpenOwner.)
* - When to get rid of OpenOwners and LockOwners.
* - The function nfscl_cleanup_common() is executed after a process exits.
* It goes through the client list looking for all Open and Lock Owners.
* When one is found, it is marked "defunct" or in the case of
* an OpenOwner without any Opens, freed.
* The renew thread scans for defunct Owners and gets rid of them,
* if it can. The LockOwners will also be deleted when the
* associated Open is closed.
* - If the LockU or Close Op(s) fail during close in a way
* that could be recovered upon retry, they are relinked to the
* ClientId's defunct open list and retried by the renew thread
* until they succeed or an unmount/recovery occurs.
* (Since we are done with them, they do not need to be recovered.)
*/
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>
/*
* Global variables
*/
extern struct nfsstatsv1 nfsstatsv1;
extern struct nfsreqhead nfsd_reqq;
extern u_int32_t newnfs_false, newnfs_true;
extern int nfscl_debuglevel;
extern int nfscl_enablecallb;
extern int nfs_numnfscbd;
NFSREQSPINLOCK;
NFSCLSTATEMUTEX;
int nfscl_inited = 0;
struct nfsclhead nfsclhead; /* Head of clientid list */
int nfscl_deleghighwater = NFSCLDELEGHIGHWATER;
int nfscl_layouthighwater = NFSCLLAYOUTHIGHWATER;
#endif /* !APPLEKEXT */
static int nfscl_delegcnt = 0;
static int nfscl_layoutcnt = 0;
static int nfscl_getopen(struct nfsclownerhead *, u_int8_t *, int, u_int8_t *,
u_int8_t *, u_int32_t, struct nfscllockowner **, struct nfsclopen **);
static void nfscl_clrelease(struct nfsclclient *);
static void nfscl_cleanclient(struct nfsclclient *);
static void nfscl_expireclient(struct nfsclclient *, struct nfsmount *,
struct ucred *, NFSPROC_T *);
static int nfscl_expireopen(struct nfsclclient *, struct nfsclopen *,
struct nfsmount *, struct ucred *, NFSPROC_T *);
static void nfscl_recover(struct nfsclclient *, struct ucred *, NFSPROC_T *);
static void nfscl_insertlock(struct nfscllockowner *, struct nfscllock *,
struct nfscllock *, int);
static int nfscl_updatelock(struct nfscllockowner *, struct nfscllock **,
struct nfscllock **, int);
static void nfscl_delegreturnall(struct nfsclclient *, NFSPROC_T *);
static u_int32_t nfscl_nextcbident(void);
static mount_t nfscl_getmnt(int, uint8_t *, u_int32_t, struct nfsclclient **);
static struct nfsclclient *nfscl_getclnt(u_int32_t);
static struct nfsclclient *nfscl_getclntsess(uint8_t *);
static struct nfscldeleg *nfscl_finddeleg(struct nfsclclient *, u_int8_t *,
int);
static void nfscl_retoncloselayout(vnode_t, struct nfsclclient *, uint8_t *,
int, struct nfsclrecalllayout **);
static void nfscl_reldevinfo_locked(struct nfscldevinfo *);
static struct nfscllayout *nfscl_findlayout(struct nfsclclient *, u_int8_t *,
int);
static struct nfscldevinfo *nfscl_finddevinfo(struct nfsclclient *, uint8_t *);
static int nfscl_checkconflict(struct nfscllockownerhead *, struct nfscllock *,
u_int8_t *, struct nfscllock **);
static void nfscl_freealllocks(struct nfscllockownerhead *, int);
static int nfscl_localconflict(struct nfsclclient *, u_int8_t *, int,
struct nfscllock *, u_int8_t *, struct nfscldeleg *, struct nfscllock **);
static void nfscl_newopen(struct nfsclclient *, struct nfscldeleg *,
struct nfsclowner **, struct nfsclowner **, struct nfsclopen **,
struct nfsclopen **, u_int8_t *, u_int8_t *, int, struct ucred *, int *);
static int nfscl_moveopen(vnode_t , struct nfsclclient *,
struct nfsmount *, struct nfsclopen *, struct nfsclowner *,
struct nfscldeleg *, struct ucred *, NFSPROC_T *);
static void nfscl_totalrecall(struct nfsclclient *);
static int nfscl_relock(vnode_t , struct nfsclclient *, struct nfsmount *,
struct nfscllockowner *, struct nfscllock *, struct ucred *, NFSPROC_T *);
static int nfscl_tryopen(struct nfsmount *, vnode_t , u_int8_t *, int,
u_int8_t *, int, u_int32_t, struct nfsclopen *, u_int8_t *, int,
struct nfscldeleg **, int, u_int32_t, struct ucred *, NFSPROC_T *);
static int nfscl_trylock(struct nfsmount *, vnode_t , u_int8_t *,
int, struct nfscllockowner *, int, int, u_int64_t, u_int64_t, short,
struct ucred *, NFSPROC_T *);
static int nfsrpc_reopen(struct nfsmount *, u_int8_t *, int, u_int32_t,
struct nfsclopen *, struct nfscldeleg **, struct ucred *, NFSPROC_T *);
static void nfscl_freedeleg(struct nfscldeleghead *, struct nfscldeleg *);
static int nfscl_errmap(struct nfsrv_descript *, u_int32_t);
static void nfscl_cleanup_common(struct nfsclclient *, u_int8_t *);
static int nfscl_recalldeleg(struct nfsclclient *, struct nfsmount *,
struct nfscldeleg *, vnode_t, struct ucred *, NFSPROC_T *, int);
static void nfscl_freeopenowner(struct nfsclowner *, int);
static void nfscl_cleandeleg(struct nfscldeleg *);
static int nfscl_trydelegreturn(struct nfscldeleg *, struct ucred *,
struct nfsmount *, NFSPROC_T *);
static void nfscl_emptylockowner(struct nfscllockowner *,
struct nfscllockownerfhhead *);
static void nfscl_mergeflayouts(struct nfsclflayouthead *,
struct nfsclflayouthead *);
static int nfscl_layoutrecall(int, struct nfscllayout *, uint32_t, uint64_t,
uint64_t, uint32_t, struct nfsclrecalllayout *);
static int nfscl_seq(uint32_t, uint32_t);
static void nfscl_layoutreturn(struct nfsmount *, struct nfscllayout *,
struct ucred *, NFSPROC_T *);
static void nfscl_dolayoutcommit(struct nfsmount *, struct nfscllayout *,
struct ucred *, NFSPROC_T *);
static short nfscberr_null[] = {
0,
0,
};
static short nfscberr_getattr[] = {
NFSERR_RESOURCE,
NFSERR_BADHANDLE,
NFSERR_BADXDR,
NFSERR_RESOURCE,
NFSERR_SERVERFAULT,
0,
};
static short nfscberr_recall[] = {
NFSERR_RESOURCE,
NFSERR_BADHANDLE,
NFSERR_BADSTATEID,
NFSERR_BADXDR,
NFSERR_RESOURCE,
NFSERR_SERVERFAULT,
0,
};
static short *nfscl_cberrmap[] = {
nfscberr_null,
nfscberr_null,
nfscberr_null,
nfscberr_getattr,
nfscberr_recall
};
#define NETFAMILY(clp) \
(((clp)->nfsc_flags & NFSCLFLAGS_AFINET6) ? AF_INET6 : AF_INET)
/*
* Called for an open operation.
* If the nfhp argument is NULL, just get an openowner.
*/
APPLESTATIC int
nfscl_open(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t amode, int usedeleg,
struct ucred *cred, NFSPROC_T *p, struct nfsclowner **owpp,
struct nfsclopen **opp, int *newonep, int *retp, int lockit)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *nowp;
struct nfsclopen *op = NULL, *nop = NULL;
struct nfscldeleg *dp;
struct nfsclownerhead *ohp;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int ret;
if (newonep != NULL)
*newonep = 0;
if (opp != NULL)
*opp = NULL;
if (owpp != NULL)
*owpp = NULL;
/*
* Might need one or both of these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nowp, struct nfsclowner *, sizeof (struct nfsclowner),
M_NFSCLOWNER, M_WAITOK);
if (nfhp != NULL)
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
fhlen - 1, M_NFSCLOPEN, M_WAITOK);
ret = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
if (ret != 0) {
FREE((caddr_t)nowp, M_NFSCLOWNER);
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
return (ret);
}
/*
* Get the Open iff it already exists.
* If none found, add the new one or return error, depending upon
* "create".
*/
NFSLOCKCLSTATE();
dp = NULL;
/* First check the delegation list */
if (nfhp != NULL && usedeleg) {
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, nfhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(nfhp, dp->nfsdl_fh, fhlen)) {
if (!(amode & NFSV4OPEN_ACCESSWRITE) ||
(dp->nfsdl_flags & NFSCLDL_WRITE))
break;
dp = NULL;
break;
}
}
}
if (dp != NULL) {
nfscl_filllockowner(p->td_proc, own, F_POSIX);
ohp = &dp->nfsdl_owner;
} else {
/* For NFSv4.1 and this option, use a single open_owner. */
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, own, F_POSIX);
else
nfscl_filllockowner(p->td_proc, own, F_POSIX);
ohp = &clp->nfsc_owner;
}
/* Now, search for an openowner */
LIST_FOREACH(owp, ohp, nfsow_list) {
if (!NFSBCMP(owp->nfsow_owner, own, NFSV4CL_LOCKNAMELEN))
break;
}
/*
* Create a new open, as required.
*/
nfscl_newopen(clp, dp, &owp, &nowp, &op, &nop, own, nfhp, fhlen,
cred, newonep);
/*
* Now, check the mode on the open and return the appropriate
* value.
*/
if (retp != NULL) {
if (nfhp != NULL && dp != NULL && nop == NULL)
/* new local open on delegation */
*retp = NFSCLOPEN_SETCRED;
else
*retp = NFSCLOPEN_OK;
}
if (op != NULL && (amode & ~(op->nfso_mode))) {
op->nfso_mode |= amode;
if (retp != NULL && dp == NULL)
*retp = NFSCLOPEN_DOOPEN;
}
/*
* Serialize modifications to the open owner for multiple threads
* within the same process using a read/write sleep lock.
* For NFSv4.1 and a single OpenOwner, allow concurrent open operations
* by acquiring a shared lock. The close operations still use an
* exclusive lock for this case.
*/
if (lockit != 0) {
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp)))) {
/*
* Get a shared lock on the OpenOwner, but first
* wait for any pending exclusive lock, so that the
* exclusive locker gets priority.
*/
nfsv4_lock(&owp->nfsow_rwlock, 0, NULL,
NFSCLSTATEMUTEXPTR, NULL);
nfsv4_getref(&owp->nfsow_rwlock, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} else
nfscl_lockexcl(&owp->nfsow_rwlock, NFSCLSTATEMUTEXPTR);
}
NFSUNLOCKCLSTATE();
if (nowp != NULL)
FREE((caddr_t)nowp, M_NFSCLOWNER);
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
if (owpp != NULL)
*owpp = owp;
if (opp != NULL)
*opp = op;
return (0);
}
/*
* Create a new open, as required.
*/
static void
nfscl_newopen(struct nfsclclient *clp, struct nfscldeleg *dp,
struct nfsclowner **owpp, struct nfsclowner **nowpp, struct nfsclopen **opp,
struct nfsclopen **nopp, u_int8_t *own, u_int8_t *fhp, int fhlen,
struct ucred *cred, int *newonep)
{
struct nfsclowner *owp = *owpp, *nowp;
struct nfsclopen *op, *nop;
if (nowpp != NULL)
nowp = *nowpp;
else
nowp = NULL;
if (nopp != NULL)
nop = *nopp;
else
nop = NULL;
if (owp == NULL && nowp != NULL) {
NFSBCOPY(own, nowp->nfsow_owner, NFSV4CL_LOCKNAMELEN);
LIST_INIT(&nowp->nfsow_open);
nowp->nfsow_clp = clp;
nowp->nfsow_seqid = 0;
nowp->nfsow_defunct = 0;
nfscl_lockinit(&nowp->nfsow_rwlock);
if (dp != NULL) {
nfsstatsv1.cllocalopenowners++;
LIST_INSERT_HEAD(&dp->nfsdl_owner, nowp, nfsow_list);
} else {
nfsstatsv1.clopenowners++;
LIST_INSERT_HEAD(&clp->nfsc_owner, nowp, nfsow_list);
}
owp = *owpp = nowp;
*nowpp = NULL;
if (newonep != NULL)
*newonep = 1;
}
/* If an fhp has been specified, create an Open as well. */
if (fhp != NULL) {
/* and look for the correct open, based upon FH */
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, fhp, fhlen))
break;
}
if (op == NULL && nop != NULL) {
nop->nfso_own = owp;
nop->nfso_mode = 0;
nop->nfso_opencnt = 0;
nop->nfso_posixlock = 1;
nop->nfso_fhlen = fhlen;
NFSBCOPY(fhp, nop->nfso_fh, fhlen);
LIST_INIT(&nop->nfso_lock);
nop->nfso_stateid.seqid = 0;
nop->nfso_stateid.other[0] = 0;
nop->nfso_stateid.other[1] = 0;
nop->nfso_stateid.other[2] = 0;
KASSERT(cred != NULL, ("%s: cred NULL\n", __func__));
newnfs_copyincred(cred, &nop->nfso_cred);
if (dp != NULL) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp,
nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
nfsstatsv1.cllocalopens++;
} else {
nfsstatsv1.clopens++;
}
LIST_INSERT_HEAD(&owp->nfsow_open, nop, nfso_list);
*opp = nop;
*nopp = NULL;
if (newonep != NULL)
*newonep = 1;
} else {
*opp = op;
}
}
}
/*
* Called to find/add a delegation to a client.
*/
APPLESTATIC int
nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp,
int fhlen, struct ucred *cred, NFSPROC_T *p, struct nfscldeleg **dpp)
{
struct nfscldeleg *dp = *dpp, *tdp;
/*
* First, if we have received a Read delegation for a file on a
* read/write file system, just return it, because they aren't
* useful, imho.
*/
if (mp != NULL && dp != NULL && !NFSMNT_RDONLY(mp) &&
(dp->nfsdl_flags & NFSCLDL_READ)) {
(void) nfscl_trydelegreturn(dp, cred, VFSTONFS(mp), p);
FREE((caddr_t)dp, M_NFSCLDELEG);
*dpp = NULL;
return (0);
}
/* Look for the correct deleg, based upon FH */
NFSLOCKCLSTATE();
tdp = nfscl_finddeleg(clp, nfhp, fhlen);
if (tdp == NULL) {
if (dp == NULL) {
NFSUNLOCKCLSTATE();
return (NFSERR_BADSTATEID);
}
*dpp = NULL;
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, nfsdl_list);
LIST_INSERT_HEAD(NFSCLDELEGHASH(clp, nfhp, fhlen), dp,
nfsdl_hash);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
nfsstatsv1.cldelegates++;
nfscl_delegcnt++;
} else {
/*
* Delegation already exists, what do we do if a new one??
*/
if (dp != NULL) {
printf("Deleg already exists!\n");
FREE((caddr_t)dp, M_NFSCLDELEG);
*dpp = NULL;
} else {
*dpp = tdp;
}
}
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Find a delegation for this file handle. Return NULL upon failure.
*/
static struct nfscldeleg *
nfscl_finddeleg(struct nfsclclient *clp, u_int8_t *fhp, int fhlen)
{
struct nfscldeleg *dp;
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, fhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(dp->nfsdl_fh, fhp, fhlen))
break;
}
return (dp);
}
/*
* Get a stateid for an I/O operation. First, look for an open and iff
* found, return either a lockowner stateid or the open stateid.
* If no Open is found, just return error and the special stateid of all zeros.
*/
APPLESTATIC int
nfscl_getstateid(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t mode,
int fords, struct ucred *cred, NFSPROC_T *p, nfsv4stateid_t *stateidp,
void **lckpp)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op = NULL, *top;
struct nfscllockowner *lp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error, done;
*lckpp = NULL;
/*
* Initially, just set the special stateid of all zeros.
* (Don't do this for a DS, since the special stateid can't be used.)
*/
if (fords == 0) {
stateidp->seqid = 0;
stateidp->other[0] = 0;
stateidp->other[1] = 0;
stateidp->other[2] = 0;
}
if (vnode_vtype(vp) != VREG)
return (EISDIR);
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (EACCES);
}
/*
* Wait for recovery to complete.
*/
while ((clp->nfsc_flags & NFSCLFLAGS_RECVRINPROG))
(void) nfsmsleep(&clp->nfsc_flags, NFSCLSTATEMUTEXPTR,
PZERO, "nfsrecvr", NULL);
/*
* First, look for a delegation.
*/
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, nfhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(nfhp, dp->nfsdl_fh, fhlen)) {
if (!(mode & NFSV4OPEN_ACCESSWRITE) ||
(dp->nfsdl_flags & NFSCLDL_WRITE)) {
stateidp->seqid = dp->nfsdl_stateid.seqid;
stateidp->other[0] = dp->nfsdl_stateid.other[0];
stateidp->other[1] = dp->nfsdl_stateid.other[1];
stateidp->other[2] = dp->nfsdl_stateid.other[2];
if (!(np->n_flag & NDELEGRECALL)) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp,
nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp,
nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC +
120;
dp->nfsdl_rwlock.nfslock_usecnt++;
*lckpp = (void *)&dp->nfsdl_rwlock;
}
NFSUNLOCKCLSTATE();
return (0);
}
break;
}
}
if (p != NULL) {
/*
* If p != NULL, we want to search the parentage tree
* for a matching OpenOwner and use that.
*/
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, own, F_POSIX);
else
nfscl_filllockowner(p->td_proc, own, F_POSIX);
lp = NULL;
error = nfscl_getopen(&clp->nfsc_owner, nfhp, fhlen, own, own,
mode, &lp, &op);
if (error == 0 && lp != NULL && fords == 0) {
/* Don't return a lock stateid for a DS. */
stateidp->seqid =
lp->nfsl_stateid.seqid;
stateidp->other[0] =
lp->nfsl_stateid.other[0];
stateidp->other[1] =
lp->nfsl_stateid.other[1];
stateidp->other[2] =
lp->nfsl_stateid.other[2];
NFSUNLOCKCLSTATE();
return (0);
}
}
if (op == NULL) {
/* If not found, just look for any OpenOwner that will work. */
top = NULL;
done = 0;
owp = LIST_FIRST(&clp->nfsc_owner);
while (!done && owp != NULL) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, nfhp, fhlen)) {
if (top == NULL && (op->nfso_mode &
NFSV4OPEN_ACCESSWRITE) != 0 &&
(mode & NFSV4OPEN_ACCESSREAD) != 0)
top = op;
if ((mode & op->nfso_mode) == mode) {
done = 1;
break;
}
}
}
if (!done)
owp = LIST_NEXT(owp, nfsow_list);
}
if (!done) {
NFSCL_DEBUG(2, "openmode top=%p\n", top);
if (top == NULL || NFSHASOPENMODE(nmp)) {
NFSUNLOCKCLSTATE();
return (ENOENT);
} else
op = top;
}
/*
* For read aheads or write behinds, use the open cred.
* A read ahead or write behind is indicated by p == NULL.
*/
if (p == NULL)
newnfs_copycred(&op->nfso_cred, cred);
}
/*
* No lock stateid, so return the open stateid.
*/
stateidp->seqid = op->nfso_stateid.seqid;
stateidp->other[0] = op->nfso_stateid.other[0];
stateidp->other[1] = op->nfso_stateid.other[1];
stateidp->other[2] = op->nfso_stateid.other[2];
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Search for a matching file, mode and, optionally, lockowner.
*/
static int
nfscl_getopen(struct nfsclownerhead *ohp, u_int8_t *nfhp, int fhlen,
u_int8_t *openown, u_int8_t *lockown, u_int32_t mode,
struct nfscllockowner **lpp, struct nfsclopen **opp)
{
struct nfsclowner *owp;
struct nfsclopen *op, *rop, *rop2;
struct nfscllockowner *lp;
int keep_looping;
if (lpp != NULL)
*lpp = NULL;
/*
* rop will be set to the open to be returned. There are three
* variants of this, all for an open of the correct file:
* 1 - A match of lockown.
* 2 - A match of the openown, when no lockown match exists.
* 3 - A match for any open, if no openown or lockown match exists.
* Looking for #2 over #3 probably isn't necessary, but since
* RFC3530 is vague w.r.t. the relationship between openowners and
* lockowners, I think this is the safer way to go.
*/
rop = NULL;
rop2 = NULL;
keep_looping = 1;
/* Search the client list */
owp = LIST_FIRST(ohp);
while (owp != NULL && keep_looping != 0) {
/* and look for the correct open */
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL && keep_looping != 0) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, nfhp, fhlen)
&& (op->nfso_mode & mode) == mode) {
if (lpp != NULL) {
/* Now look for a matching lockowner. */
LIST_FOREACH(lp, &op->nfso_lock,
nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner,
lockown,
NFSV4CL_LOCKNAMELEN)) {
*lpp = lp;
rop = op;
keep_looping = 0;
break;
}
}
}
if (rop == NULL && !NFSBCMP(owp->nfsow_owner,
openown, NFSV4CL_LOCKNAMELEN)) {
rop = op;
if (lpp == NULL)
keep_looping = 0;
}
if (rop2 == NULL)
rop2 = op;
}
op = LIST_NEXT(op, nfso_list);
}
owp = LIST_NEXT(owp, nfsow_list);
}
if (rop == NULL)
rop = rop2;
if (rop == NULL)
return (EBADF);
*opp = rop;
return (0);
}
/*
* Release use of an open owner. Called when open operations are done
* with the open owner.
*/
APPLESTATIC void
nfscl_ownerrelease(struct nfsmount *nmp, struct nfsclowner *owp,
__unused int error, __unused int candelete, int unlocked)
{
if (owp == NULL)
return;
NFSLOCKCLSTATE();
if (unlocked == 0) {
if (NFSHASONEOPENOWN(nmp))
nfsv4_relref(&owp->nfsow_rwlock);
else
nfscl_lockunlock(&owp->nfsow_rwlock);
}
nfscl_clrelease(owp->nfsow_clp);
NFSUNLOCKCLSTATE();
}
/*
* Release use of an open structure under an open owner.
*/
APPLESTATIC void
nfscl_openrelease(struct nfsmount *nmp, struct nfsclopen *op, int error,
int candelete)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
if (op == NULL)
return;
NFSLOCKCLSTATE();
owp = op->nfso_own;
if (NFSHASONEOPENOWN(nmp))
nfsv4_relref(&owp->nfsow_rwlock);
else
nfscl_lockunlock(&owp->nfsow_rwlock);
clp = owp->nfsow_clp;
if (error && candelete && op->nfso_opencnt == 0)
nfscl_freeopen(op, 0);
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Called to get a clientid structure. It will optionally lock the
* client data structures to do the SetClientId/SetClientId_confirm,
* but will release that lock and return the clientid with a reference
* count on it.
* If the "cred" argument is NULL, a new clientid should not be created.
* If the "p" argument is NULL, a SetClientID/SetClientIDConfirm cannot
* be done.
* The start_renewthread argument tells nfscl_getcl() to start a renew
* thread if this creates a new clp.
* It always returns a clientid in *clpp with a reference count on it,
* unless returning an error.
*/
APPLESTATIC int
nfscl_getcl(struct mount *mp, struct ucred *cred, NFSPROC_T *p,
int start_renewthread, struct nfsclclient **clpp)
{
struct nfsclclient *clp;
struct nfsclclient *newclp = NULL;
struct nfsmount *nmp;
char uuid[HOSTUUIDLEN];
int igotlock = 0, error, trystalecnt, clidinusedelay, i;
u_int16_t idlen = 0;
nmp = VFSTONFS(mp);
if (cred != NULL) {
getcredhostuuid(cred, uuid, sizeof uuid);
idlen = strlen(uuid);
if (idlen > 0)
idlen += sizeof (u_int64_t);
else
idlen += sizeof (u_int64_t) + 16; /* 16 random bytes */
MALLOC(newclp, struct nfsclclient *,
sizeof (struct nfsclclient) + idlen - 1, M_NFSCLCLIENT,
M_WAITOK | M_ZERO);
}
NFSLOCKCLSTATE();
/*
* If a forced dismount is already in progress, don't
* allocate a new clientid and get out now. For the case where
* clp != NULL, this is a harmless optimization.
*/
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
if (newclp != NULL)
free(newclp, M_NFSCLCLIENT);
return (EBADF);
}
clp = nmp->nm_clp;
if (clp == NULL) {
if (newclp == NULL) {
NFSUNLOCKCLSTATE();
return (EACCES);
}
clp = newclp;
clp->nfsc_idlen = idlen;
LIST_INIT(&clp->nfsc_owner);
TAILQ_INIT(&clp->nfsc_deleg);
TAILQ_INIT(&clp->nfsc_layout);
LIST_INIT(&clp->nfsc_devinfo);
for (i = 0; i < NFSCLDELEGHASHSIZE; i++)
LIST_INIT(&clp->nfsc_deleghash[i]);
for (i = 0; i < NFSCLLAYOUTHASHSIZE; i++)
LIST_INIT(&clp->nfsc_layouthash[i]);
clp->nfsc_flags = NFSCLFLAGS_INITED;
clp->nfsc_clientidrev = 1;
clp->nfsc_cbident = nfscl_nextcbident();
nfscl_fillclid(nmp->nm_clval, uuid, clp->nfsc_id,
clp->nfsc_idlen);
LIST_INSERT_HEAD(&nfsclhead, clp, nfsc_list);
nmp->nm_clp = clp;
clp->nfsc_nmp = nmp;
NFSUNLOCKCLSTATE();
if (start_renewthread != 0)
nfscl_start_renewthread(clp);
} else {
NFSUNLOCKCLSTATE();
if (newclp != NULL)
free(newclp, M_NFSCLCLIENT);
}
NFSLOCKCLSTATE();
while ((clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID) == 0 && !igotlock &&
!NFSCL_FORCEDISM(mp))
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, mp);
if (igotlock == 0) {
/*
* Call nfsv4_lock() with "iwantlock == 0" so that it will
* wait for a pending exclusive lock request. This gives the
* exclusive lock request priority over this shared lock
* request.
* An exclusive lock on nfsc_lock is used mainly for server
* crash recoveries.
*/
nfsv4_lock(&clp->nfsc_lock, 0, NULL, NFSCLSTATEMUTEXPTR, mp);
nfsv4_getref(&clp->nfsc_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
}
if (igotlock == 0 && NFSCL_FORCEDISM(mp)) {
/*
* Both nfsv4_lock() and nfsv4_getref() know to check
* for NFSCL_FORCEDISM() and return without sleeping to
* wait for the exclusive lock to be released, since it
* might be held by nfscl_umount() and we need to get out
* now for that case and not wait until nfscl_umount()
* releases it.
*/
NFSUNLOCKCLSTATE();
return (EBADF);
}
NFSUNLOCKCLSTATE();
/*
* If it needs a clientid, do the setclientid now.
*/
if ((clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID) == 0) {
if (!igotlock)
panic("nfscl_clget");
if (p == NULL || cred == NULL) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (EACCES);
}
/*
* If RFC3530 Sec. 14.2.33 is taken literally,
* NFSERR_CLIDINUSE will be returned persistently for the
* case where a new mount of the same file system is using
* a different principal. In practice, NFSERR_CLIDINUSE is
* only returned when there is outstanding unexpired state
* on the clientid. As such, try for twice the lease
* interval, if we know what that is. Otherwise, make a
* wild ass guess.
* The case of returning NFSERR_STALECLIENTID is far less
* likely, but might occur if there is a significant delay
* between doing the SetClientID and SetClientIDConfirm Ops,
* such that the server throws away the clientid before
* receiving the SetClientIDConfirm.
*/
if (clp->nfsc_renew > 0)
clidinusedelay = NFSCL_LEASE(clp->nfsc_renew) * 2;
else
clidinusedelay = 120;
trystalecnt = 3;
do {
error = nfsrpc_setclient(nmp, clp, 0, cred, p);
if (error == NFSERR_STALECLIENTID ||
error == NFSERR_STALEDONTRECOVER ||
error == NFSERR_BADSESSION ||
error == NFSERR_CLIDINUSE) {
(void) nfs_catnap(PZERO, error, "nfs_setcl");
}
} while (((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trystalecnt > 0) ||
(error == NFSERR_CLIDINUSE && --clidinusedelay > 0));
if (error) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (error);
}
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
}
if (igotlock) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 1);
NFSUNLOCKCLSTATE();
}
*clpp = clp;
return (0);
}
/*
* Get a reference to a clientid and return it, if valid.
*/
APPLESTATIC struct nfsclclient *
nfscl_findcl(struct nfsmount *nmp)
{
struct nfsclclient *clp;
clp = nmp->nm_clp;
if (clp == NULL || !(clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID))
return (NULL);
return (clp);
}
/*
* Release the clientid structure. It may be locked or reference counted.
*/
static void
nfscl_clrelease(struct nfsclclient *clp)
{
if (clp->nfsc_lock.nfslock_lock & NFSV4LOCK_LOCK)
nfsv4_unlock(&clp->nfsc_lock, 0);
else
nfsv4_relref(&clp->nfsc_lock);
}
/*
* External call for nfscl_clrelease.
*/
APPLESTATIC void
nfscl_clientrelease(struct nfsclclient *clp)
{
NFSLOCKCLSTATE();
if (clp->nfsc_lock.nfslock_lock & NFSV4LOCK_LOCK)
nfsv4_unlock(&clp->nfsc_lock, 0);
else
nfsv4_relref(&clp->nfsc_lock);
NFSUNLOCKCLSTATE();
}
/*
* Called when wanting to lock a byte region.
*/
APPLESTATIC int
nfscl_getbytelock(vnode_t vp, u_int64_t off, u_int64_t len,
short type, struct ucred *cred, NFSPROC_T *p, struct nfsclclient *rclp,
int recovery, void *id, int flags, u_int8_t *rownp, u_int8_t *ropenownp,
struct nfscllockowner **lpp, int *newonep, int *donelocallyp)
{
struct nfscllockowner *lp;
struct nfsclopen *op;
struct nfsclclient *clp;
struct nfscllockowner *nlp;
struct nfscllock *nlop, *otherlop;
struct nfscldeleg *dp = NULL, *ldp = NULL;
struct nfscllockownerhead *lhp = NULL;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN], *ownp, openown[NFSV4CL_LOCKNAMELEN];
u_int8_t *openownp;
int error = 0, ret, donelocally = 0;
u_int32_t mode;
/* For Lock Ops, the open mode doesn't matter, so use 0 to match any. */
mode = 0;
np = VTONFS(vp);
*lpp = NULL;
lp = NULL;
*newonep = 0;
*donelocallyp = 0;
/*
* Might need these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nlp, struct nfscllockowner *,
sizeof (struct nfscllockowner), M_NFSCLLOCKOWNER, M_WAITOK);
MALLOC(otherlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
MALLOC(nlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
nlop->nfslo_type = type;
nlop->nfslo_first = off;
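/*
* A length of NFS64BITSSET means the lock extends to EOF; otherwise
* compute the end offset and reject zero-length or wrapped ranges.
*/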
if (len == NFS64BITSSET) {
nlop->nfslo_end = NFS64BITSSET;
} else {
nlop->nfslo_end = off + len;
if (nlop->nfslo_end <= nlop->nfslo_first)
error = NFSERR_INVAL;
}
if (!error) {
if (recovery)
clp = rclp;
else
error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
}
if (error) {
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
FREE((caddr_t)otherlop, M_NFSCLLOCK);
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (error);
}
op = NULL;
if (recovery) {
ownp = rownp;
openownp = ropenownp;
} else {
nfscl_filllockowner(id, own, flags);
ownp = own;
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, openown, F_POSIX);
else
nfscl_filllockowner(p->td_proc, openown, F_POSIX);
openownp = openown;
}
if (!recovery) {
NFSLOCKCLSTATE();
/*
* First, search for a delegation. If one exists for this file,
* the lock can be done locally against it, so long as there
* isn't a local lock conflict.
*/
ldp = dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
/* Just sanity check for correct type of delegation */
if (dp != NULL && ((dp->nfsdl_flags &
(NFSCLDL_RECALL | NFSCLDL_DELEGRET)) != 0 ||
(type == F_WRLCK &&
(dp->nfsdl_flags & NFSCLDL_WRITE) == 0)))
dp = NULL;
}
if (dp != NULL) {
/* Now, find an open and maybe a lockowner. */
ret = nfscl_getopen(&dp->nfsdl_owner, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len, openownp, ownp, mode, NULL, &op);
if (ret)
ret = nfscl_getopen(&clp->nfsc_owner,
np->n_fhp->nfh_fh, np->n_fhp->nfh_len, openownp,
ownp, mode, NULL, &op);
if (!ret) {
lhp = &dp->nfsdl_lock;
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
donelocally = 1;
} else {
dp = NULL;
}
}
if (!donelocally) {
/*
* Get the related Open and maybe lockowner.
*/
error = nfscl_getopen(&clp->nfsc_owner,
np->n_fhp->nfh_fh, np->n_fhp->nfh_len, openownp,
ownp, mode, &lp, &op);
if (!error)
lhp = &op->nfso_lock;
}
if (!error && !recovery)
error = nfscl_localconflict(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len, nlop, ownp, ldp, NULL);
if (error) {
if (!recovery) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
FREE((caddr_t)otherlop, M_NFSCLLOCK);
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (error);
}
/*
* Ok, see if a lockowner exists and create one, as required.
*/
if (lp == NULL)
LIST_FOREACH(lp, lhp, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, ownp, NFSV4CL_LOCKNAMELEN))
break;
}
if (lp == NULL) {
NFSBCOPY(ownp, nlp->nfsl_owner, NFSV4CL_LOCKNAMELEN);
if (recovery)
NFSBCOPY(ropenownp, nlp->nfsl_openowner,
NFSV4CL_LOCKNAMELEN);
else
NFSBCOPY(op->nfso_own->nfsow_owner, nlp->nfsl_openowner,
NFSV4CL_LOCKNAMELEN);
nlp->nfsl_seqid = 0;
nlp->nfsl_lockflags = flags;
nlp->nfsl_inprog = NULL;
nfscl_lockinit(&nlp->nfsl_rwlock);
LIST_INIT(&nlp->nfsl_lock);
if (donelocally) {
nlp->nfsl_open = NULL;
nfsstatsv1.cllocallockowners++;
} else {
nlp->nfsl_open = op;
nfsstatsv1.cllockowners++;
}
LIST_INSERT_HEAD(lhp, nlp, nfsl_list);
lp = nlp;
nlp = NULL;
*newonep = 1;
}
/*
* Now, update the byte ranges for locks.
*/
ret = nfscl_updatelock(lp, &nlop, &otherlop, donelocally);
if (!ret)
donelocally = 1;
if (donelocally) {
*donelocallyp = 1;
if (!recovery)
nfscl_clrelease(clp);
} else {
/*
* Serialize modifications to the lock owner across multiple threads
* of the same process using a read/write lock.
*/
if (!recovery)
nfscl_lockexcl(&lp->nfsl_rwlock, NFSCLSTATEMUTEXPTR);
}
if (!recovery)
NFSUNLOCKCLSTATE();
if (nlp)
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
if (nlop)
FREE((caddr_t)nlop, M_NFSCLLOCK);
if (otherlop)
FREE((caddr_t)otherlop, M_NFSCLLOCK);
*lpp = lp;
return (0);
}
/*
* Called to unlock a byte range, for LockU.
*/
APPLESTATIC int
nfscl_relbytelock(vnode_t vp, u_int64_t off, u_int64_t len,
__unused struct ucred *cred, NFSPROC_T *p, int callcnt,
struct nfsclclient *clp, void *id, int flags,
struct nfscllockowner **lpp, int *dorpcp)
{
struct nfscllockowner *lp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllock *nlop, *other_lop = NULL;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int ret = 0, fnd;
np = VTONFS(vp);
*lpp = NULL;
*dorpcp = 0;
/*
* Might need these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
nlop->nfslo_type = F_UNLCK;
nlop->nfslo_first = off;
if (len == NFS64BITSSET) {
nlop->nfslo_end = NFS64BITSSET;
} else {
nlop->nfslo_end = off + len;
if (nlop->nfslo_end <= nlop->nfslo_first) {
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (NFSERR_INVAL);
}
}
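/*
* On the first call (callcnt == 0), also copy the lock so that any
* local byte range locks held under a delegation can be released
* without consuming nlop.
*/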
if (callcnt == 0) {
MALLOC(other_lop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
*other_lop = *nlop;
}
nfscl_filllockowner(id, own, flags);
dp = NULL;
NFSLOCKCLSTATE();
if (callcnt == 0)
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
/*
* First, unlock any local regions on a delegation.
*/
if (dp != NULL) {
/* Look for this lockowner. */
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL)
/* Use other_lop, so nlop is still available */
(void)nfscl_updatelock(lp, &other_lop, NULL, 1);
}
/*
* Now, find a matching open/lockowner that hasn't already been done,
* as marked by nfsl_inprog.
*/
lp = NULL;
fnd = 0;
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lp->nfsl_inprog == NULL &&
!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN)) {
fnd = 1;
break;
}
}
if (fnd)
break;
}
}
if (fnd)
break;
}
if (lp != NULL) {
ret = nfscl_updatelock(lp, &nlop, NULL, 0);
if (ret)
*dorpcp = 1;
/*
* Serialize modifications to the lock owner across multiple
* threads of the same process using a read/write lock.
*/
lp->nfsl_inprog = p;
nfscl_lockexcl(&lp->nfsl_rwlock, NFSCLSTATEMUTEXPTR);
*lpp = lp;
}
NFSUNLOCKCLSTATE();
if (nlop)
FREE((caddr_t)nlop, M_NFSCLLOCK);
if (other_lop)
FREE((caddr_t)other_lop, M_NFSCLLOCK);
return (0);
}
/*
* Release all lockowners marked in progress for this process and file.
*/
APPLESTATIC void
nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p,
void *id, int flags)
{
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllockowner *lp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
np = VTONFS(vp);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lp->nfsl_inprog == p &&
!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN)) {
lp->nfsl_inprog = NULL;
nfscl_lockunlock(&lp->nfsl_rwlock);
}
}
}
}
}
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Called to find out if any bytes within the byte range specified are
* write locked by the calling process. Used to determine if flushing
* is required before a LockU.
* If in doubt, return 1, so the flush will occur.
*/
APPLESTATIC int
nfscl_checkwritelocked(vnode_t vp, struct flock *fl,
struct ucred *cred, NFSPROC_T *p, void *id, int flags)
{
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsclopen *op;
struct nfsclclient *clp;
struct nfscllock *lop;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int64_t off, end;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error = 0;
np = VTONFS(vp);
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
off = fl->l_start;
break;
case SEEK_END:
off = np->n_size + fl->l_start;
break;
default:
return (1);
}
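/*
* A length of 0 means the lock extends to EOF; otherwise compute the
* end offset and treat overflow as "in doubt", forcing a flush.
*/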
if (fl->l_len != 0) {
end = off + fl->l_len;
if (end < off)
return (1);
} else {
end = NFS64BITSSET;
}
error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
if (error)
return (1);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
/*
* First check the delegation locks.
*/
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= end)
break;
if (lop->nfslo_end <= off)
continue;
if (lop->nfslo_type == F_WRLCK) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (1);
}
}
}
}
/*
* Now, check state against the server.
*/
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= end)
break;
if (lop->nfslo_end <= off)
continue;
if (lop->nfslo_type == F_WRLCK) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (1);
}
}
}
}
}
}
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Release a byte range lock owner structure.
*/
APPLESTATIC void
nfscl_lockrelease(struct nfscllockowner *lp, int error, int candelete)
{
struct nfsclclient *clp;
if (lp == NULL)
return;
NFSLOCKCLSTATE();
clp = lp->nfsl_open->nfso_own->nfsow_clp;
if (error != 0 && candelete &&
(lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED) == 0)
nfscl_freelockowner(lp, 0);
else
nfscl_lockunlock(&lp->nfsl_rwlock);
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Free up an open structure and any associated byte range lock structures.
*/
APPLESTATIC void
nfscl_freeopen(struct nfsclopen *op, int local)
{
LIST_REMOVE(op, nfso_list);
nfscl_freealllocks(&op->nfso_lock, local);
FREE((caddr_t)op, M_NFSCLOPEN);
if (local)
nfsstatsv1.cllocalopens--;
else
nfsstatsv1.clopens--;
}
/*
* Free up all lock owners and associated locks.
*/
static void
nfscl_freealllocks(struct nfscllockownerhead *lhp, int local)
{
struct nfscllockowner *lp, *nlp;
LIST_FOREACH_SAFE(lp, lhp, nfsl_list, nlp) {
if ((lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED))
panic("nfscllckw");
nfscl_freelockowner(lp, local);
}
}
/*
* Called for an Open when NFSERR_EXPIRED is received from the server.
* If there are no byte range locks and no Share Deny would be lost,
* try to do a fresh Open. Otherwise, free the open.
*/
static int
nfscl_expireopen(struct nfsclclient *clp, struct nfsclopen *op,
struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p)
{
struct nfscllockowner *lp;
struct nfscldeleg *dp;
int mustdelete = 0, error;
/*
* Look for any byte range lock(s).
*/
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
mustdelete = 1;
break;
}
}
/*
* If no byte range lock(s) nor a Share deny, try to re-open.
*/
if (!mustdelete && (op->nfso_mode & NFSLCK_DENYBITS) == 0) {
newnfs_copycred(&op->nfso_cred, cred);
dp = NULL;
error = nfsrpc_reopen(nmp, op->nfso_fh,
op->nfso_fhlen, op->nfso_mode, op, &dp, cred, p);
if (error) {
mustdelete = 1;
if (dp != NULL) {
FREE((caddr_t)dp, M_NFSCLDELEG);
dp = NULL;
}
}
if (dp != NULL)
nfscl_deleg(nmp->nm_mountp, clp, op->nfso_fh,
op->nfso_fhlen, cred, p, &dp);
}
/*
* If a byte range lock or Share deny or couldn't re-open, free it.
*/
if (mustdelete)
nfscl_freeopen(op, 0);
return (mustdelete);
}
/*
* Free up an open owner structure.
*/
static void
nfscl_freeopenowner(struct nfsclowner *owp, int local)
{
LIST_REMOVE(owp, nfsow_list);
FREE((caddr_t)owp, M_NFSCLOWNER);
if (local)
nfsstatsv1.cllocalopenowners--;
else
nfsstatsv1.clopenowners--;
}
/*
* Free up a byte range lock owner structure.
*/
APPLESTATIC void
nfscl_freelockowner(struct nfscllockowner *lp, int local)
{
struct nfscllock *lop, *nlop;
LIST_REMOVE(lp, nfsl_list);
LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) {
nfscl_freelock(lop, local);
}
FREE((caddr_t)lp, M_NFSCLLOCKOWNER);
if (local)
nfsstatsv1.cllocallockowners--;
else
nfsstatsv1.cllockowners--;
}
/*
* Free up a byte range lock structure.
*/
APPLESTATIC void
nfscl_freelock(struct nfscllock *lop, int local)
{
LIST_REMOVE(lop, nfslo_list);
FREE((caddr_t)lop, M_NFSCLLOCK);
if (local)
nfsstatsv1.cllocallocks--;
else
nfsstatsv1.cllocks--;
}
/*
* Clean out the state related to a delegation.
*/
static void
nfscl_cleandeleg(struct nfscldeleg *dp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
LIST_FOREACH_SAFE(owp, &dp->nfsdl_owner, nfsow_list, nowp) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfscleandel");
nfscl_freeopen(op, 1);
}
nfscl_freeopenowner(owp, 1);
}
nfscl_freealllocks(&dp->nfsdl_lock, 1);
}
/*
* Free a delegation.
*/
static void
nfscl_freedeleg(struct nfscldeleghead *hdp, struct nfscldeleg *dp)
{
TAILQ_REMOVE(hdp, dp, nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
FREE((caddr_t)dp, M_NFSCLDELEG);
nfsstatsv1.cldelegates--;
nfscl_delegcnt--;
}
/*
* Free up all state related to this client structure.
*/
static void
nfscl_cleanclient(struct nfsclclient *clp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op, *nop;
struct nfscllayout *lyp, *nlyp;
struct nfscldevinfo *dip, *ndip;
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp)
nfscl_freelayout(lyp);
LIST_FOREACH_SAFE(dip, &clp->nfsc_devinfo, nfsdi_list, ndip)
nfscl_freedevinfo(dip);
/* Now, all the OpenOwners, etc. */
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
LIST_FOREACH_SAFE(op, &owp->nfsow_open, nfso_list, nop) {
nfscl_freeopen(op, 0);
}
nfscl_freeopenowner(owp, 0);
}
}
/*
* Called when an NFSERR_EXPIRED is received from the server.
*/
static void
nfscl_expireclient(struct nfsclclient *clp, struct nfsmount *nmp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp, *towp;
struct nfsclopen *op, *nop, *top;
struct nfscldeleg *dp, *ndp;
int ret, printed = 0;
/*
* First, merge locally issued Opens into the list for the server.
*/
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
owp = LIST_FIRST(&dp->nfsdl_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfsclexp");
LIST_FOREACH(towp, &clp->nfsc_owner, nfsow_list) {
if (!NFSBCMP(towp->nfsow_owner, owp->nfsow_owner,
NFSV4CL_LOCKNAMELEN))
break;
}
if (towp != NULL) {
/* Merge opens in */
LIST_FOREACH(top, &towp->nfsow_open, nfso_list) {
if (top->nfso_fhlen == op->nfso_fhlen &&
!NFSBCMP(top->nfso_fh, op->nfso_fh,
op->nfso_fhlen)) {
top->nfso_mode |= op->nfso_mode;
top->nfso_opencnt += op->nfso_opencnt;
break;
}
}
if (top == NULL) {
/* Just add the open to the owner list */
LIST_REMOVE(op, nfso_list);
op->nfso_own = towp;
LIST_INSERT_HEAD(&towp->nfsow_open, op, nfso_list);
nfsstatsv1.cllocalopens--;
nfsstatsv1.clopens++;
}
} else {
/* Just add the openowner to the client list */
LIST_REMOVE(owp, nfsow_list);
owp->nfsow_clp = clp;
LIST_INSERT_HEAD(&clp->nfsc_owner, owp, nfsow_list);
nfsstatsv1.cllocalopenowners--;
nfsstatsv1.clopenowners++;
nfsstatsv1.cllocalopens--;
nfsstatsv1.clopens++;
}
}
owp = nowp;
}
if (!printed && !LIST_EMPTY(&dp->nfsdl_lock)) {
printed = 1;
printf("nfsv4 expired locks lost\n");
}
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
dp = ndp;
}
if (!TAILQ_EMPTY(&clp->nfsc_deleg))
panic("nfsclexp");
/*
* Now, try and reopen against the server.
*/
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
owp->nfsow_seqid = 0;
LIST_FOREACH_SAFE(op, &owp->nfsow_open, nfso_list, nop) {
ret = nfscl_expireopen(clp, op, nmp, cred, p);
if (ret && !printed) {
printed = 1;
printf("nfsv4 expired locks lost\n");
}
}
if (LIST_EMPTY(&owp->nfsow_open))
nfscl_freeopenowner(owp, 0);
}
}
/*
* This function must be called after the process represented by "own" has
* exited. Must be called with CLSTATE lock held.
*/
static void
nfscl_cleanup_common(struct nfsclclient *clp, u_int8_t *own)
{
struct nfsclowner *owp, *nowp;
struct nfscllockowner *lp, *nlp;
struct nfscldeleg *dp;
/* First, get rid of local locks on delegations. */
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH_SAFE(lp, &dp->nfsdl_lock, nfsl_list, nlp) {
if (!NFSBCMP(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN)) {
if ((lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED))
panic("nfscllckw");
nfscl_freelockowner(lp, 1);
}
}
}
owp = LIST_FIRST(&clp->nfsc_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
if (!NFSBCMP(owp->nfsow_owner, own,
NFSV4CL_LOCKNAMELEN)) {
/*
* If there are children that haven't closed the
* file descriptors yet, the opens will still be
* here. For that case, let the renew thread clear
* out the OpenOwner later.
*/
if (LIST_EMPTY(&owp->nfsow_open))
nfscl_freeopenowner(owp, 0);
else
owp->nfsow_defunct = 1;
}
owp = nowp;
}
}
/*
* Find open/lock owners for processes that have exited.
*/
static void
nfscl_cleanupkext(struct nfsclclient *clp, struct nfscllockownerfhhead *lhp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscllockowner *lp, *nlp;
struct nfscldeleg *dp;
NFSPROCLISTLOCK();
NFSLOCKCLSTATE();
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp) {
if (LIST_EMPTY(&lp->nfsl_lock))
nfscl_emptylockowner(lp, lhp);
}
}
if (nfscl_procdoesntexist(owp->nfsow_owner))
nfscl_cleanup_common(clp, owp->nfsow_owner);
}
/*
* For the single open_owner case, these lock owners need to be
* checked separately to see if their processes still exist, because
* nfscl_procdoesntexist() never returns true for the single
* open_owner, so the loop above never calls nfscl_cleanup_common()
* for it.
*/
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH_SAFE(lp, &dp->nfsdl_lock, nfsl_list, nlp) {
if (nfscl_procdoesntexist(lp->nfsl_owner))
nfscl_cleanup_common(clp, lp->nfsl_owner);
}
}
NFSUNLOCKCLSTATE();
NFSPROCLISTUNLOCK();
}
/*
* Take the empty lock owner and move it to the local lhp list if the
* associated process no longer exists.
*/
static void
nfscl_emptylockowner(struct nfscllockowner *lp,
struct nfscllockownerfhhead *lhp)
{
struct nfscllockownerfh *lfhp, *mylfhp;
struct nfscllockowner *nlp;
int fnd_it;
/* If not a Posix lock owner, just return. */
if ((lp->nfsl_lockflags & F_POSIX) == 0)
return;
fnd_it = 0;
mylfhp = NULL;
/*
* First, search to see if this lock owner is already in the list.
* If it is, then the associated process no longer exists.
*/
SLIST_FOREACH(lfhp, lhp, nfslfh_list) {
if (lfhp->nfslfh_len == lp->nfsl_open->nfso_fhlen &&
!NFSBCMP(lfhp->nfslfh_fh, lp->nfsl_open->nfso_fh,
lfhp->nfslfh_len))
mylfhp = lfhp;
LIST_FOREACH(nlp, &lfhp->nfslfh_lock, nfsl_list)
if (!NFSBCMP(nlp->nfsl_owner, lp->nfsl_owner,
NFSV4CL_LOCKNAMELEN))
fnd_it = 1;
}
/* If not found, check if process still exists. */
if (fnd_it == 0 && nfscl_procdoesntexist(lp->nfsl_owner) == 0)
return;
/* Move the lock owner over to the local list. */
if (mylfhp == NULL) {
mylfhp = malloc(sizeof(struct nfscllockownerfh), M_TEMP,
M_NOWAIT);
if (mylfhp == NULL)
return;
mylfhp->nfslfh_len = lp->nfsl_open->nfso_fhlen;
NFSBCOPY(lp->nfsl_open->nfso_fh, mylfhp->nfslfh_fh,
mylfhp->nfslfh_len);
LIST_INIT(&mylfhp->nfslfh_lock);
SLIST_INSERT_HEAD(lhp, mylfhp, nfslfh_list);
}
LIST_REMOVE(lp, nfsl_list);
LIST_INSERT_HEAD(&mylfhp->nfslfh_lock, lp, nfsl_list);
}
static int fake_global; /* Used to force visibility of MNTK_UNMOUNTF */
/*
* Called from nfs umount to free up the clientid.
*/
APPLESTATIC void
nfscl_umount(struct nfsmount *nmp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct ucred *cred;
int igotlock;
/*
* For the case that matters, this is the thread that set
* MNTK_UNMOUNTF, so it will see it set. The code that follows is
* done to ensure that any thread executing nfscl_getcl() after
* this time, will see MNTK_UNMOUNTF set. nfscl_getcl() uses the
* mutex for NFSLOCKCLSTATE(), so it is the mutex "m" in the following
* explanation, courtesy of Alan Cox.
* What follows is a snippet from Alan Cox's email at:
* http://docs.FreeBSD.org/cgi/
* mid.cgi?BANLkTikR3d65zPHo9==08ZfJ2vmqZucEvw
*
* 1. Set MNTK_UNMOUNTF
* 2. Acquire a standard FreeBSD mutex "m".
* 3. Update some data structures.
* 4. Release mutex "m".
*
* Then, other threads that acquire "m" after step 4 has occurred will
* see MNTK_UNMOUNTF as set. But, other threads that beat thread X to
* step 2 may or may not see MNTK_UNMOUNTF as set.
*/
NFSLOCKCLSTATE();
if ((nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) != 0) {
fake_global++;
NFSUNLOCKCLSTATE();
NFSLOCKCLSTATE();
}
clp = nmp->nm_clp;
if (clp != NULL) {
if ((clp->nfsc_flags & NFSCLFLAGS_INITED) == 0)
panic("nfscl umount");
/*
* First, handshake with the nfscl renew thread, to terminate
* it.
*/
clp->nfsc_flags |= NFSCLFLAGS_UMOUNT;
while (clp->nfsc_flags & NFSCLFLAGS_HASTHREAD)
(void)mtx_sleep(clp, NFSCLSTATEMUTEXPTR, PWAIT,
"nfsclumnt", hz);
/*
* Now, get the exclusive lock on the client state, so
* that no uses of the state are still in progress.
*/
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKCLSTATE();
/*
* Free up all the state. It will expire on the server, but
* maybe we should do a SetClientId/SetClientIdConfirm so
* the server throws it away?
*/
LIST_REMOVE(clp, nfsc_list);
nfscl_delegreturnall(clp, p);
cred = newnfs_getcred();
if (NFSHASNFSV4N(nmp)) {
(void)nfsrpc_destroysession(nmp, clp, cred, p);
(void)nfsrpc_destroyclient(nmp, clp, cred, p);
} else
(void)nfsrpc_setclient(nmp, clp, 0, cred, p);
nfscl_cleanclient(clp);
nmp->nm_clp = NULL;
NFSFREECRED(cred);
free(clp, M_NFSCLCLIENT);
} else
NFSUNLOCKCLSTATE();
}
/*
* This function is called when a server replies with NFSERR_STALECLIENTID
* NFSERR_STALESTATEID or NFSERR_BADSESSION. It traverses the clientid lists,
* doing Opens and Locks with reclaim. If these fail, it deletes the
* corresponding state.
*/
static void
nfscl_recover(struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op, *nop;
struct nfscllockowner *lp, *nlp;
struct nfscllock *lop, *nlop;
struct nfscldeleg *dp, *ndp, *tdp;
struct nfsmount *nmp;
struct ucred *tcred;
struct nfsclopenhead extra_open;
struct nfscldeleghead extra_deleg;
struct nfsreq *rep;
u_int64_t len;
u_int32_t delegtype = NFSV4OPEN_DELEGATEWRITE, mode;
int i, igotlock = 0, error, trycnt, firstlock;
struct nfscllayout *lyp, *nlyp;
/*
* First, lock the client structure, so everyone else will
* block when trying to use state.
*/
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECVRINPROG;
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKCLSTATE();
nmp = clp->nfsc_nmp;
if (nmp == NULL)
panic("nfscl recover");
/*
* For now, just get rid of all layouts. There may be a need
* to do LayoutCommit Ops with reclaim == true later.
*/
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp)
nfscl_freelayout(lyp);
TAILQ_INIT(&clp->nfsc_layout);
for (i = 0; i < NFSCLLAYOUTHASHSIZE; i++)
LIST_INIT(&clp->nfsc_layouthash[i]);
trycnt = 5;
do {
error = nfsrpc_setclient(nmp, clp, 1, cred, p);
} while ((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trycnt > 0);
if (error) {
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~(NFSCLFLAGS_RECOVER |
NFSCLFLAGS_RECVRINPROG);
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return;
}
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
/*
* Mark requests already queued on the server, so that they don't
* initiate another recovery cycle. Any requests already in the
* queue that handle state information will have the old stale
* clientid/stateid and will get a NFSERR_STALESTATEID,
* NFSERR_STALECLIENTID or NFSERR_BADSESSION reply from the server.
* This will be translated to NFSERR_STALEDONTRECOVER when
* R_DONTRECOVER is set.
*/
NFSLOCKREQ();
TAILQ_FOREACH(rep, &nfsd_reqq, r_chain) {
if (rep->r_nmp == nmp)
rep->r_flags |= R_DONTRECOVER;
}
NFSUNLOCKREQ();
/*
* Now, mark all delegations "need reclaim".
*/
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list)
dp->nfsdl_flags |= NFSCLDL_NEEDRECLAIM;
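/*
* extra_open and extra_deleg collect any Opens and Delegations
* acquired during reclaim that turn out not to be needed; they are
* closed/returned once the reclaims are done.
*/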
TAILQ_INIT(&extra_deleg);
LIST_INIT(&extra_open);
/*
* Now traverse the state lists, doing Open and Lock Reclaims.
*/
tcred = newnfs_getcred();
owp = LIST_FIRST(&clp->nfsc_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
owp->nfsow_seqid = 0;
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL) {
nop = LIST_NEXT(op, nfso_list);
if (error != NFSERR_NOGRACE && error != NFSERR_BADSESSION) {
/* Search for a delegation to reclaim with the open */
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if (!(dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM))
continue;
if ((dp->nfsdl_flags & NFSCLDL_WRITE)) {
mode = NFSV4OPEN_ACCESSWRITE;
delegtype = NFSV4OPEN_DELEGATEWRITE;
} else {
mode = NFSV4OPEN_ACCESSREAD;
delegtype = NFSV4OPEN_DELEGATEREAD;
}
if ((op->nfso_mode & mode) == mode &&
op->nfso_fhlen == dp->nfsdl_fhlen &&
!NFSBCMP(op->nfso_fh, dp->nfsdl_fh, op->nfso_fhlen))
break;
}
ndp = dp;
if (dp == NULL)
delegtype = NFSV4OPEN_DELEGATENONE;
newnfs_copycred(&op->nfso_cred, tcred);
error = nfscl_tryopen(nmp, NULL, op->nfso_fh,
op->nfso_fhlen, op->nfso_fh, op->nfso_fhlen,
op->nfso_mode, op, NULL, 0, &ndp, 1, delegtype,
tcred, p);
if (!error) {
/* Handle any replied delegation */
if (ndp != NULL && ((ndp->nfsdl_flags & NFSCLDL_WRITE)
|| NFSMNT_RDONLY(nmp->nm_mountp))) {
if ((ndp->nfsdl_flags & NFSCLDL_WRITE))
mode = NFSV4OPEN_ACCESSWRITE;
else
mode = NFSV4OPEN_ACCESSREAD;
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if (!(dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM))
continue;
if ((op->nfso_mode & mode) == mode &&
op->nfso_fhlen == dp->nfsdl_fhlen &&
!NFSBCMP(op->nfso_fh, dp->nfsdl_fh,
op->nfso_fhlen)) {
dp->nfsdl_stateid = ndp->nfsdl_stateid;
dp->nfsdl_sizelimit = ndp->nfsdl_sizelimit;
dp->nfsdl_ace = ndp->nfsdl_ace;
dp->nfsdl_change = ndp->nfsdl_change;
dp->nfsdl_flags &= ~NFSCLDL_NEEDRECLAIM;
if ((ndp->nfsdl_flags & NFSCLDL_RECALL))
dp->nfsdl_flags |= NFSCLDL_RECALL;
FREE((caddr_t)ndp, M_NFSCLDELEG);
ndp = NULL;
break;
}
}
}
if (ndp != NULL)
TAILQ_INSERT_HEAD(&extra_deleg, ndp, nfsdl_list);
/* and reclaim all byte range locks */
lp = LIST_FIRST(&op->nfso_lock);
while (lp != NULL) {
nlp = LIST_NEXT(lp, nfsl_list);
lp->nfsl_seqid = 0;
firstlock = 1;
lop = LIST_FIRST(&lp->nfsl_lock);
while (lop != NULL) {
nlop = LIST_NEXT(lop, nfslo_list);
if (lop->nfslo_end == NFS64BITSSET)
len = NFS64BITSSET;
else
len = lop->nfslo_end - lop->nfslo_first;
error = nfscl_trylock(nmp, NULL,
op->nfso_fh, op->nfso_fhlen, lp,
firstlock, 1, lop->nfslo_first, len,
lop->nfslo_type, tcred, p);
if (error != 0)
nfscl_freelock(lop, 0);
else
firstlock = 0;
lop = nlop;
}
/* If no locks, but a lockowner, just delete it. */
if (LIST_EMPTY(&lp->nfsl_lock))
nfscl_freelockowner(lp, 0);
lp = nlp;
}
}
}
if (error != 0 && error != NFSERR_BADSESSION)
nfscl_freeopen(op, 0);
op = nop;
}
owp = nowp;
}
/*
* Now, try to get any delegations not yet reclaimed by cobbling
* together an appropriate open.
*/
nowp = NULL;
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
if ((dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM)) {
if (nowp == NULL) {
MALLOC(nowp, struct nfsclowner *,
sizeof (struct nfsclowner), M_NFSCLOWNER, M_WAITOK);
/*
* The name must be as long as the largest possible
* NFSV4CL_LOCKNAMELEN (12 for now).
*/
NFSBCOPY("RECLAIMDELEG", nowp->nfsow_owner,
NFSV4CL_LOCKNAMELEN);
LIST_INIT(&nowp->nfsow_open);
nowp->nfsow_clp = clp;
nowp->nfsow_seqid = 0;
nowp->nfsow_defunct = 0;
nfscl_lockinit(&nowp->nfsow_rwlock);
}
nop = NULL;
if (error != NFSERR_NOGRACE && error != NFSERR_BADSESSION) {
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
dp->nfsdl_fhlen - 1, M_NFSCLOPEN, M_WAITOK);
nop->nfso_own = nowp;
if ((dp->nfsdl_flags & NFSCLDL_WRITE)) {
nop->nfso_mode = NFSV4OPEN_ACCESSWRITE;
delegtype = NFSV4OPEN_DELEGATEWRITE;
} else {
nop->nfso_mode = NFSV4OPEN_ACCESSREAD;
delegtype = NFSV4OPEN_DELEGATEREAD;
}
nop->nfso_opencnt = 0;
nop->nfso_posixlock = 1;
nop->nfso_fhlen = dp->nfsdl_fhlen;
NFSBCOPY(dp->nfsdl_fh, nop->nfso_fh, dp->nfsdl_fhlen);
LIST_INIT(&nop->nfso_lock);
nop->nfso_stateid.seqid = 0;
nop->nfso_stateid.other[0] = 0;
nop->nfso_stateid.other[1] = 0;
nop->nfso_stateid.other[2] = 0;
newnfs_copycred(&dp->nfsdl_cred, tcred);
newnfs_copyincred(tcred, &nop->nfso_cred);
tdp = NULL;
error = nfscl_tryopen(nmp, NULL, nop->nfso_fh,
nop->nfso_fhlen, nop->nfso_fh, nop->nfso_fhlen,
nop->nfso_mode, nop, NULL, 0, &tdp, 1,
delegtype, tcred, p);
if (tdp != NULL) {
if ((tdp->nfsdl_flags & NFSCLDL_WRITE))
mode = NFSV4OPEN_ACCESSWRITE;
else
mode = NFSV4OPEN_ACCESSREAD;
if ((nop->nfso_mode & mode) == mode &&
nop->nfso_fhlen == tdp->nfsdl_fhlen &&
!NFSBCMP(nop->nfso_fh, tdp->nfsdl_fh,
nop->nfso_fhlen)) {
dp->nfsdl_stateid = tdp->nfsdl_stateid;
dp->nfsdl_sizelimit = tdp->nfsdl_sizelimit;
dp->nfsdl_ace = tdp->nfsdl_ace;
dp->nfsdl_change = tdp->nfsdl_change;
dp->nfsdl_flags &= ~NFSCLDL_NEEDRECLAIM;
if ((tdp->nfsdl_flags & NFSCLDL_RECALL))
dp->nfsdl_flags |= NFSCLDL_RECALL;
FREE((caddr_t)tdp, M_NFSCLDELEG);
} else {
TAILQ_INSERT_HEAD(&extra_deleg, tdp, nfsdl_list);
}
}
}
if (error) {
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
/*
* Couldn't reclaim it, so throw the state
* away. Ouch!!
*/
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
} else {
LIST_INSERT_HEAD(&extra_open, nop, nfso_list);
}
}
dp = ndp;
}
/*
* Now, get rid of extra Opens and Delegations.
*/
LIST_FOREACH_SAFE(op, &extra_open, nfso_list, nop) {
do {
newnfs_copycred(&op->nfso_cred, tcred);
error = nfscl_tryclose(op, tcred, nmp, p);
if (error == NFSERR_GRACE)
(void) nfs_catnap(PZERO, error, "nfsexcls");
} while (error == NFSERR_GRACE);
LIST_REMOVE(op, nfso_list);
FREE((caddr_t)op, M_NFSCLOPEN);
}
if (nowp != NULL)
FREE((caddr_t)nowp, M_NFSCLOWNER);
TAILQ_FOREACH_SAFE(dp, &extra_deleg, nfsdl_list, ndp) {
do {
newnfs_copycred(&dp->nfsdl_cred, tcred);
error = nfscl_trydelegreturn(dp, tcred, nmp, p);
if (error == NFSERR_GRACE)
(void) nfs_catnap(PZERO, error, "nfsexdlg");
} while (error == NFSERR_GRACE);
TAILQ_REMOVE(&extra_deleg, dp, nfsdl_list);
FREE((caddr_t)dp, M_NFSCLDELEG);
}
/* For NFSv4.1 or later, do a RECLAIM_COMPLETE. */
if (NFSHASNFSV4N(nmp))
(void)nfsrpc_reclaimcomplete(nmp, cred, p);
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECVRINPROG;
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
NFSFREECRED(tcred);
}
/*
* This function is called when a server replies with NFSERR_EXPIRED.
* It deletes all state for the client and does a fresh SetClientId/confirm.
* XXX Someday it should post a signal to the process(es) that hold the
* state, so they know that lock state has been lost.
*/
APPLESTATIC int
nfscl_hasexpired(struct nfsclclient *clp, u_int32_t clidrev, NFSPROC_T *p)
{
struct nfsmount *nmp;
struct ucred *cred;
int igotlock = 0, error, trycnt;
/*
* If the clientid has gone away or a new SetClientid has already
* been done, just return ok.
*/
if (clp == NULL || clidrev != clp->nfsc_clientidrev)
return (0);
/*
* First, lock the client structure, so everyone else will
* block when trying to use state. Also, use NFSCLFLAGS_EXPIREIT so
* that only one thread does the work.
*/
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_EXPIREIT;
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock && (clp->nfsc_flags & NFSCLFLAGS_EXPIREIT));
if ((clp->nfsc_flags & NFSCLFLAGS_EXPIREIT) == 0) {
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (0);
}
clp->nfsc_flags |= NFSCLFLAGS_RECVRINPROG;
NFSUNLOCKCLSTATE();
nmp = clp->nfsc_nmp;
if (nmp == NULL)
panic("nfscl expired");
cred = newnfs_getcred();
trycnt = 5;
do {
error = nfsrpc_setclient(nmp, clp, 0, cred, p);
} while ((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trycnt > 0);
if (error) {
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
} else {
/*
* Expire the state for the client.
*/
nfscl_expireclient(clp, nmp, cred, p);
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
}
clp->nfsc_flags &= ~(NFSCLFLAGS_EXPIREIT | NFSCLFLAGS_RECVRINPROG);
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
NFSFREECRED(cred);
return (error);
}
/*
* This function inserts a lock in the list after insert_lop.
*/
static void
nfscl_insertlock(struct nfscllockowner *lp, struct nfscllock *new_lop,
struct nfscllock *insert_lop, int local)
{
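/*
* insert_lop may be the lock owner itself (cast to a lock pointer),
* which means the new lock goes at the head of the list.
*/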
if ((struct nfscllockowner *)insert_lop == lp)
LIST_INSERT_HEAD(&lp->nfsl_lock, new_lop, nfslo_list);
else
LIST_INSERT_AFTER(insert_lop, new_lop, nfslo_list);
if (local)
nfsstatsv1.cllocallocks++;
else
nfsstatsv1.cllocks++;
}
/*
* This function updates the locking for a lock owner and given file. It
* maintains a list of lock ranges ordered on increasing file offset that
* are NFSCLLOCK_READ or NFSCLLOCK_WRITE and non-overlapping (aka POSIX style).
* It always adds new_lop to the list and sometimes uses the one pointed
* at by other_lopp.
* Returns 1 if the locks were modified, 0 otherwise.
*/
static int
nfscl_updatelock(struct nfscllockowner *lp, struct nfscllock **new_lopp,
struct nfscllock **other_lopp, int local)
{
struct nfscllock *new_lop = *new_lopp;
struct nfscllock *lop, *tlop, *ilop;
struct nfscllock *other_lop;
int unlock = 0, modified = 0;
u_int64_t tmp;
/*
* Work down the list until the lock is merged.
*/
if (new_lop->nfslo_type == F_UNLCK)
unlock = 1;
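/*
* ilop tracks the insertion point. It starts as the lock owner itself
* (cast), which nfscl_insertlock() treats as "insert at the head".
*/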
ilop = (struct nfscllock *)lp;
lop = LIST_FIRST(&lp->nfsl_lock);
while (lop != NULL) {
/*
* Only check locks for this file that aren't before the start of
* the new lock's range.
*/
if (lop->nfslo_end >= new_lop->nfslo_first) {
if (new_lop->nfslo_end < lop->nfslo_first) {
/*
* If the new lock ends before the start of the
* current lock's range, no merge, just insert
* the new lock.
*/
break;
}
if (new_lop->nfslo_type == lop->nfslo_type ||
(new_lop->nfslo_first <= lop->nfslo_first &&
new_lop->nfslo_end >= lop->nfslo_end)) {
/*
* This lock can be absorbed by the new lock/unlock.
* This happens when it covers the entire range
* of the old lock or is contiguous
* with the old lock and is of the same type or an
* unlock.
*/
if (new_lop->nfslo_type != lop->nfslo_type ||
new_lop->nfslo_first != lop->nfslo_first ||
new_lop->nfslo_end != lop->nfslo_end)
modified = 1;
if (lop->nfslo_first < new_lop->nfslo_first)
new_lop->nfslo_first = lop->nfslo_first;
if (lop->nfslo_end > new_lop->nfslo_end)
new_lop->nfslo_end = lop->nfslo_end;
tlop = lop;
lop = LIST_NEXT(lop, nfslo_list);
nfscl_freelock(tlop, local);
continue;
}
/*
* All these cases are for contiguous locks that are not the
* same type, so they can't be merged.
*/
if (new_lop->nfslo_first <= lop->nfslo_first) {
/*
* This case is where the new lock overlaps with the
* first part of the old lock. Move the start of the
* old lock to just past the end of the new lock. The
* new lock will be inserted in front of the old, since
* ilop hasn't been updated. (We are done now.)
*/
if (lop->nfslo_first != new_lop->nfslo_end) {
lop->nfslo_first = new_lop->nfslo_end;
modified = 1;
}
break;
}
if (new_lop->nfslo_end >= lop->nfslo_end) {
/*
* This case is where the new lock overlaps with the
* end of the old lock's range. Move the old lock's
* end to just before the new lock's first and insert
* the new lock after the old lock.
* Might not be done yet, since the new lock could
* overlap further locks with higher ranges.
*/
if (lop->nfslo_end != new_lop->nfslo_first) {
lop->nfslo_end = new_lop->nfslo_first;
modified = 1;
}
ilop = lop;
lop = LIST_NEXT(lop, nfslo_list);
continue;
}
/*
* The final case is where the new lock's range is in the
* middle of the current lock's and splits the current lock
* up. Use *other_lopp to handle the second part of the
* split old lock range. (We are done now.)
* For unlock, we use new_lop as other_lop and tmp, since
* other_lop and new_lop are the same for this case.
* We noted the unlock case above, so we don't need
* new_lop->nfslo_type any longer.
*/
tmp = new_lop->nfslo_first;
if (unlock) {
other_lop = new_lop;
*new_lopp = NULL;
} else {
other_lop = *other_lopp;
*other_lopp = NULL;
}
other_lop->nfslo_first = new_lop->nfslo_end;
other_lop->nfslo_end = lop->nfslo_end;
other_lop->nfslo_type = lop->nfslo_type;
lop->nfslo_end = tmp;
nfscl_insertlock(lp, other_lop, lop, local);
ilop = lop;
modified = 1;
break;
}
ilop = lop;
lop = LIST_NEXT(lop, nfslo_list);
if (lop == NULL)
break;
}
/*
* Insert the new lock in the list at the appropriate place.
*/
if (!unlock) {
nfscl_insertlock(lp, new_lop, ilop, local);
*new_lopp = NULL;
modified = 1;
}
return (modified);
}
/*
* This function must be run as a kernel thread.
* It does Renew Ops and recovery, when required.
*/
APPLESTATIC void
nfscl_renewthread(struct nfsclclient *clp, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscllockowner *lp, *nlp;
struct nfscldeleghead dh;
struct nfscldeleg *dp, *ndp;
struct ucred *cred;
u_int32_t clidrev;
int error, cbpathdown, islept, igotlock, ret, clearok;
uint32_t recover_done_time = 0;
time_t mytime;
static time_t prevsec = 0;
struct nfscllockownerfh *lfhp, *nlfhp;
struct nfscllockownerfhhead lfh;
struct nfscllayout *lyp, *nlyp;
struct nfscldevinfo *dip, *ndip;
struct nfscllayouthead rlh;
struct nfsclrecalllayout *recallp;
struct nfsclds *dsp;
cred = newnfs_getcred();
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_HASTHREAD;
NFSUNLOCKCLSTATE();
for(;;) {
newnfs_setroot(cred);
cbpathdown = 0;
if (clp->nfsc_flags & NFSCLFLAGS_RECOVER) {
/*
* Only allow one recovery within 1/2 of the lease
* duration (nfsc_renew).
*/
if (recover_done_time < NFSD_MONOSEC) {
recover_done_time = NFSD_MONOSEC +
clp->nfsc_renew;
NFSCL_DEBUG(1, "Doing recovery..\n");
nfscl_recover(clp, cred, p);
} else {
NFSCL_DEBUG(1, "Clear Recovery dt=%u ms=%jd\n",
recover_done_time, (intmax_t)NFSD_MONOSEC);
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
}
}
if (clp->nfsc_expire <= NFSD_MONOSEC &&
(clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID)) {
clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew;
clidrev = clp->nfsc_clientidrev;
error = nfsrpc_renew(clp, NULL, cred, p);
if (error == NFSERR_CBPATHDOWN)
cbpathdown = 1;
else if (error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION) {
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
} else if (error == NFSERR_EXPIRED)
(void) nfscl_hasexpired(clp, clidrev, p);
}
checkdsrenew:
if (NFSHASNFSV4N(clp->nfsc_nmp)) {
/* Do renews for any DS sessions. */
NFSLOCKMNT(clp->nfsc_nmp);
/* Skip first entry, since the MDS is handled above. */
dsp = TAILQ_FIRST(&clp->nfsc_nmp->nm_sess);
if (dsp != NULL)
dsp = TAILQ_NEXT(dsp, nfsclds_list);
while (dsp != NULL) {
if (dsp->nfsclds_expire <= NFSD_MONOSEC &&
dsp->nfsclds_sess.nfsess_defunct == 0) {
dsp->nfsclds_expire = NFSD_MONOSEC +
clp->nfsc_renew;
NFSUNLOCKMNT(clp->nfsc_nmp);
(void)nfsrpc_renew(clp, dsp, cred, p);
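/*
* The mount lock was dropped for the RPC, so restart
* the scan of the session list.
*/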
goto checkdsrenew;
}
dsp = TAILQ_NEXT(dsp, nfsclds_list);
}
NFSUNLOCKMNT(clp->nfsc_nmp);
}
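/*
* dh collects delegations that are recalled or cleaned out below, so
* they can be DelegReturn'd after the state lock is dropped.
*/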
TAILQ_INIT(&dh);
NFSLOCKCLSTATE();
if (cbpathdown)
/* It's a Total Recall! */
nfscl_totalrecall(clp);
/*
* Now, handle defunct owners.
*/
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
if (LIST_EMPTY(&owp->nfsow_open)) {
if (owp->nfsow_defunct != 0)
nfscl_freeopenowner(owp, 0);
}
}
/*
* Do the recall on any delegations. To avoid trouble, always
* come back up here after having slept.
*/
igotlock = 0;
tryagain:
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
if ((dp->nfsdl_flags & NFSCLDL_RECALL)) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |=
NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld",
NULL);
goto tryagain;
}
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
goto tryagain;
}
NFSUNLOCKCLSTATE();
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_recalldeleg(clp, clp->nfsc_nmp, dp,
NULL, cred, p, 1);
if (!ret) {
nfscl_cleandeleg(dp);
TAILQ_REMOVE(&clp->nfsc_deleg, dp,
nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list);
nfscl_delegcnt--;
nfsstatsv1.cldelegates--;
}
NFSLOCKCLSTATE();
}
dp = ndp;
}
/*
* Clear out old delegations, if we are above the high water
* mark. Only clear out ones with no state related to them.
* The tailq list is in LRU order.
*/
dp = TAILQ_LAST(&clp->nfsc_deleg, nfscldeleghead);
while (nfscl_delegcnt > nfscl_deleghighwater && dp != NULL) {
ndp = TAILQ_PREV(dp, nfscldeleghead, nfsdl_list);
if (dp->nfsdl_rwlock.nfslock_usecnt == 0 &&
dp->nfsdl_rwlock.nfslock_lock == 0 &&
dp->nfsdl_timestamp < NFSD_MONOSEC &&
(dp->nfsdl_flags & (NFSCLDL_RECALL | NFSCLDL_ZAPPED |
NFSCLDL_NEEDRECLAIM | NFSCLDL_DELEGRET)) == 0) {
clearok = 1;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
clearok = 0;
break;
}
}
if (clearok) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
clearok = 0;
break;
}
}
}
if (clearok) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list);
nfscl_delegcnt--;
nfsstatsv1.cldelegates--;
}
}
dp = ndp;
}
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
/*
* Do the recall on any layouts. To avoid trouble, always
* come back up here after having slept.
*/
TAILQ_INIT(&rlh);
tryagain2:
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp) {
if ((lyp->nfsly_flags & NFSLY_RECALL) != 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (lyp->nfsly_lock.nfslock_usecnt > 0 ||
(lyp->nfsly_lock.nfslock_lock &
NFSV4LOCK_LOCK) != 0) {
lyp->nfsly_lock.nfslock_lock |=
NFSV4LOCK_WANTED;
(void)nfsmsleep(&lyp->nfsly_lock,
NFSCLSTATEMUTEXPTR, PZERO, "nfslyp",
NULL);
goto tryagain2;
}
/* Move the layout to the recall list. */
TAILQ_REMOVE(&clp->nfsc_layout, lyp,
nfsly_list);
LIST_REMOVE(lyp, nfsly_hash);
TAILQ_INSERT_HEAD(&rlh, lyp, nfsly_list);
/* Handle any layout commits. */
if (!NFSHASNOLAYOUTCOMMIT(clp->nfsc_nmp) &&
(lyp->nfsly_flags & NFSLY_WRITTEN) != 0) {
lyp->nfsly_flags &= ~NFSLY_WRITTEN;
NFSUNLOCKCLSTATE();
NFSCL_DEBUG(3, "do layoutcommit\n");
nfscl_dolayoutcommit(clp->nfsc_nmp, lyp,
cred, p);
NFSLOCKCLSTATE();
goto tryagain2;
}
}
}
/* Now, look for stale layouts. */
lyp = TAILQ_LAST(&clp->nfsc_layout, nfscllayouthead);
while (lyp != NULL) {
nlyp = TAILQ_PREV(lyp, nfscllayouthead, nfsly_list);
if (lyp->nfsly_timestamp < NFSD_MONOSEC &&
(lyp->nfsly_flags & NFSLY_RECALL) == 0 &&
lyp->nfsly_lock.nfslock_usecnt == 0 &&
lyp->nfsly_lock.nfslock_lock == 0) {
NFSCL_DEBUG(4, "ret stale lay=%d\n",
nfscl_layoutcnt);
recallp = malloc(sizeof(*recallp),
M_NFSLAYRECALL, M_NOWAIT);
if (recallp == NULL)
break;
(void)nfscl_layoutrecall(NFSLAYOUTRETURN_FILE,
lyp, NFSLAYOUTIOMODE_ANY, 0, UINT64_MAX,
lyp->nfsly_stateid.seqid, recallp);
}
lyp = nlyp;
}
/*
* Free up any unreferenced device info structures.
*/
LIST_FOREACH_SAFE(dip, &clp->nfsc_devinfo, nfsdi_list, ndip) {
if (dip->nfsdi_layoutrefs == 0 &&
dip->nfsdi_refcnt == 0) {
NFSCL_DEBUG(4, "freeing devinfo\n");
LIST_REMOVE(dip, nfsdi_list);
nfscl_freedevinfo(dip);
}
}
NFSUNLOCKCLSTATE();
/* Do layout return(s), as required. */
TAILQ_FOREACH_SAFE(lyp, &rlh, nfsly_list, nlyp) {
TAILQ_REMOVE(&rlh, lyp, nfsly_list);
NFSCL_DEBUG(4, "ret layout\n");
nfscl_layoutreturn(clp->nfsc_nmp, lyp, cred, p);
nfscl_freelayout(lyp);
}
/*
* Delegreturn any delegations cleaned out or recalled.
*/
TAILQ_FOREACH_SAFE(dp, &dh, nfsdl_list, ndp) {
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p);
TAILQ_REMOVE(&dh, dp, nfsdl_list);
FREE((caddr_t)dp, M_NFSCLDELEG);
}
SLIST_INIT(&lfh);
/*
* Call nfscl_cleanupkext() once per second to check for
* open/lock owners where the process has exited.
*/
mytime = NFSD_MONOSEC;
if (prevsec != mytime) {
prevsec = mytime;
nfscl_cleanupkext(clp, &lfh);
}
/*
* Do a ReleaseLockOwner for all lock owners where the
* associated process no longer exists, as found by
* nfscl_cleanupkext().
*/
newnfs_setroot(cred);
SLIST_FOREACH_SAFE(lfhp, &lfh, nfslfh_list, nlfhp) {
LIST_FOREACH_SAFE(lp, &lfhp->nfslfh_lock, nfsl_list,
nlp) {
(void)nfsrpc_rellockown(clp->nfsc_nmp, lp,
lfhp->nfslfh_fh, lfhp->nfslfh_len, cred,
p);
nfscl_freelockowner(lp, 0);
}
free(lfhp, M_TEMP);
}
SLIST_INIT(&lfh);
NFSLOCKCLSTATE();
if ((clp->nfsc_flags & NFSCLFLAGS_RECOVER) == 0)
(void)mtx_sleep(clp, NFSCLSTATEMUTEXPTR, PWAIT, "nfscl",
hz);
if (clp->nfsc_flags & NFSCLFLAGS_UMOUNT) {
clp->nfsc_flags &= ~NFSCLFLAGS_HASTHREAD;
NFSUNLOCKCLSTATE();
NFSFREECRED(cred);
wakeup((caddr_t)clp);
return;
}
NFSUNLOCKCLSTATE();
}
}
/*
* Initiate state recovery. Called when NFSERR_STALECLIENTID,
* NFSERR_STALESTATEID or NFSERR_BADSESSION is received.
*/
APPLESTATIC void
nfscl_initiate_recovery(struct nfsclclient *clp)
{
if (clp == NULL)
return;
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
wakeup((caddr_t)clp);
}
/*
* Dump out the state stuff for debugging.
*/
APPLESTATIC void
nfscl_dumpstate(struct nfsmount *nmp, int openowner, int opens,
int lockowner, int locks)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllockowner *lp;
struct nfscllock *lop;
struct nfscldeleg *dp;
clp = nmp->nm_clp;
if (clp == NULL) {
printf("nfscl dumpstate NULL clp\n");
return;
}
NFSLOCKCLSTATE();
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (openowner && !LIST_EMPTY(&owp->nfsow_open))
printf("owner=0x%x 0x%x 0x%x 0x%x seqid=%d\n",
owp->nfsow_owner[0], owp->nfsow_owner[1],
owp->nfsow_owner[2], owp->nfsow_owner[3],
owp->nfsow_seqid);
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (opens)
printf("open st=0x%x 0x%x 0x%x cnt=%d fh12=0x%x\n",
op->nfso_stateid.other[0], op->nfso_stateid.other[1],
op->nfso_stateid.other[2], op->nfso_opencnt,
op->nfso_fh[12]);
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lockowner)
printf("lckown=0x%x 0x%x 0x%x 0x%x seqid=%d st=0x%x 0x%x 0x%x\n",
lp->nfsl_owner[0], lp->nfsl_owner[1],
lp->nfsl_owner[2], lp->nfsl_owner[3],
lp->nfsl_seqid,
lp->nfsl_stateid.other[0], lp->nfsl_stateid.other[1],
lp->nfsl_stateid.other[2]);
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (locks)
#ifdef __FreeBSD__
printf("lck typ=%d fst=%ju end=%ju\n",
lop->nfslo_type, (intmax_t)lop->nfslo_first,
(intmax_t)lop->nfslo_end);
#else
printf("lck typ=%d fst=%qd end=%qd\n",
lop->nfslo_type, lop->nfslo_first,
lop->nfslo_end);
#endif
}
}
}
}
}
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (openowner && !LIST_EMPTY(&owp->nfsow_open))
printf("owner=0x%x 0x%x 0x%x 0x%x seqid=%d\n",
owp->nfsow_owner[0], owp->nfsow_owner[1],
owp->nfsow_owner[2], owp->nfsow_owner[3],
owp->nfsow_seqid);
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (opens)
printf("open st=0x%x 0x%x 0x%x cnt=%d fh12=0x%x\n",
op->nfso_stateid.other[0], op->nfso_stateid.other[1],
op->nfso_stateid.other[2], op->nfso_opencnt,
op->nfso_fh[12]);
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lockowner)
printf("lckown=0x%x 0x%x 0x%x 0x%x seqid=%d st=0x%x 0x%x 0x%x\n",
lp->nfsl_owner[0], lp->nfsl_owner[1],
lp->nfsl_owner[2], lp->nfsl_owner[3],
lp->nfsl_seqid,
lp->nfsl_stateid.other[0], lp->nfsl_stateid.other[1],
lp->nfsl_stateid.other[2]);
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (locks)
#ifdef __FreeBSD__
printf("lck typ=%d fst=%ju end=%ju\n",
lop->nfslo_type, (intmax_t)lop->nfslo_first,
(intmax_t)lop->nfslo_end);
#else
printf("lck typ=%d fst=%qd end=%qd\n",
lop->nfslo_type, lop->nfslo_first,
lop->nfslo_end);
#endif
}
}
}
}
NFSUNLOCKCLSTATE();
}
/*
* Check for duplicate open owners and opens.
* (Only used as a diagnostic aid.)
*/
APPLESTATIC void
nfscl_dupopen(vnode_t vp, int dupopens)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *owp2;
struct nfsclopen *op, *op2;
struct nfsfh *nfhp;
clp = VFSTONFS(vnode_mount(vp))->nm_clp;
if (clp == NULL) {
printf("nfscl dupopen NULL clp\n");
return;
}
nfhp = VTONFS(vp)->n_fhp;
NFSLOCKCLSTATE();
/*
* First, search for duplicate owners.
* These should never happen!
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (owp != owp2 &&
!NFSBCMP(owp->nfsow_owner, owp2->nfsow_owner,
NFSV4CL_LOCKNAMELEN)) {
NFSUNLOCKCLSTATE();
printf("DUP OWNER\n");
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1, 0, 0);
return;
}
}
}
/*
* Now, search for duplicate stateids.
* These shouldn't happen, either.
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op2, &owp2->nfsow_open, nfso_list) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op != op2 &&
(op->nfso_stateid.other[0] != 0 ||
op->nfso_stateid.other[1] != 0 ||
op->nfso_stateid.other[2] != 0) &&
op->nfso_stateid.other[0] == op2->nfso_stateid.other[0] &&
op->nfso_stateid.other[1] == op2->nfso_stateid.other[1] &&
op->nfso_stateid.other[2] == op2->nfso_stateid.other[2]) {
NFSUNLOCKCLSTATE();
printf("DUP STATEID\n");
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1, 0,
0);
return;
}
}
}
}
}
/*
* Now search for duplicate opens.
* Duplicate opens for the same owner
* should never occur. Other duplicates are
* possible and are checked for if "dupopens"
* is true.
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op2, &owp2->nfsow_open, nfso_list) {
if (nfhp->nfh_len == op2->nfso_fhlen &&
!NFSBCMP(nfhp->nfh_fh, op2->nfso_fh, nfhp->nfh_len)) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op != op2 && nfhp->nfh_len == op->nfso_fhlen &&
!NFSBCMP(nfhp->nfh_fh, op->nfso_fh, nfhp->nfh_len) &&
(!NFSBCMP(op->nfso_own->nfsow_owner,
op2->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN) ||
dupopens)) {
if (!NFSBCMP(op->nfso_own->nfsow_owner,
op2->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN)) {
NFSUNLOCKCLSTATE();
printf("BADDUP OPEN\n");
} else {
NFSUNLOCKCLSTATE();
printf("DUP OPEN\n");
}
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1,
0, 0);
return;
}
}
}
}
}
}
NFSUNLOCKCLSTATE();
}
/*
* During close, find an open that needs to be dereferenced and
* dereference it. If there are no more opens for this file,
* log a message to that effect.
* Opens aren't actually Close'd until VOP_INACTIVE() is performed
* on the file's vnode.
* This is the safe way, since it is difficult to identify
* which open the close is for and I/O can be performed after the
* close(2) system call when a file is mmap'd.
* If it returns 0 for success, there will be a referenced
* clp returned via clpp.
*/
APPLESTATIC int
nfscl_getclose(vnode_t vp, struct nfsclclient **clpp)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscldeleg *dp;
struct nfsfh *nfhp;
int error, notdecr;
error = nfscl_getcl(vnode_mount(vp), NULL, NULL, 1, &clp);
if (error)
return (error);
*clpp = clp;
nfhp = VTONFS(vp)->n_fhp;
notdecr = 1;
NFSLOCKCLSTATE();
/*
* First, look for one under a delegation that was locally issued
* and just decrement the opencnt for it. Since all my Opens against
* the server are DENY_NONE, I don't see a problem with hanging
* onto them. (It is much easier to use one of the extant Opens
* that I already have on the server when a Delegation is recalled
* than to do fresh Opens.) Someday, I might need to rethink this.
*/
dp = nfscl_finddeleg(clp, nfhp->nfh_fh, nfhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
/*
* Since a delegation is for a file, there
* should never be more than one open for
* each openowner.
*/
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfscdeleg opens");
if (notdecr && op->nfso_opencnt > 0) {
notdecr = 0;
op->nfso_opencnt--;
break;
}
}
}
}
/* Now process the opens against the server. */
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == nfhp->nfh_len &&
!NFSBCMP(op->nfso_fh, nfhp->nfh_fh,
nfhp->nfh_len)) {
/* Found an open, decrement cnt if possible */
if (notdecr && op->nfso_opencnt > 0) {
notdecr = 0;
op->nfso_opencnt--;
}
/*
* There are more opens, so just return.
*/
if (op->nfso_opencnt > 0) {
NFSUNLOCKCLSTATE();
return (0);
}
}
}
}
NFSUNLOCKCLSTATE();
if (notdecr)
printf("nfscl: never fnd open\n");
return (0);
}
APPLESTATIC int
nfscl_doclose(vnode_t vp, struct nfsclclient **clpp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscldeleg *dp;
struct nfsfh *nfhp;
struct nfsclrecalllayout *recallp;
int error;
error = nfscl_getcl(vnode_mount(vp), NULL, NULL, 1, &clp);
if (error)
return (error);
*clpp = clp;
nfhp = VTONFS(vp)->n_fhp;
recallp = malloc(sizeof(*recallp), M_NFSLAYRECALL, M_WAITOK);
NFSLOCKCLSTATE();
/*
* First get rid of the local Open structures, which should be no
* longer in use.
*/
dp = nfscl_finddeleg(clp, nfhp->nfh_fh, nfhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH_SAFE(owp, &dp->nfsdl_owner, nfsow_list, nowp) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
KASSERT((op->nfso_opencnt == 0),
("nfscl: bad open cnt on deleg"));
nfscl_freeopen(op, 1);
}
nfscl_freeopenowner(owp, 1);
}
}
/* Return any layouts marked return on close. */
nfscl_retoncloselayout(vp, clp, nfhp->nfh_fh, nfhp->nfh_len, &recallp);
/* Now process the opens against the server. */
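/*
* The scan restarts from the beginning after each close, since the
* state lock is dropped for the nfsrpc_doclose() RPC.
*/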
lookformore:
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL) {
if (op->nfso_fhlen == nfhp->nfh_len &&
!NFSBCMP(op->nfso_fh, nfhp->nfh_fh,
nfhp->nfh_len)) {
/* Found an open, close it. */
KASSERT((op->nfso_opencnt == 0),
("nfscl: bad open cnt on server"));
NFSUNLOCKCLSTATE();
nfsrpc_doclose(VFSTONFS(vnode_mount(vp)), op,
p);
NFSLOCKCLSTATE();
goto lookformore;
}
op = LIST_NEXT(op, nfso_list);
}
}
NFSUNLOCKCLSTATE();
/*
* recallp has been set NULL by nfscl_retoncloselayout() if it was
* used by the function, but calling free() with a NULL pointer is ok.
*/
free(recallp, M_NFSLAYRECALL);
return (0);
}
/*
* Return all delegations on this client.
* (Must be called with client sleep lock.)
*/
static void
nfscl_delegreturnall(struct nfsclclient *clp, NFSPROC_T *p)
{
struct nfscldeleg *dp, *ndp;
struct ucred *cred;
cred = newnfs_getcred();
TAILQ_FOREACH_SAFE(dp, &clp->nfsc_deleg, nfsdl_list, ndp) {
nfscl_cleandeleg(dp);
(void) nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
NFSFREECRED(cred);
}
/*
* Do a callback RPC.
*/
APPLESTATIC void
nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p)
{
int clist, gotseq_ok, i, j, k, op, rcalls;
u_int32_t *tl;
struct nfsclclient *clp;
struct nfscldeleg *dp = NULL;
int numops, taglen = -1, error = 0, trunc;
u_int32_t minorvers = 0, retops = 0, *retopsp = NULL, *repp, cbident;
u_char tag[NFSV4_SMALLSTR + 1], *tagstr;
vnode_t vp = NULL;
struct nfsnode *np;
struct vattr va;
struct nfsfh *nfhp;
mount_t mp;
nfsattrbit_t attrbits, rattrbits;
nfsv4stateid_t stateid;
uint32_t seqid, slotid = 0, highslot, cachethis;
uint8_t sessionid[NFSX_V4SESSIONID];
struct mbuf *rep;
struct nfscllayout *lyp;
uint64_t filesid[2], len, off;
int changed, gotone, laytype, recalltype;
uint32_t iomode;
struct nfsclrecalllayout *recallp = NULL;
struct nfsclsession *tsep;
gotseq_ok = 0;
nfsrvd_rephead(nd);
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
taglen = fxdr_unsigned(int, *tl);
if (taglen < 0) {
error = EBADRPC;
goto nfsmout;
}
if (taglen <= NFSV4_SMALLSTR)
tagstr = tag;
else
tagstr = malloc(taglen + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, tagstr, taglen);
if (error) {
if (taglen > NFSV4_SMALLSTR)
free(tagstr, M_TEMP);
taglen = -1;
goto nfsmout;
}
(void) nfsm_strtom(nd, tag, taglen);
if (taglen > NFSV4_SMALLSTR) {
free(tagstr, M_TEMP);
}
NFSM_BUILD(retopsp, u_int32_t *, NFSX_UNSIGNED);
NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
minorvers = fxdr_unsigned(u_int32_t, *tl++);
if (minorvers != NFSV4_MINORVERSION && minorvers != NFSV41_MINORVERSION)
nd->nd_repstat = NFSERR_MINORVERMISMATCH;
cbident = fxdr_unsigned(u_int32_t, *tl++);
if (nd->nd_repstat)
numops = 0;
else
numops = fxdr_unsigned(int, *tl);
/*
* Loop around doing the sub ops.
*/
for (i = 0; i < numops; i++) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
NFSM_BUILD(repp, u_int32_t *, 2 * NFSX_UNSIGNED);
*repp++ = *tl;
op = fxdr_unsigned(int, *tl);
if (op < NFSV4OP_CBGETATTR ||
(op > NFSV4OP_CBRECALL && minorvers == NFSV4_MINORVERSION) ||
(op > NFSV4OP_CBNOTIFYDEVID &&
minorvers == NFSV41_MINORVERSION)) {
nd->nd_repstat = NFSERR_OPILLEGAL;
*repp = nfscl_errmap(nd, minorvers);
retops++;
break;
}
nd->nd_procnum = op;
if (op < NFSV41_CBNOPS)
nfsstatsv1.cbrpccnt[nd->nd_procnum]++;
switch (op) {
case NFSV4OP_CBGETATTR:
NFSCL_DEBUG(4, "cbgetattr\n");
mp = NULL;
vp = NULL;
error = nfsm_getfh(nd, &nfhp);
if (!error)
error = nfsrv_getattrbits(nd, &attrbits,
NULL, NULL);
if (error == 0 && i == 0 &&
minorvers != NFSV4_MINORVERSION)
error = NFSERR_OPNOTINSESS;
if (!error) {
mp = nfscl_getmnt(minorvers, sessionid, cbident,
&clp);
if (mp == NULL)
error = NFSERR_SERVERFAULT;
}
if (!error) {
error = nfscl_ngetreopen(mp, nfhp->nfh_fh,
nfhp->nfh_len, p, &np);
if (!error)
vp = NFSTOV(np);
}
if (!error) {
NFSZERO_ATTRBIT(&rattrbits);
NFSLOCKCLSTATE();
dp = nfscl_finddeleg(clp, nfhp->nfh_fh,
nfhp->nfh_len);
if (dp != NULL) {
if (NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_SIZE)) {
if (vp != NULL)
va.va_size = np->n_size;
else
va.va_size =
dp->nfsdl_size;
NFSSETBIT_ATTRBIT(&rattrbits,
NFSATTRBIT_SIZE);
}
if (NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_CHANGE)) {
va.va_filerev =
dp->nfsdl_change;
if (vp == NULL ||
(np->n_flag & NDELEGMOD))
va.va_filerev++;
NFSSETBIT_ATTRBIT(&rattrbits,
NFSATTRBIT_CHANGE);
}
} else
error = NFSERR_SERVERFAULT;
NFSUNLOCKCLSTATE();
}
if (vp != NULL)
vrele(vp);
if (mp != NULL)
vfs_unbusy(mp);
if (nfhp != NULL)
FREE((caddr_t)nfhp, M_NFSFH);
if (!error)
(void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va,
NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0,
(uint64_t)0);
break;
case NFSV4OP_CBRECALL:
NFSCL_DEBUG(4, "cbrecall\n");
NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
NFSX_UNSIGNED);
stateid.seqid = *tl++;
NFSBCOPY((caddr_t)tl, (caddr_t)stateid.other,
NFSX_STATEIDOTHER);
tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
trunc = fxdr_unsigned(int, *tl);
error = nfsm_getfh(nd, &nfhp);
if (error == 0 && i == 0 &&
minorvers != NFSV4_MINORVERSION)
error = NFSERR_OPNOTINSESS;
if (!error) {
NFSLOCKCLSTATE();
if (minorvers == NFSV4_MINORVERSION)
clp = nfscl_getclnt(cbident);
else
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
dp = nfscl_finddeleg(clp, nfhp->nfh_fh,
nfhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags &
NFSCLDL_DELEGRET) == 0) {
dp->nfsdl_flags |=
NFSCLDL_RECALL;
wakeup((caddr_t)clp);
}
} else {
error = NFSERR_SERVERFAULT;
}
NFSUNLOCKCLSTATE();
}
if (nfhp != NULL)
FREE((caddr_t)nfhp, M_NFSFH);
break;
case NFSV4OP_CBLAYOUTRECALL:
NFSCL_DEBUG(4, "cblayrec\n");
nfhp = NULL;
NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED);
laytype = fxdr_unsigned(int, *tl++);
iomode = fxdr_unsigned(uint32_t, *tl++);
if (newnfs_true == *tl++)
changed = 1;
else
changed = 0;
recalltype = fxdr_unsigned(int, *tl);
recallp = malloc(sizeof(*recallp), M_NFSLAYRECALL,
M_WAITOK);
if (laytype != NFSLAYOUT_NFSV4_1_FILES)
error = NFSERR_NOMATCHLAYOUT;
else if (recalltype == NFSLAYOUTRETURN_FILE) {
error = nfsm_getfh(nd, &nfhp);
NFSCL_DEBUG(4, "retfile getfh=%d\n", error);
if (error != 0)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_HYPER +
NFSX_STATEID);
off = fxdr_hyper(tl); tl += 2;
len = fxdr_hyper(tl); tl += 2;
stateid.seqid = fxdr_unsigned(uint32_t, *tl++);
NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER);
if (minorvers == NFSV4_MINORVERSION)
error = NFSERR_NOTSUPP;
else if (i == 0)
error = NFSERR_OPNOTINSESS;
if (error == 0) {
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
NFSCL_DEBUG(4, "cbly clp=%p\n", clp);
if (clp != NULL) {
lyp = nfscl_findlayout(clp,
nfhp->nfh_fh,
nfhp->nfh_len);
NFSCL_DEBUG(4, "cblyp=%p\n",
lyp);
if (lyp != NULL &&
(lyp->nfsly_flags &
NFSLY_FILES) != 0 &&
!NFSBCMP(stateid.other,
lyp->nfsly_stateid.other,
NFSX_STATEIDOTHER)) {
error =
nfscl_layoutrecall(
recalltype,
lyp, iomode, off,
len, stateid.seqid,
recallp);
recallp = NULL;
wakeup(clp);
NFSCL_DEBUG(4,
"aft layrcal=%d\n",
error);
} else
error =
NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
}
free(nfhp, M_NFSFH);
} else if (recalltype == NFSLAYOUTRETURN_FSID) {
NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER);
filesid[0] = fxdr_hyper(tl); tl += 2;
filesid[1] = fxdr_hyper(tl); tl += 2;
gotone = 0;
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
TAILQ_FOREACH(lyp, &clp->nfsc_layout,
nfsly_list) {
if (lyp->nfsly_filesid[0] ==
filesid[0] &&
lyp->nfsly_filesid[1] ==
filesid[1]) {
error =
nfscl_layoutrecall(
recalltype,
lyp, iomode, 0,
UINT64_MAX,
lyp->nfsly_stateid.seqid,
recallp);
recallp = NULL;
gotone = 1;
}
}
if (gotone != 0)
wakeup(clp);
else
error = NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
} else if (recalltype == NFSLAYOUTRETURN_ALL) {
gotone = 0;
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
TAILQ_FOREACH(lyp, &clp->nfsc_layout,
nfsly_list) {
error = nfscl_layoutrecall(
recalltype, lyp, iomode, 0,
UINT64_MAX,
lyp->nfsly_stateid.seqid,
recallp);
recallp = NULL;
gotone = 1;
}
if (gotone != 0)
wakeup(clp);
else
error = NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
} else
error = NFSERR_NOMATCHLAYOUT;
if (recallp != NULL) {
free(recallp, M_NFSLAYRECALL);
recallp = NULL;
}
break;
case NFSV4OP_CBSEQUENCE:
NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
5 * NFSX_UNSIGNED);
bcopy(tl, sessionid, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
seqid = fxdr_unsigned(uint32_t, *tl++);
slotid = fxdr_unsigned(uint32_t, *tl++);
highslot = fxdr_unsigned(uint32_t, *tl++);
cachethis = *tl++;
/* Throw away the referring call stuff. */
clist = fxdr_unsigned(int, *tl);
for (j = 0; j < clist; j++) {
NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
NFSX_UNSIGNED);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
rcalls = fxdr_unsigned(int, *tl);
for (k = 0; k < rcalls; k++) {
NFSM_DISSECT(tl, uint32_t *,
2 * NFSX_UNSIGNED);
}
}
NFSLOCKCLSTATE();
if (i == 0) {
clp = nfscl_getclntsess(sessionid);
if (clp == NULL)
error = NFSERR_SERVERFAULT;
} else
error = NFSERR_SEQUENCEPOS;
if (error == 0) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
error = nfsv4_seqsession(seqid, slotid,
highslot, tsep->nfsess_cbslots, &rep,
tsep->nfsess_backslots);
}
NFSUNLOCKCLSTATE();
if (error == 0 || error == NFSERR_REPLYFROMCACHE) {
gotseq_ok = 1;
if (rep != NULL) {
/*
* Handle a reply for a retried
* callback. The reply will be
* re-inserted in the session cache
* by the nfsv4_seqsess_cacherep() call
* after out:
*/
KASSERT(error == NFSERR_REPLYFROMCACHE,
("cbsequence: non-NULL rep"));
NFSCL_DEBUG(4, "Got cbretry\n");
m_freem(nd->nd_mreq);
nd->nd_mreq = rep;
rep = NULL;
goto out;
}
NFSM_BUILD(tl, uint32_t *,
NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED);
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
*tl++ = txdr_unsigned(seqid);
*tl++ = txdr_unsigned(slotid);
*tl++ = txdr_unsigned(NFSV4_CBSLOTS - 1);
*tl = txdr_unsigned(NFSV4_CBSLOTS - 1);
}
break;
default:
if (i == 0 && minorvers == NFSV41_MINORVERSION)
error = NFSERR_OPNOTINSESS;
else {
NFSCL_DEBUG(1, "unsupp callback %d\n", op);
error = NFSERR_NOTSUPP;
}
break;
}
if (error) {
if (error == EBADRPC || error == NFSERR_BADXDR) {
nd->nd_repstat = NFSERR_BADXDR;
} else {
nd->nd_repstat = error;
}
error = 0;
}
retops++;
if (nd->nd_repstat) {
*repp = nfscl_errmap(nd, minorvers);
break;
} else
*repp = 0; /* NFS4_OK */
}
nfsmout:
if (recallp != NULL)
free(recallp, M_NFSLAYRECALL);
if (error) {
if (error == EBADRPC || error == NFSERR_BADXDR)
nd->nd_repstat = NFSERR_BADXDR;
else
printf("nfsv4 comperr1=%d\n", error);
}
if (taglen == -1) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = 0;
} else {
*retopsp = txdr_unsigned(retops);
}
*nd->nd_errp = nfscl_errmap(nd, minorvers);
out:
if (gotseq_ok != 0) {
rep = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
nfsv4_seqsess_cacherep(slotid, tsep->nfsess_cbslots,
NFSERR_OK, &rep);
NFSUNLOCKCLSTATE();
} else {
NFSUNLOCKCLSTATE();
m_freem(rep);
}
}
}
/*
* Generate the next cbident value. Basically just increment a static value
* and then check that it isn't already in the list, if it has wrapped around.
*/
static u_int32_t
nfscl_nextcbident(void)
{
struct nfsclclient *clp;
int matched;
static u_int32_t nextcbident = 0;
static int haswrapped = 0;
nextcbident++;
if (nextcbident == 0)
haswrapped = 1;
if (haswrapped) {
/*
* Search the clientid list for one already using this cbident.
*/
do {
matched = 0;
NFSLOCKCLSTATE();
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
if (clp->nfsc_cbident == nextcbident) {
matched = 1;
break;
}
}
NFSUNLOCKCLSTATE();
if (matched == 1)
nextcbident++;
} while (matched);
}
return (nextcbident);
}
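/*
 * A small standalone sketch of the wrap-and-skip idea used by
 * nfscl_nextcbident() above, checking candidate values against a plain
 * array instead of walking the client list.  The names (next_ident,
 * id_in_use) and the array are invented for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
id_in_use(uint32_t id, const uint32_t *used, int nused)
{
	int i;

	for (i = 0; i < nused; i++)
		if (used[i] == id)
			return (true);
	return (false);
}

static uint32_t
next_ident(const uint32_t *used, int nused)
{
	static uint32_t next;
	static bool wrapped;

	if (++next == 0)
		wrapped = true;
	if (wrapped) {
		/* Once wrapped, skip any value that is still in use. */
		while (id_in_use(next, used, nused))
			next++;
	}
	return (next);
}

int
main(void)
{
	uint32_t used[] = { 2, 3 };

	printf("%u\n", next_ident(used, 2));	/* prints 1 (no wrap yet) */
	return (0);
}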
/*
* Get the mount point related to a given cbident or session and busy it.
*/
static mount_t
nfscl_getmnt(int minorvers, uint8_t *sessionid, u_int32_t cbident,
struct nfsclclient **clpp)
{
struct nfsclclient *clp;
mount_t mp;
int error;
struct nfsclsession *tsep;
*clpp = NULL;
NFSLOCKCLSTATE();
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
if (minorvers == NFSV4_MINORVERSION) {
if (clp->nfsc_cbident == cbident)
break;
} else if (!NFSBCMP(tsep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID))
break;
}
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (NULL);
}
mp = clp->nfsc_nmp->nm_mountp;
vfs_ref(mp);
NFSUNLOCKCLSTATE();
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error != 0)
return (NULL);
*clpp = clp;
return (mp);
}
/*
* Get the clientid pointer related to a given cbident.
*/
static struct nfsclclient *
nfscl_getclnt(u_int32_t cbident)
{
struct nfsclclient *clp;
LIST_FOREACH(clp, &nfsclhead, nfsc_list)
if (clp->nfsc_cbident == cbident)
break;
return (clp);
}
/*
* Get the clientid pointer related to a given sessionid.
*/
static struct nfsclclient *
nfscl_getclntsess(uint8_t *sessionid)
{
struct nfsclclient *clp;
struct nfsclsession *tsep;
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
if (!NFSBCMP(tsep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID))
break;
}
return (clp);
}
/*
* Search for a lock conflict locally on the client. A conflict occurs if
* the owners differ, the byte ranges overlap, and at least one of the locks
* is a write lock (or the request is an unlock).
*/
static int
nfscl_localconflict(struct nfsclclient *clp, u_int8_t *fhp, int fhlen,
struct nfscllock *nlop, u_int8_t *own, struct nfscldeleg *dp,
struct nfscllock **lopp)
{
struct nfsclowner *owp;
struct nfsclopen *op;
int ret;
if (dp != NULL) {
ret = nfscl_checkconflict(&dp->nfsdl_lock, nlop, own, lopp);
if (ret)
return (ret);
}
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, fhp, fhlen)) {
ret = nfscl_checkconflict(&op->nfso_lock, nlop,
own, lopp);
if (ret)
return (ret);
}
}
}
return (0);
}
static int
nfscl_checkconflict(struct nfscllockownerhead *lhp, struct nfscllock *nlop,
u_int8_t *own, struct nfscllock **lopp)
{
struct nfscllockowner *lp;
struct nfscllock *lop;
LIST_FOREACH(lp, lhp, nfsl_list) {
if (NFSBCMP(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN)) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= nlop->nfslo_end)
break;
if (lop->nfslo_end <= nlop->nfslo_first)
continue;
if (lop->nfslo_type == F_WRLCK ||
nlop->nfslo_type == F_WRLCK ||
nlop->nfslo_type == F_UNLCK) {
if (lopp != NULL)
*lopp = lop;
return (NFSERR_DENIED);
}
}
}
}
return (0);
}
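/*
 * A minimal userland sketch of the conflict rule implemented by
 * nfscl_localconflict()/nfscl_checkconflict() above: two byte-range locks
 * conflict when the owners differ, the ranges overlap and at least one side
 * is a write lock, or the request is an unlock probe.  The struct and
 * helper names here are invented for illustration.
 */
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

struct simple_lock {
	unsigned long long first;	/* first byte covered */
	unsigned long long end;		/* first byte past the range */
	short type;			/* F_RDLCK, F_WRLCK or F_UNLCK */
	int owner;			/* simplified lock-owner id */
};

static bool
simple_conflict(const struct simple_lock *held, const struct simple_lock *req)
{
	if (held->owner == req->owner)
		return (false);		/* same owner never conflicts */
	if (held->first >= req->end || held->end <= req->first)
		return (false);		/* byte ranges do not overlap */
	return (held->type == F_WRLCK || req->type == F_WRLCK ||
	    req->type == F_UNLCK);
}

int
main(void)
{
	struct simple_lock held = { 0, 100, F_WRLCK, 1 };
	struct simple_lock req = { 50, 60, F_RDLCK, 2 };

	printf("conflict=%d\n", simple_conflict(&held, &req));	/* prints 1 */
	return (0);
}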
/*
* Check for a local conflicting lock.
*/
APPLESTATIC int
nfscl_lockt(vnode_t vp, struct nfsclclient *clp, u_int64_t off,
u_int64_t len, struct flock *fl, NFSPROC_T *p, void *id, int flags)
{
struct nfscllock *lop, nlck;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error;
nlck.nfslo_type = fl->l_type;
nlck.nfslo_first = off;
if (len == NFS64BITSSET) {
nlck.nfslo_end = NFS64BITSSET;
} else {
nlck.nfslo_end = off + len;
if (nlck.nfslo_end <= nlck.nfslo_first)
return (NFSERR_INVAL);
}
np = VTONFS(vp);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
error = nfscl_localconflict(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
&nlck, own, dp, &lop);
if (error != 0) {
fl->l_whence = SEEK_SET;
fl->l_start = lop->nfslo_first;
if (lop->nfslo_end == NFS64BITSSET)
fl->l_len = 0;
else
fl->l_len = lop->nfslo_end - lop->nfslo_first;
fl->l_pid = (pid_t)0;
fl->l_type = lop->nfslo_type;
error = -1; /* no RPC required */
} else if (dp != NULL && ((dp->nfsdl_flags & NFSCLDL_WRITE) ||
fl->l_type == F_RDLCK)) {
/*
* The delegation ensures that there isn't a conflicting
* lock on the server, so return -1 to indicate an RPC
* isn't required.
*/
fl->l_type = F_UNLCK;
error = -1;
}
NFSUNLOCKCLSTATE();
return (error);
}
/*
* Handle Recall of a delegation.
* The clp must be exclusively locked when this is called.
*/
static int
nfscl_recalldeleg(struct nfsclclient *clp, struct nfsmount *nmp,
struct nfscldeleg *dp, vnode_t vp, struct ucred *cred, NFSPROC_T *p,
int called_from_renewthread)
{
struct nfsclowner *owp, *lowp, *nowp;
struct nfsclopen *op, *lop;
struct nfscllockowner *lp;
struct nfscllock *lckp;
struct nfsnode *np;
int error = 0, ret, gotvp = 0;
if (vp == NULL) {
/*
* First, get a vnode for the file. This is needed to do RPCs.
*/
ret = nfscl_ngetreopen(nmp->nm_mountp, dp->nfsdl_fh,
dp->nfsdl_fhlen, p, &np);
if (ret) {
/*
* File isn't open, so nothing to move over to the
* server.
*/
return (0);
}
vp = NFSTOV(np);
gotvp = 1;
} else {
np = VTONFS(vp);
}
dp->nfsdl_flags &= ~NFSCLDL_MODTIMESET;
/*
* Ok, if it's a write delegation, flush data to the server, so
* that close/open consistency is retained.
*/
ret = 0;
NFSLOCKNODE(np);
if ((dp->nfsdl_flags & NFSCLDL_WRITE) && (np->n_flag & NMODIFIED)) {
np->n_flag |= NDELEGRECALL;
NFSUNLOCKNODE(np);
ret = ncl_flush(vp, MNT_WAIT, p, 1, called_from_renewthread);
NFSLOCKNODE(np);
np->n_flag &= ~NDELEGRECALL;
}
NFSINVALATTRCACHE(np);
NFSUNLOCKNODE(np);
if (ret == EIO && called_from_renewthread != 0) {
/*
* If the flush failed with EIO for the renew thread,
* return now, so that the dirty buffer will be flushed
* later.
*/
if (gotvp != 0)
vrele(vp);
return (ret);
}
/*
* Now, for each openowner with opens issued locally, move them
* over to state against the server.
*/
LIST_FOREACH(lowp, &dp->nfsdl_owner, nfsow_list) {
lop = LIST_FIRST(&lowp->nfsow_open);
if (lop != NULL) {
if (LIST_NEXT(lop, nfso_list) != NULL)
panic("nfsdlg mult opens");
/*
* Look for the same openowner against the server.
*/
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (!NFSBCMP(lowp->nfsow_owner,
owp->nfsow_owner, NFSV4CL_LOCKNAMELEN)) {
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_moveopen(vp, clp, nmp, lop,
owp, dp, cred, p);
if (ret == NFSERR_STALECLIENTID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret) {
nfscl_freeopen(lop, 1);
if (!error)
error = ret;
}
break;
}
}
/*
* If no openowner found, create one and get an open
* for it.
*/
if (owp == NULL) {
MALLOC(nowp, struct nfsclowner *,
sizeof (struct nfsclowner), M_NFSCLOWNER,
M_WAITOK);
nfscl_newopen(clp, NULL, &owp, &nowp, &op,
NULL, lowp->nfsow_owner, dp->nfsdl_fh,
dp->nfsdl_fhlen, NULL, NULL);
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_moveopen(vp, clp, nmp, lop,
owp, dp, cred, p);
if (ret) {
nfscl_freeopenowner(owp, 0);
if (ret == NFSERR_STALECLIENTID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret) {
nfscl_freeopen(lop, 1);
if (!error)
error = ret;
}
}
}
}
}
/*
* Now, get byte range locks for any locks done locally.
*/
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
LIST_FOREACH(lckp, &lp->nfsl_lock, nfslo_list) {
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_relock(vp, clp, nmp, lp, lckp, cred, p);
if (ret == NFSERR_STALESTATEID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_STALECLIENTID ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret && !error)
error = ret;
}
}
if (gotvp)
vrele(vp);
return (error);
}
/*
* Move a locally issued open over to an owner on the state list.
* SIDE EFFECT: If it needs to sleep (do an rpc), it unlocks clstate and
* returns with it unlocked.
*/
static int
nfscl_moveopen(vnode_t vp, struct nfsclclient *clp, struct nfsmount *nmp,
struct nfsclopen *lop, struct nfsclowner *owp, struct nfscldeleg *dp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclopen *op, *nop;
struct nfscldeleg *ndp;
struct nfsnode *np;
int error = 0, newone;
/*
* First, look for an appropriate open. If found, just increment the
* opencnt in it.
*/
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if ((op->nfso_mode & lop->nfso_mode) == lop->nfso_mode &&
op->nfso_fhlen == lop->nfso_fhlen &&
!NFSBCMP(op->nfso_fh, lop->nfso_fh, op->nfso_fhlen)) {
op->nfso_opencnt += lop->nfso_opencnt;
nfscl_freeopen(lop, 1);
return (0);
}
}
/* No appropriate open, so we have to do one against the server. */
np = VTONFS(vp);
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
lop->nfso_fhlen - 1, M_NFSCLOPEN, M_WAITOK);
newone = 0;
nfscl_newopen(clp, NULL, &owp, NULL, &op, &nop, owp->nfsow_owner,
lop->nfso_fh, lop->nfso_fhlen, cred, &newone);
ndp = dp;
error = nfscl_tryopen(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen,
lop->nfso_fh, lop->nfso_fhlen, lop->nfso_mode, op,
NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &ndp, 0, 0, cred, p);
if (error) {
if (newone)
nfscl_freeopen(op, 0);
} else {
op->nfso_mode |= lop->nfso_mode;
op->nfso_opencnt += lop->nfso_opencnt;
nfscl_freeopen(lop, 1);
}
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
if (ndp != NULL) {
/*
* What should I do with the returned delegation, since the
* delegation is being recalled? For now, just printf a message and
* throw it away.
*/
printf("Moveopen returned deleg\n");
FREE((caddr_t)ndp, M_NFSCLDELEG);
}
return (error);
}
/*
* Recall all delegations on this client.
*/
static void
nfscl_totalrecall(struct nfsclclient *clp)
{
struct nfscldeleg *dp;
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if ((dp->nfsdl_flags & NFSCLDL_DELEGRET) == 0)
dp->nfsdl_flags |= NFSCLDL_RECALL;
}
}
/*
* Relock byte ranges. Called for delegation recall and state expiry.
*/
static int
nfscl_relock(vnode_t vp, struct nfsclclient *clp, struct nfsmount *nmp,
struct nfscllockowner *lp, struct nfscllock *lop, struct ucred *cred,
NFSPROC_T *p)
{
struct nfscllockowner *nlp;
struct nfsfh *nfhp;
u_int64_t off, len;
- u_int32_t clidrev = 0;
int error, newone, donelocally;
off = lop->nfslo_first;
len = lop->nfslo_end - lop->nfslo_first;
error = nfscl_getbytelock(vp, off, len, lop->nfslo_type, cred, p,
clp, 1, NULL, lp->nfsl_lockflags, lp->nfsl_owner,
lp->nfsl_openowner, &nlp, &newone, &donelocally);
if (error || donelocally)
return (error);
- if (nmp->nm_clp != NULL)
- clidrev = nmp->nm_clp->nfsc_clientidrev;
- else
- clidrev = 0;
nfhp = VTONFS(vp)->n_fhp;
error = nfscl_trylock(nmp, vp, nfhp->nfh_fh,
nfhp->nfh_len, nlp, newone, 0, off,
len, lop->nfslo_type, cred, p);
if (error)
nfscl_freelockowner(nlp, 0);
return (error);
}
/*
* Called to re-open a file. Basically get a vnode for the file handle
* and then call nfsrpc_openrpc() to do the rest.
*/
static int
nfsrpc_reopen(struct nfsmount *nmp, u_int8_t *fhp, int fhlen,
u_int32_t mode, struct nfsclopen *op, struct nfscldeleg **dpp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsnode *np;
vnode_t vp;
int error;
error = nfscl_ngetreopen(nmp->nm_mountp, fhp, fhlen, p, &np);
if (error)
return (error);
vp = NFSTOV(np);
if (np->n_v4 != NULL) {
error = nfscl_tryopen(nmp, vp, np->n_v4->n4_data,
np->n_v4->n4_fhlen, fhp, fhlen, mode, op,
NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, dpp, 0, 0,
cred, p);
} else {
error = EINVAL;
}
vrele(vp);
return (error);
}
/*
* Try an open against the server. Just call nfsrpc_openrpc(), retrying while
* it returns NFSERR_DELAY. Also, try system credentials if the passed-in
* credentials fail.
*/
static int
nfscl_tryopen(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp, int fhlen,
u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op,
u_int8_t *name, int namelen, struct nfscldeleg **ndpp,
int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p)
{
int error;
do {
error = nfsrpc_openrpc(nmp, vp, fhp, fhlen, newfhp, newfhlen,
mode, op, name, namelen, ndpp, reclaim, delegtype, cred, p,
0, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstryop");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_openrpc(nmp, vp, fhp, fhlen, newfhp,
newfhlen, mode, op, name, namelen, ndpp, reclaim,
delegtype, cred, p, 1, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstryop");
} while (error == NFSERR_DELAY);
}
return (error);
}
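/*
 * The nfscl_try*() helpers (nfscl_tryopen() above and nfscl_trylock(),
 * nfscl_trydelegreturn() and nfscl_tryclose() below) share one control
 * flow: loop while the server answers NFSERR_DELAY, napping between
 * attempts, and make one more pass with system credentials if the caller's
 * credentials are rejected.  A compact userland sketch of just that flow,
 * with invented names (retry_rpc, do_rpc, demo_rpc, SKETCH_* constants):
 */
#include <stdio.h>
#include <unistd.h>

#define SKETCH_OK	0
#define SKETCH_DELAY	1	/* stands in for NFSERR_DELAY */
#define SKETCH_AUTHFAIL	2	/* stands in for EAUTH / EACCES */

static int
retry_rpc(int (*do_rpc)(int use_root), unsigned nap_ms)
{
	int error, use_root = 0;

	for (;;) {
		do {
			error = do_rpc(use_root);
			if (error == SKETCH_DELAY)
				usleep(nap_ms * 1000);	/* like nfs_catnap() */
		} while (error == SKETCH_DELAY);
		if (error == SKETCH_AUTHFAIL && use_root == 0) {
			use_root = 1;	/* like newnfs_setroot(cred) */
			continue;
		}
		return (error);
	}
}

static int
demo_rpc(int use_root)
{
	static int calls;

	calls++;
	if (calls < 3)
		return (SKETCH_DELAY);
	return (use_root ? SKETCH_OK : SKETCH_AUTHFAIL);
}

int
main(void)
{
	printf("final error: %d\n", retry_rpc(demo_rpc, 10));	/* prints 0 */
	return (0);
}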
/*
* Try a byte range lock. Just loop on nfsrpc_lock() while it returns
* NFSERR_DELAY. Also, retry with system credentials if the provided
* credentials don't work.
*/
static int
nfscl_trylock(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp,
int fhlen, struct nfscllockowner *nlp, int newone, int reclaim,
u_int64_t off, u_int64_t len, short type, struct ucred *cred, NFSPROC_T *p)
{
struct nfsrv_descript nfsd, *nd = &nfsd;
int error;
do {
error = nfsrpc_lock(nd, nmp, vp, fhp, fhlen, nlp, newone,
reclaim, off, len, type, cred, p, 0);
if (!error && nd->nd_repstat == NFSERR_DELAY)
(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
"nfstrylck");
} while (!error && nd->nd_repstat == NFSERR_DELAY);
if (!error)
error = nd->nd_repstat;
if (error == EAUTH || error == EACCES) {
/* Try again using root credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_lock(nd, nmp, vp, fhp, fhlen, nlp,
newone, reclaim, off, len, type, cred, p, 1);
if (!error && nd->nd_repstat == NFSERR_DELAY)
(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
"nfstrylck");
} while (!error && nd->nd_repstat == NFSERR_DELAY);
if (!error)
error = nd->nd_repstat;
}
return (error);
}
/*
* Try a delegreturn against the server. Just call nfsrpc_delegreturn(),
* retrying while it returns NFSERR_DELAY. Also, try system credentials if the
* passed-in credentials fail.
*/
static int
nfscl_trydelegreturn(struct nfscldeleg *dp, struct ucred *cred,
struct nfsmount *nmp, NFSPROC_T *p)
{
int error;
do {
error = nfsrpc_delegreturn(dp, cred, nmp, p, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrydp");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_delegreturn(dp, cred, nmp, p, 1);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrydp");
} while (error == NFSERR_DELAY);
}
return (error);
}
/*
* Try a close against the server. Just call nfsrpc_closerpc(),
* retrying while it returns NFSERR_DELAY. Also, try system credentials if the
* passed-in credentials fail.
*/
APPLESTATIC int
nfscl_tryclose(struct nfsclopen *op, struct ucred *cred,
struct nfsmount *nmp, NFSPROC_T *p)
{
struct nfsrv_descript nfsd, *nd = &nfsd;
int error;
do {
error = nfsrpc_closerpc(nd, nmp, op, cred, p, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrycl");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_closerpc(nd, nmp, op, cred, p, 1);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrycl");
} while (error == NFSERR_DELAY);
}
return (error);
}
/*
* Decide if a delegation on a file permits close without flushing writes
* to the server. This might be a big performance win in some environments.
* (Not useful until the client does caching on local stable storage.)
*/
APPLESTATIC int
nfscl_mustflush(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return (1);
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (1);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags &
(NFSCLDL_WRITE | NFSCLDL_RECALL | NFSCLDL_DELEGRET)) ==
NFSCLDL_WRITE &&
(dp->nfsdl_sizelimit >= np->n_size ||
!NFSHASSTRICT3530(nmp))) {
NFSUNLOCKCLSTATE();
return (0);
}
NFSUNLOCKCLSTATE();
return (1);
}
/*
* See if a (write) delegation exists for this file.
*/
APPLESTATIC int
nfscl_nodeleg(vnode_t vp, int writedeleg)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return (1);
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (1);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL &&
(dp->nfsdl_flags & (NFSCLDL_RECALL | NFSCLDL_DELEGRET)) == 0 &&
(writedeleg == 0 || (dp->nfsdl_flags & NFSCLDL_WRITE) ==
NFSCLDL_WRITE)) {
NFSUNLOCKCLSTATE();
return (0);
}
NFSUNLOCKCLSTATE();
return (1);
}
/*
* Look for an associated delegation that should be DelegReturned.
*/
APPLESTATIC int
nfscl_removedeleg(vnode_t vp, NFSPROC_T *p, nfsv4stateid_t *stp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsmount *nmp;
struct ucred *cred;
struct nfsnode *np;
int igotlock = 0, triedrecall = 0, needsrecall, retcnt = 0, islept;
nmp = VFSTONFS(vnode_mount(vp));
np = VTONFS(vp);
NFSLOCKCLSTATE();
/*
* Loop around waiting for:
* - outstanding I/O operations on delegations to complete
* - for a delegation on vp that has state, lock the client and
* do a recall
* - return delegation with no state
*/
while (1) {
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
needsrecall = 0;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
needsrecall = 1;
break;
}
}
if (!needsrecall) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
needsrecall = 1;
break;
}
}
}
if (needsrecall && !triedrecall) {
dp->nfsdl_flags |= NFSCLDL_DELEGRET;
islept = 0;
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
break;
}
if (islept)
continue;
NFSUNLOCKCLSTATE();
cred = newnfs_getcred();
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_recalldeleg(clp, nmp, dp, vp, cred, p, 0);
NFSFREECRED(cred);
triedrecall = 1;
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
continue;
}
*stp = dp->nfsdl_stateid;
retcnt = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
/*
* Look for associated delegation(s) that should be DelegReturned.
*/
APPLESTATIC int
nfscl_renamedeleg(vnode_t fvp, nfsv4stateid_t *fstp, int *gotfdp, vnode_t tvp,
nfsv4stateid_t *tstp, int *gottdp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsmount *nmp;
struct ucred *cred;
struct nfsnode *np;
int igotlock = 0, triedrecall = 0, needsrecall, retcnt = 0, islept;
nmp = VFSTONFS(vnode_mount(fvp));
*gotfdp = 0;
*gottdp = 0;
NFSLOCKCLSTATE();
/*
* Loop around waiting for:
* - outstanding I/O operations on delegations to complete
* - for a delegation on fvp that has state, lock the client and
* do a recall
* - return delegation(s) with no state.
*/
while (1) {
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
np = VTONFS(fvp);
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL && *gotfdp == 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
needsrecall = 0;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
needsrecall = 1;
break;
}
}
if (!needsrecall) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
needsrecall = 1;
break;
}
}
}
if (needsrecall && !triedrecall) {
dp->nfsdl_flags |= NFSCLDL_DELEGRET;
islept = 0;
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
break;
}
if (islept)
continue;
NFSUNLOCKCLSTATE();
cred = newnfs_getcred();
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_recalldeleg(clp, nmp, dp, fvp, cred, p, 0);
NFSFREECRED(cred);
triedrecall = 1;
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
continue;
}
*fstp = dp->nfsdl_stateid;
retcnt++;
*gotfdp = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
if (tvp != NULL) {
np = VTONFS(tvp);
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL && *gottdp == 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
*tstp = dp->nfsdl_stateid;
retcnt++;
*gottdp = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
}
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
/*
* Get a reference on the clientid associated with the mount point.
* Return 1 on success, 0 otherwise.
*/
APPLESTATIC int
nfscl_getref(struct nfsmount *nmp)
{
struct nfsclclient *clp;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (0);
}
nfsv4_getref(&clp->nfsc_lock, NULL, NFSCLSTATEMUTEXPTR, NULL);
NFSUNLOCKCLSTATE();
return (1);
}
/*
* Release a reference on a clientid acquired with the above call.
*/
APPLESTATIC void
nfscl_relref(struct nfsmount *nmp)
{
struct nfsclclient *clp;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
nfsv4_relref(&clp->nfsc_lock);
NFSUNLOCKCLSTATE();
}
/*
* Save the size attribute in the delegation, since the nfsnode
* is going away.
*/
APPLESTATIC void
nfscl_reclaimnode(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE))
dp->nfsdl_size = np->n_size;
NFSUNLOCKCLSTATE();
}
/*
* Get the saved size attribute from the delegation, since this is a
* newly allocated nfsnode.
*/
APPLESTATIC void
nfscl_newnode(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE))
np->n_size = dp->nfsdl_size;
NFSUNLOCKCLSTATE();
}
/*
* If there is a valid write delegation for this file, set the modtime
* to the local clock time.
*/
APPLESTATIC void
nfscl_delegmodtime(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE)) {
nanotime(&dp->nfsdl_modtime);
dp->nfsdl_flags |= NFSCLDL_MODTIMESET;
}
NFSUNLOCKCLSTATE();
}
/*
* If there is a valid write delegation for this file with a modtime set,
* put that modtime in mtime.
*/
APPLESTATIC void
nfscl_deleggetmodtime(vnode_t vp, struct timespec *mtime)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL &&
(dp->nfsdl_flags & (NFSCLDL_WRITE | NFSCLDL_MODTIMESET)) ==
(NFSCLDL_WRITE | NFSCLDL_MODTIMESET))
*mtime = dp->nfsdl_modtime;
NFSUNLOCKCLSTATE();
}
static int
nfscl_errmap(struct nfsrv_descript *nd, u_int32_t minorvers)
{
short *defaulterrp, *errp;
if (!nd->nd_repstat)
return (0);
if (nd->nd_procnum == NFSPROC_NOOP)
return (txdr_unsigned(nd->nd_repstat & 0xffff));
if (nd->nd_repstat == EBADRPC)
return (txdr_unsigned(NFSERR_BADXDR));
if (nd->nd_repstat == NFSERR_MINORVERMISMATCH ||
nd->nd_repstat == NFSERR_OPILLEGAL)
return (txdr_unsigned(nd->nd_repstat));
if (nd->nd_repstat >= NFSERR_BADIOMODE && nd->nd_repstat < 20000 &&
minorvers > NFSV4_MINORVERSION) {
/* NFSv4.n error. */
return (txdr_unsigned(nd->nd_repstat));
}
if (nd->nd_procnum < NFSV4OP_CBNOPS)
errp = defaulterrp = nfscl_cberrmap[nd->nd_procnum];
else
return (txdr_unsigned(nd->nd_repstat));
while (*++errp)
if (*errp == (short)nd->nd_repstat)
return (txdr_unsigned(nd->nd_repstat));
return (txdr_unsigned(*defaulterrp));
}
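/*
 * nfscl_errmap() above clamps a callback error to the set the protocol
 * permits for that operation by scanning a zero-terminated table whose
 * first entry is the default reply.  The same lookup in isolation, with a
 * made-up table and invented names (map_error, allowed_errs):
 */
#include <stdio.h>

static short allowed_errs[] = { 70 /* default reply */, 2, 13, 22, 0 };

static short
map_error(short err, const short *tbl)
{
	const short *ep = tbl;

	while (*++ep != 0)
		if (*ep == err)
			return (err);	/* error is permitted as-is */
	return (tbl[0]);	/* not permitted: report the default instead */
}

int
main(void)
{
	printf("%d %d\n", map_error(13, allowed_errs),
	    map_error(99, allowed_errs));	/* prints "13 70" */
	return (0);
}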
/*
* Called to find/add a layout to a client.
* This function returns the layout with a refcnt (shared lock) upon
* success (returns 0) or with no lock/refcnt on the layout when an
* error is returned.
* If a layout is passed in via lypp, it is locked (exclusively locked).
*/
APPLESTATIC int
nfscl_layout(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp, int fhlen,
nfsv4stateid_t *stateidp, int layouttype, int retonclose,
struct nfsclflayouthead *fhlp, struct nfscllayout **lypp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscllayout *lyp, *tlyp;
struct nfsclflayout *flp;
struct nfsnode *np = VTONFS(vp);
mount_t mp;
int layout_passed_in;
mp = nmp->nm_mountp;
layout_passed_in = 1;
tlyp = NULL;
lyp = *lypp;
if (lyp == NULL) {
layout_passed_in = 0;
tlyp = malloc(sizeof(*tlyp) + fhlen - 1, M_NFSLAYOUT,
M_WAITOK | M_ZERO);
}
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
if (layout_passed_in != 0)
nfsv4_unlock(&lyp->nfsly_lock, 0);
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (EPERM);
}
if (lyp == NULL) {
/*
* Although no lyp was passed in, another thread might have
* allocated one. If one is found, just increment its ref
* count and return it.
*/
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp == NULL) {
lyp = tlyp;
tlyp = NULL;
lyp->nfsly_stateid.seqid = stateidp->seqid;
lyp->nfsly_stateid.other[0] = stateidp->other[0];
lyp->nfsly_stateid.other[1] = stateidp->other[1];
lyp->nfsly_stateid.other[2] = stateidp->other[2];
lyp->nfsly_lastbyte = 0;
LIST_INIT(&lyp->nfsly_flayread);
LIST_INIT(&lyp->nfsly_flayrw);
LIST_INIT(&lyp->nfsly_recall);
lyp->nfsly_filesid[0] = np->n_vattr.na_filesid[0];
lyp->nfsly_filesid[1] = np->n_vattr.na_filesid[1];
lyp->nfsly_clp = clp;
if (layouttype == NFSLAYOUT_FLEXFILE)
lyp->nfsly_flags = NFSLY_FLEXFILE;
else
lyp->nfsly_flags = NFSLY_FILES;
if (retonclose != 0)
lyp->nfsly_flags |= NFSLY_RETONCLOSE;
lyp->nfsly_fhlen = fhlen;
NFSBCOPY(fhp, lyp->nfsly_fh, fhlen);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
LIST_INSERT_HEAD(NFSCLLAYOUTHASH(clp, fhp, fhlen), lyp,
nfsly_hash);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
nfscl_layoutcnt++;
} else {
if (retonclose != 0)
lyp->nfsly_flags |= NFSLY_RETONCLOSE;
TAILQ_REMOVE(&clp->nfsc_layout, lyp, nfsly_list);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
}
nfsv4_getref(&lyp->nfsly_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (EPERM);
}
*lypp = lyp;
} else
lyp->nfsly_stateid.seqid = stateidp->seqid;
/* Merge the new list of File Layouts into the list. */
flp = LIST_FIRST(fhlp);
if (flp != NULL) {
if (flp->nfsfl_iomode == NFSLAYOUTIOMODE_READ)
nfscl_mergeflayouts(&lyp->nfsly_flayread, fhlp);
else
nfscl_mergeflayouts(&lyp->nfsly_flayrw, fhlp);
}
if (layout_passed_in != 0)
nfsv4_unlock(&lyp->nfsly_lock, 1);
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (0);
}
/*
* Search for a layout by MDS file handle.
* If one is found, it is returned with a refcnt (shared lock) when retflpp is
* returned non-NULL, or exclusively locked when retflpp is returned NULL.
*/
struct nfscllayout *
nfscl_getlayout(struct nfsclclient *clp, uint8_t *fhp, int fhlen,
uint64_t off, struct nfsclflayout **retflpp, int *recalledp)
{
struct nfscllayout *lyp;
mount_t mp;
int error, igotlock;
mp = clp->nfsc_nmp->nm_mountp;
*recalledp = 0;
*retflpp = NULL;
NFSLOCKCLSTATE();
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp != NULL) {
if ((lyp->nfsly_flags & NFSLY_RECALL) == 0) {
TAILQ_REMOVE(&clp->nfsc_layout, lyp, nfsly_list);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
error = nfscl_findlayoutforio(lyp, off,
NFSV4OPEN_ACCESSREAD, retflpp);
if (error == 0)
nfsv4_getref(&lyp->nfsly_lock, NULL,
NFSCLSTATEMUTEXPTR, mp);
else {
do {
igotlock = nfsv4_lock(&lyp->nfsly_lock,
1, NULL, NFSCLSTATEMUTEXPTR, mp);
} while (igotlock == 0 && !NFSCL_FORCEDISM(mp));
*retflpp = NULL;
}
if (NFSCL_FORCEDISM(mp)) {
lyp = NULL;
*recalledp = 1;
}
} else {
lyp = NULL;
*recalledp = 1;
}
}
NFSUNLOCKCLSTATE();
return (lyp);
}
/*
* Search for a layout by MDS file handle. If one is found, mark it to be
* recalled if it is already marked "return on close".
*/
static void
nfscl_retoncloselayout(vnode_t vp, struct nfsclclient *clp, uint8_t *fhp,
int fhlen, struct nfsclrecalllayout **recallpp)
{
struct nfscllayout *lyp;
uint32_t iomode;
if (vp->v_type != VREG || !NFSHASPNFS(VFSTONFS(vnode_mount(vp))) ||
nfscl_enablecallb == 0 || nfs_numnfscbd == 0 ||
(VTONFS(vp)->n_flag & NNOLAYOUT) != 0)
return;
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp != NULL && (lyp->nfsly_flags & (NFSLY_RETONCLOSE |
NFSLY_RECALL)) == NFSLY_RETONCLOSE) {
iomode = 0;
if (!LIST_EMPTY(&lyp->nfsly_flayread))
iomode |= NFSLAYOUTIOMODE_READ;
if (!LIST_EMPTY(&lyp->nfsly_flayrw))
iomode |= NFSLAYOUTIOMODE_RW;
(void)nfscl_layoutrecall(NFSLAYOUTRETURN_FILE, lyp, iomode,
0, UINT64_MAX, lyp->nfsly_stateid.seqid, *recallpp);
NFSCL_DEBUG(4, "retoncls recall iomode=%d\n", iomode);
*recallpp = NULL;
}
}
/*
* Dereference a layout.
*/
void
nfscl_rellayout(struct nfscllayout *lyp, int exclocked)
{
NFSLOCKCLSTATE();
if (exclocked != 0)
nfsv4_unlock(&lyp->nfsly_lock, 0);
else
nfsv4_relref(&lyp->nfsly_lock);
NFSUNLOCKCLSTATE();
}
/*
* Search for a devinfo by deviceid. If one is found, return it after
* acquiring a reference count on it.
*/
struct nfscldevinfo *
nfscl_getdevinfo(struct nfsclclient *clp, uint8_t *deviceid,
struct nfscldevinfo *dip)
{
NFSLOCKCLSTATE();
if (dip == NULL)
dip = nfscl_finddevinfo(clp, deviceid);
if (dip != NULL)
dip->nfsdi_refcnt++;
NFSUNLOCKCLSTATE();
return (dip);
}
/*
* Dereference a devinfo structure.
*/
static void
nfscl_reldevinfo_locked(struct nfscldevinfo *dip)
{
dip->nfsdi_refcnt--;
if (dip->nfsdi_refcnt == 0)
wakeup(&dip->nfsdi_refcnt);
}
/*
* Dereference a devinfo structure.
*/
void
nfscl_reldevinfo(struct nfscldevinfo *dip)
{
NFSLOCKCLSTATE();
nfscl_reldevinfo_locked(dip);
NFSUNLOCKCLSTATE();
}
/*
* Find a layout for this file handle. Return NULL upon failure.
*/
static struct nfscllayout *
nfscl_findlayout(struct nfsclclient *clp, u_int8_t *fhp, int fhlen)
{
struct nfscllayout *lyp;
LIST_FOREACH(lyp, NFSCLLAYOUTHASH(clp, fhp, fhlen), nfsly_hash)
if (lyp->nfsly_fhlen == fhlen &&
!NFSBCMP(lyp->nfsly_fh, fhp, fhlen))
break;
return (lyp);
}
/*
* Find a devinfo for this deviceid. Return NULL upon failure.
*/
static struct nfscldevinfo *
nfscl_finddevinfo(struct nfsclclient *clp, uint8_t *deviceid)
{
struct nfscldevinfo *dip;
LIST_FOREACH(dip, &clp->nfsc_devinfo, nfsdi_list)
if (NFSBCMP(dip->nfsdi_deviceid, deviceid, NFSX_V4DEVICEID)
== 0)
break;
return (dip);
}
/*
* Merge the new file layout list into the main one, maintaining it in
* increasing offset order.
*/
static void
nfscl_mergeflayouts(struct nfsclflayouthead *fhlp,
struct nfsclflayouthead *newfhlp)
{
struct nfsclflayout *flp, *nflp, *prevflp, *tflp;
flp = LIST_FIRST(fhlp);
prevflp = NULL;
LIST_FOREACH_SAFE(nflp, newfhlp, nfsfl_list, tflp) {
while (flp != NULL && flp->nfsfl_off < nflp->nfsfl_off) {
prevflp = flp;
flp = LIST_NEXT(flp, nfsfl_list);
}
if (prevflp == NULL)
LIST_INSERT_HEAD(fhlp, nflp, nfsfl_list);
else
LIST_INSERT_AFTER(prevflp, nflp, nfsfl_list);
prevflp = nflp;
}
}
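/*
 * nfscl_mergeflayouts() above walks the existing list to find the insertion
 * point that keeps file layouts in increasing offset order.  The same
 * sorted-insert idea on a plain singly linked list, with invented types and
 * names (struct seg, insert_sorted):
 */
#include <stdio.h>

struct seg {
	unsigned long long off;
	struct seg *next;
};

/* Insert one segment so that the list stays sorted by offset. */
static void
insert_sorted(struct seg **headp, struct seg *nsp)
{
	struct seg **pp = headp;

	while (*pp != NULL && (*pp)->off < nsp->off)
		pp = &(*pp)->next;
	nsp->next = *pp;
	*pp = nsp;
}

int
main(void)
{
	struct seg a = { 200, NULL }, b = { 100, NULL }, c = { 300, NULL };
	struct seg *head = NULL, *sp;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (sp = head; sp != NULL; sp = sp->next)
		printf("%llu\n", sp->off);	/* prints 100 200 300 */
	return (0);
}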
/*
* Add this nfscldevinfo to the client, if it doesn't already exist.
* This function consumes the structure pointed at by dip, if not NULL.
*/
APPLESTATIC int
nfscl_adddevinfo(struct nfsmount *nmp, struct nfscldevinfo *dip,
struct nfsclflayout *flp)
{
struct nfsclclient *clp;
struct nfscldevinfo *tdip;
uint8_t *dev;
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
NFSUNLOCKCLSTATE();
if (dip != NULL)
free(dip, M_NFSDEVINFO);
return (ENODEV);
}
if ((flp->nfsfl_flags & NFSFL_FILE) != 0)
dev = flp->nfsfl_dev;
else
dev = flp->nfsfl_ffm[0].dev;
tdip = nfscl_finddevinfo(clp, dev);
if (tdip != NULL) {
tdip->nfsdi_layoutrefs++;
flp->nfsfl_devp = tdip;
nfscl_reldevinfo_locked(tdip);
NFSUNLOCKCLSTATE();
if (dip != NULL)
free(dip, M_NFSDEVINFO);
return (0);
}
if (dip != NULL) {
LIST_INSERT_HEAD(&clp->nfsc_devinfo, dip, nfsdi_list);
dip->nfsdi_layoutrefs = 1;
flp->nfsfl_devp = dip;
}
NFSUNLOCKCLSTATE();
if (dip == NULL)
return (ENODEV);
return (0);
}
/*
* Free up a layout structure and associated file layout structure(s).
*/
APPLESTATIC void
nfscl_freelayout(struct nfscllayout *layp)
{
struct nfsclflayout *flp, *nflp;
struct nfsclrecalllayout *rp, *nrp;
LIST_FOREACH_SAFE(flp, &layp->nfsly_flayread, nfsfl_list, nflp) {
LIST_REMOVE(flp, nfsfl_list);
nfscl_freeflayout(flp);
}
LIST_FOREACH_SAFE(flp, &layp->nfsly_flayrw, nfsfl_list, nflp) {
LIST_REMOVE(flp, nfsfl_list);
nfscl_freeflayout(flp);
}
LIST_FOREACH_SAFE(rp, &layp->nfsly_recall, nfsrecly_list, nrp) {
LIST_REMOVE(rp, nfsrecly_list);
free(rp, M_NFSLAYRECALL);
}
nfscl_layoutcnt--;
free(layp, M_NFSLAYOUT);
}
/*
* Free up a file layout structure.
*/
APPLESTATIC void
nfscl_freeflayout(struct nfsclflayout *flp)
{
int i, j;
if ((flp->nfsfl_flags & NFSFL_FILE) != 0)
for (i = 0; i < flp->nfsfl_fhcnt; i++)
free(flp->nfsfl_fh[i], M_NFSFH);
if ((flp->nfsfl_flags & NFSFL_FLEXFILE) != 0)
for (i = 0; i < flp->nfsfl_mirrorcnt; i++)
for (j = 0; j < flp->nfsfl_ffm[i].fhcnt; j++)
free(flp->nfsfl_ffm[i].fh[j], M_NFSFH);
if (flp->nfsfl_devp != NULL)
flp->nfsfl_devp->nfsdi_layoutrefs--;
free(flp, M_NFSFLAYOUT);
}
/*
* Free up a file layout devinfo structure.
*/
APPLESTATIC void
nfscl_freedevinfo(struct nfscldevinfo *dip)
{
free(dip, M_NFSDEVINFO);
}
/*
* Mark any layouts that match as recalled.
*/
static int
nfscl_layoutrecall(int recalltype, struct nfscllayout *lyp, uint32_t iomode,
uint64_t off, uint64_t len, uint32_t stateseqid,
struct nfsclrecalllayout *recallp)
{
struct nfsclrecalllayout *rp, *orp;
recallp->nfsrecly_recalltype = recalltype;
recallp->nfsrecly_iomode = iomode;
recallp->nfsrecly_stateseqid = stateseqid;
recallp->nfsrecly_off = off;
recallp->nfsrecly_len = len;
/*
* Order the list as file returns first, followed by fsid and "all"
* returns, both in increasing stateseqid order.
* Note that the seqids wrap around, so 1 is after 0xffffffff.
* (I'm not sure this is correct because I find RFC5661 confusing
* on this, but hopefully it will work ok.)
*/
orp = NULL;
LIST_FOREACH(rp, &lyp->nfsly_recall, nfsrecly_list) {
orp = rp;
if ((recalltype == NFSLAYOUTRETURN_FILE &&
(rp->nfsrecly_recalltype != NFSLAYOUTRETURN_FILE ||
nfscl_seq(stateseqid, rp->nfsrecly_stateseqid) != 0)) ||
(recalltype != NFSLAYOUTRETURN_FILE &&
rp->nfsrecly_recalltype != NFSLAYOUTRETURN_FILE &&
nfscl_seq(stateseqid, rp->nfsrecly_stateseqid) != 0)) {
LIST_INSERT_BEFORE(rp, recallp, nfsrecly_list);
break;
}
}
if (rp == NULL) {
if (orp == NULL)
LIST_INSERT_HEAD(&lyp->nfsly_recall, recallp,
nfsrecly_list);
else
LIST_INSERT_AFTER(orp, recallp, nfsrecly_list);
}
lyp->nfsly_flags |= NFSLY_RECALL;
return (0);
}
/*
* Compare the two seqids for ordering. The trick is that the seqids can
* wrap around from 0xffffffff->0, so check for the cases where one
* has wrapped around.
* Return 1 if seqid1 comes before seqid2, 0 otherwise.
*/
static int
nfscl_seq(uint32_t seqid1, uint32_t seqid2)
{
if (seqid2 > seqid1 && (seqid2 - seqid1) >= 0x7fffffff)
/* seqid2 has wrapped around. */
return (0);
if (seqid1 > seqid2 && (seqid1 - seqid2) >= 0x7fffffff)
/* seqid1 has wrapped around. */
return (1);
if (seqid1 <= seqid2)
return (1);
return (0);
}
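/*
 * A standalone sketch of the wraparound-aware ordering that nfscl_seq()
 * above implements: seqids live on a 32 bit ring, so a value more than half
 * the ring "ahead" of the other is treated as having wrapped.  The name
 * seq_before is invented for illustration.
 */
#include <assert.h>
#include <stdint.h>

static int
seq_before(uint32_t a, uint32_t b)
{
	if (b > a && (b - a) >= 0x7fffffff)
		return (0);	/* b wrapped around, so a is actually newer */
	if (a > b && (a - b) >= 0x7fffffff)
		return (1);	/* a wrapped around, so a comes first */
	return (a <= b);
}

int
main(void)
{
	assert(seq_before(1, 2));
	assert(!seq_before(2, 1));
	assert(seq_before(0xfffffffeU, 1));	/* 1 is after the wrap */
	return (0);
}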
/*
* Do a layout return for each of the recalls.
*/
static void
nfscl_layoutreturn(struct nfsmount *nmp, struct nfscllayout *lyp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclrecalllayout *rp;
nfsv4stateid_t stateid;
int layouttype;
NFSBCOPY(lyp->nfsly_stateid.other, stateid.other, NFSX_STATEIDOTHER);
stateid.seqid = lyp->nfsly_stateid.seqid;
if ((lyp->nfsly_flags & NFSLY_FILES) != 0)
layouttype = NFSLAYOUT_NFSV4_1_FILES;
else
layouttype = NFSLAYOUT_FLEXFILE;
LIST_FOREACH(rp, &lyp->nfsly_recall, nfsrecly_list) {
(void)nfsrpc_layoutreturn(nmp, lyp->nfsly_fh,
lyp->nfsly_fhlen, 0, layouttype,
rp->nfsrecly_iomode, rp->nfsrecly_recalltype,
rp->nfsrecly_off, rp->nfsrecly_len,
&stateid, cred, p, NULL);
}
}
/*
* Do the layout commit for a file layout.
*/
static void
nfscl_dolayoutcommit(struct nfsmount *nmp, struct nfscllayout *lyp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclflayout *flp;
uint64_t len;
int error, layouttype;
if ((lyp->nfsly_flags & NFSLY_FILES) != 0)
layouttype = NFSLAYOUT_NFSV4_1_FILES;
else
layouttype = NFSLAYOUT_FLEXFILE;
LIST_FOREACH(flp, &lyp->nfsly_flayrw, nfsfl_list) {
if (layouttype == NFSLAYOUT_FLEXFILE &&
(flp->nfsfl_fflags & NFSFLEXFLAG_NO_LAYOUTCOMMIT) != 0) {
NFSCL_DEBUG(4, "Flex file: no layoutcommit\n");
/* If not supported, don't bother doing it. */
NFSLOCKMNT(nmp);
nmp->nm_state |= NFSSTA_NOLAYOUTCOMMIT;
NFSUNLOCKMNT(nmp);
break;
} else if (flp->nfsfl_off <= lyp->nfsly_lastbyte) {
len = flp->nfsfl_end - flp->nfsfl_off;
error = nfsrpc_layoutcommit(nmp, lyp->nfsly_fh,
lyp->nfsly_fhlen, 0, flp->nfsfl_off, len,
lyp->nfsly_lastbyte, &lyp->nfsly_stateid,
layouttype, cred, p, NULL);
NFSCL_DEBUG(4, "layoutcommit err=%d\n", error);
if (error == NFSERR_NOTSUPP) {
/* If not supported, don't bother doing it. */
NFSLOCKMNT(nmp);
nmp->nm_state |= NFSSTA_NOLAYOUTCOMMIT;
NFSUNLOCKMNT(nmp);
break;
}
}
}
}
/*
* Commit all layouts for a file (vnode).
*/
int
nfscl_layoutcommit(vnode_t vp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscllayout *lyp;
struct nfsnode *np = VTONFS(vp);
mount_t mp;
struct nfsmount *nmp;
mp = vnode_mount(vp);
nmp = VFSTONFS(mp);
if (NFSHASNOLAYOUTCOMMIT(nmp))
return (0);
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
lyp = nfscl_findlayout(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (lyp == NULL) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
nfsv4_getref(&lyp->nfsly_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
tryagain:
if ((lyp->nfsly_flags & NFSLY_WRITTEN) != 0) {
lyp->nfsly_flags &= ~NFSLY_WRITTEN;
NFSUNLOCKCLSTATE();
NFSCL_DEBUG(4, "do layoutcommit2\n");
nfscl_dolayoutcommit(clp->nfsc_nmp, lyp, NFSPROCCRED(p), p);
NFSLOCKCLSTATE();
goto tryagain;
}
nfsv4_relref(&lyp->nfsly_lock);
NFSUNLOCKCLSTATE();
return (0);
}
Index: head/sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c (revision 327172)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c (revision 327173)
@@ -1,3443 +1,3441 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/capsicum.h>
/*
* Functions that perform the vfs operations required by the routines in
* nfsd_serv.c. It is hoped that this change will make the server more
* portable.
*/
#include <fs/nfs/nfsport.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <nlm/nlm_prot.h>
#include <nlm/nlm.h>
FEATURE(nfsd, "NFSv4 server");
extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
extern int nfsrv_useacl;
extern int newnfs_numnfsd;
extern struct mount nfsv4root_mnt;
extern struct nfsrv_stablefirst nfsrv_stablefirst;
extern void (*nfsd_call_servertimer)(void);
extern SVCPOOL *nfsrvd_pool;
extern struct nfsv4lock nfsd_suspend_lock;
extern struct nfsclienthashhead *nfsclienthash;
extern struct nfslockhashhead *nfslockhash;
extern struct nfssessionhash *nfssessionhash;
extern int nfsrv_sessionhashsize;
extern struct nfsstatsv1 nfsstatsv1;
struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
NFSDLOCKMUTEX;
struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
struct mtx nfsrc_udpmtx;
struct mtx nfs_v4root_mutex;
struct nfsrvfh nfs_rootfh, nfs_pubfh;
int nfs_pubfhset = 0, nfs_rootfhset = 0;
struct proc *nfsd_master_proc = NULL;
int nfsd_debuglevel = 0;
static pid_t nfsd_master_pid = (pid_t)-1;
static char nfsd_master_comm[MAXCOMLEN + 1];
static struct timeval nfsd_master_start;
static uint32_t nfsv4_sysid = 0;
static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
struct ucred *);
int nfsrv_enable_crossmntpt = 1;
static int nfs_commit_blks;
static int nfs_commit_miss;
extern int nfsrv_issuedelegs;
extern int nfsrv_dolocallocks;
extern int nfsd_enable_stringtouid;
SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
&nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
&nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
&nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel,
0, "Debug level for NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW,
&nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names");
#define MAX_REORDERED_RPC 16
#define NUM_HEURISTIC 1031
#define NHUSE_INIT 64
#define NHUSE_INC 16
#define NHUSE_MAX 2048
static struct nfsheur {
struct vnode *nh_vp; /* vp to match (unreferenced pointer) */
off_t nh_nextoff; /* next offset for sequential detection */
int nh_use; /* use count for selection */
int nh_seqcount; /* heuristic */
} nfsheur[NUM_HEURISTIC];
/*
* Heuristic to detect sequential operation.
*/
static struct nfsheur *
nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp)
{
struct nfsheur *nh;
int hi, try;
/* Locate best candidate. */
try = 32;
hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
nh = &nfsheur[hi];
while (try--) {
if (nfsheur[hi].nh_vp == vp) {
nh = &nfsheur[hi];
break;
}
if (nfsheur[hi].nh_use > 0)
--nfsheur[hi].nh_use;
hi = (hi + 1) % NUM_HEURISTIC;
if (nfsheur[hi].nh_use < nh->nh_use)
nh = &nfsheur[hi];
}
/* Initialize hint if this is a new file. */
if (nh->nh_vp != vp) {
nh->nh_vp = vp;
nh->nh_nextoff = uio->uio_offset;
nh->nh_use = NHUSE_INIT;
if (uio->uio_offset == 0)
nh->nh_seqcount = 4;
else
nh->nh_seqcount = 1;
}
/* Calculate heuristic. */
if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
uio->uio_offset == nh->nh_nextoff) {
/* See comments in vfs_vnops.c:sequential_heuristic(). */
nh->nh_seqcount += howmany(uio->uio_resid, 16384);
if (nh->nh_seqcount > IO_SEQMAX)
nh->nh_seqcount = IO_SEQMAX;
} else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC *
imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) {
/* Probably a reordered RPC, leave seqcount alone. */
} else if (nh->nh_seqcount > 1) {
nh->nh_seqcount /= 2;
} else {
nh->nh_seqcount = 0;
}
nh->nh_use += NHUSE_INC;
if (nh->nh_use > NHUSE_MAX)
nh->nh_use = NHUSE_MAX;
return (nh);
}
/*
* Get attributes into nfsvattr structure.
*/
int
nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, int vpislocked)
{
int error, lockedit = 0;
if (vpislocked == 0) {
/*
* When vpislocked == 0, the vnode is either exclusively
* locked by this thread or not locked by this thread at all.
* As such, take a shared lock on it unless it is already
* exclusively locked.
*/
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
lockedit = 1;
NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
}
}
error = VOP_GETATTR(vp, &nvap->na_vattr, cred);
if (lockedit != 0)
NFSVOPUNLOCK(vp, 0);
NFSEXITCODE(error);
return (error);
}
/*
* Get a file handle for a vnode.
*/
int
nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
{
int error;
NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fhp->fh_fid);
NFSEXITCODE(error);
return (error);
}
/*
* Perform access checking for vnodes obtained from file handles that would
* refer to files already opened by a Unix client. You cannot just use
* vn_writechk() and VOP_ACCESSX() for two reasons.
* 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
* case.
* 2 - The owner is to be given access irrespective of mode bits for some
* operations, so that processes that chmod after opening a file don't
* break.
*/
int
nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
u_int32_t *supportedtypep)
{
struct vattr vattr;
int error = 0, getret = 0;
if (vpislocked == 0) {
if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
error = EPERM;
goto out;
}
}
if (accmode & VWRITE) {
/* Just vn_writechk() changed to check rdonly */
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket or a block or character
* device resident on the file system.
*/
if (NFSVNO_EXRDONLY(exp) ||
(vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
error = EROFS;
default:
break;
}
}
/*
* If there's shared text associated with
* the inode, try to free it up once. If
* we fail, we can't allow writing.
*/
if (VOP_IS_TEXT(vp) && error == 0)
error = ETXTBSY;
}
if (error != 0) {
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
goto out;
}
/*
* Should the override still be applied when ACLs are enabled?
*/
error = VOP_ACCESSX(vp, accmode, cred, p);
if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
/*
* Try again with VEXPLICIT_DENY, to see if the test for
* deletion is supported.
*/
error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
if (error == 0) {
if (vp->v_type == VDIR) {
accmode &= ~(VDELETE | VDELETE_CHILD);
accmode |= VWRITE;
error = VOP_ACCESSX(vp, accmode, cred, p);
} else if (supportedtypep != NULL) {
*supportedtypep &= ~NFSACCESS_DELETE;
}
}
}
/*
* Allow certain operations for the owner (reads and writes
* on files that are already open).
*/
if (override != NFSACCCHK_NOOVERRIDE &&
(error == EPERM || error == EACCES)) {
if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
error = 0;
else if (override & NFSACCCHK_ALLOWOWNER) {
getret = VOP_GETATTR(vp, &vattr, cred);
if (getret == 0 && cred->cr_uid == vattr.va_uid)
error = 0;
}
}
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Set attribute(s) vnop.
*/
int
nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
int error;
error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
NFSEXITCODE(error);
return (error);
}
/*
* Set up nameidata for a lookup() call and do it.
*/
int
nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
struct vnode **retdirp)
{
struct componentname *cnp = &ndp->ni_cnd;
int i;
struct iovec aiov;
struct uio auio;
int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
- int error = 0, crossmnt;
+ int error = 0;
char *cp;
*retdirp = NULL;
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_lcf = 0;
/*
* Extract and set starting directory.
*/
if (dp->v_type != VDIR) {
if (islocked)
vput(dp);
else
vrele(dp);
nfsvno_relpathbuf(ndp);
error = ENOTDIR;
goto out1;
}
if (islocked)
NFSVOPUNLOCK(dp, 0);
VREF(dp);
*retdirp = dp;
if (NFSVNO_EXRDONLY(exp))
cnp->cn_flags |= RDONLY;
ndp->ni_segflg = UIO_SYSSPACE;
- crossmnt = 1;
if (nd->nd_flag & ND_PUBLOOKUP) {
ndp->ni_loopcnt = 0;
if (cnp->cn_pnbuf[0] == '/') {
vrele(dp);
/*
* Check for degenerate pathnames here, since lookup()
* panics on them.
*/
for (i = 1; i < ndp->ni_pathlen; i++)
if (cnp->cn_pnbuf[i] != '/')
break;
if (i == ndp->ni_pathlen) {
error = NFSERR_ACCES;
goto out;
}
dp = rootvnode;
VREF(dp);
}
} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
(nd->nd_flag & ND_NFSV4) == 0) {
/*
* Only cross mount points for NFSv4 when doing a
* mount while traversing the file system above
* the mount point, unless nfsrv_enable_crossmntpt is set.
*/
cnp->cn_flags |= NOCROSSMOUNT;
- crossmnt = 0;
}
/*
* Initialize for scan, set ni_startdir and bump ref on dp again
* because lookup() will dereference ni_startdir.
*/
cnp->cn_thread = p;
ndp->ni_startdir = dp;
ndp->ni_rootdir = rootvnode;
ndp->ni_topdir = NULL;
if (!lockleaf)
cnp->cn_flags |= LOCKLEAF;
for (;;) {
cnp->cn_nameptr = cnp->cn_pnbuf;
/*
* Call lookup() to do the real work. If an error occurs,
* ndp->ni_vp and ni_dvp are left uninitialized or NULL and
* we do not have to dereference anything before returning.
* In either case ni_startdir will be dereferenced and NULLed
* out.
*/
error = lookup(ndp);
if (error)
break;
/*
* Check for encountering a symbolic link. Trivial
* termination occurs if no symlink encountered.
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
nfsvno_relpathbuf(ndp);
if (ndp->ni_vp && !lockleaf)
NFSVOPUNLOCK(ndp->ni_vp, 0);
break;
}
/*
* Validate symlink
*/
if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
NFSVOPUNLOCK(ndp->ni_dvp, 0);
if (!(nd->nd_flag & ND_PUBLOOKUP)) {
error = EINVAL;
goto badlink2;
}
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
error = ELOOP;
goto badlink2;
}
if (ndp->ni_pathlen > 1)
cp = uma_zalloc(namei_zone, M_WAITOK);
else
cp = cnp->cn_pnbuf;
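/*
 * Read the link text into cp with a single-segment SYSSPACE uio,
 * then splice it into the remaining path below.
 */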
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = NULL;
auio.uio_resid = MAXPATHLEN;
error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
if (error) {
badlink1:
if (ndp->ni_pathlen > 1)
uma_zfree(namei_zone, cp);
badlink2:
vrele(ndp->ni_dvp);
vput(ndp->ni_vp);
break;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
error = ENOENT;
goto badlink1;
}
if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto badlink1;
}
/*
* Adjust or replace path
*/
if (ndp->ni_pathlen > 1) {
NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
uma_zfree(namei_zone, cnp->cn_pnbuf);
cnp->cn_pnbuf = cp;
} else
cnp->cn_pnbuf[linklen] = '\0';
ndp->ni_pathlen += linklen;
/*
* Cleanup refs for next loop and check if root directory
* should replace current directory. Normally ni_dvp
* becomes the new base directory and is cleaned up when
* we loop. Explicitly null pointers after invalidation
* to clarify operation.
*/
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
if (cnp->cn_pnbuf[0] == '/') {
vrele(ndp->ni_dvp);
ndp->ni_dvp = ndp->ni_rootdir;
VREF(ndp->ni_dvp);
}
ndp->ni_startdir = ndp->ni_dvp;
ndp->ni_dvp = NULL;
}
if (!lockleaf)
cnp->cn_flags &= ~LOCKLEAF;
out:
if (error) {
nfsvno_relpathbuf(ndp);
ndp->ni_vp = NULL;
ndp->ni_dvp = NULL;
ndp->ni_startdir = NULL;
} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
ndp->ni_dvp = NULL;
}
out1:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Set up a pathname buffer and return a pointer to it and, optionally
* set a hash pointer.
*/
void
nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
{
struct componentname *cnp = &ndp->ni_cnd;
cnp->cn_flags |= (NOMACCHECK | HASBUF);
cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
if (hashpp != NULL)
*hashpp = NULL;
*bufpp = cnp->cn_pnbuf;
}
/*
* Release the above path buffer, if not released by nfsvno_namei().
*/
void
nfsvno_relpathbuf(struct nameidata *ndp)
{
if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
panic("nfsrelpath");
uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
ndp->ni_cnd.cn_flags &= ~HASBUF;
}
/*
* Readlink vnode op into an mbuf list.
*/
int
nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
{
struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
struct iovec *ivp = iv;
struct uio io, *uiop = &io;
struct mbuf *mp, *mp2 = NULL, *mp3 = NULL;
int i, len, tlen, error = 0;
len = 0;
i = 0;
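/*
 * Build an mbuf cluster chain, with a matching iovec entry per mbuf,
 * large enough to hold NFS_MAXPATHLEN bytes of link text.
 */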
while (len < NFS_MAXPATHLEN) {
NFSMGET(mp);
MCLGET(mp, M_WAITOK);
mp->m_len = M_SIZE(mp);
if (len == 0) {
mp3 = mp2 = mp;
} else {
mp2->m_next = mp;
mp2 = mp;
}
if ((len + mp->m_len) > NFS_MAXPATHLEN) {
mp->m_len = NFS_MAXPATHLEN - len;
len = NFS_MAXPATHLEN;
} else {
len += mp->m_len;
}
ivp->iov_base = mtod(mp, caddr_t);
ivp->iov_len = mp->m_len;
i++;
ivp++;
}
uiop->uio_iov = iv;
uiop->uio_iovcnt = i;
uiop->uio_offset = 0;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
error = VOP_READLINK(vp, uiop, cred);
if (error) {
m_freem(mp3);
*lenp = 0;
goto out;
}
if (uiop->uio_resid > 0) {
len -= uiop->uio_resid;
tlen = NFSM_RNDUP(len);
nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
}
*lenp = len;
*mpp = mp3;
*mpendp = mp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Read vnode op call into mbuf list.
*/
int
nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
{
struct mbuf *m;
int i;
struct iovec *iv;
struct iovec *iv2;
int error = 0, len, left, siz, tlen, ioflag = 0;
struct mbuf *m2 = NULL, *m3;
struct uio io, *uiop = &io;
struct nfsheur *nh;
len = left = NFSM_RNDUP(cnt);
m3 = NULL;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
i = 0;
while (left > 0) {
NFSMGET(m);
MCLGET(m, M_WAITOK);
m->m_len = 0;
siz = min(M_TRAILINGSPACE(m), left);
left -= siz;
i++;
if (m3)
m2->m_next = m;
else
m3 = m;
m2 = m;
}
MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
M_TEMP, M_WAITOK);
uiop->uio_iov = iv2 = iv;
m = m3;
left = len;
i = 0;
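/*
 * Point one iovec at the trailing cluster space of each mbuf and
 * bump m_len to account for the data VOP_READ() will place there.
 */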
while (left > 0) {
if (m == NULL)
panic("nfsvno_read iov");
siz = min(M_TRAILINGSPACE(m), left);
if (siz > 0) {
iv->iov_base = mtod(m, caddr_t) + m->m_len;
iv->iov_len = siz;
m->m_len += siz;
left -= siz;
iv++;
i++;
}
m = m->m_next;
}
uiop->uio_iovcnt = i;
uiop->uio_offset = off;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
nh = nfsrv_sequential_heuristic(uiop, vp);
ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
/* XXX KDM make this more systematic? */
nfsstatsv1.srvbytes[NFSV4OP_READ] += uiop->uio_resid;
error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
FREE((caddr_t)iv2, M_TEMP);
if (error) {
m_freem(m3);
*mpp = NULL;
goto out;
}
nh->nh_nextoff = uiop->uio_offset;
tlen = len - uiop->uio_resid;
cnt = cnt < tlen ? cnt : tlen;
tlen = NFSM_RNDUP(cnt);
if (tlen == 0) {
m_freem(m3);
m3 = NULL;
} else if (len != tlen || tlen != cnt)
nfsrv_adj(m3, len - tlen, tlen - cnt);
*mpp = m3;
*mpendp = m2;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Write vnode op from an mbuf list.
*/
int
nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable,
struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
{
struct iovec *ivp;
int i, len;
struct iovec *iv;
int ioflags, error;
struct uio io, *uiop = &io;
struct nfsheur *nh;
MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
M_WAITOK);
uiop->uio_iov = iv = ivp;
uiop->uio_iovcnt = cnt;
i = mtod(mp, caddr_t) + mp->m_len - cp;
len = retlen;
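/*
 * Walk the request mbuf chain, starting at cp within the first
 * mbuf, building one iovec per mbuf until retlen bytes are covered.
 */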
while (len > 0) {
if (mp == NULL)
panic("nfsvno_write");
if (i > 0) {
i = min(i, len);
ivp->iov_base = cp;
ivp->iov_len = i;
ivp++;
len -= i;
}
mp = mp->m_next;
if (mp) {
i = mp->m_len;
cp = mtod(mp, caddr_t);
}
}
if (stable == NFSWRITE_UNSTABLE)
ioflags = IO_NODELOCKED;
else
ioflags = (IO_SYNC | IO_NODELOCKED);
uiop->uio_resid = retlen;
uiop->uio_rw = UIO_WRITE;
uiop->uio_segflg = UIO_SYSSPACE;
NFSUIOPROC(uiop, p);
uiop->uio_offset = off;
nh = nfsrv_sequential_heuristic(uiop, vp);
ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
/* XXX KDM make this more systematic? */
nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid;
error = VOP_WRITE(vp, uiop, ioflags, cred);
if (error == 0)
nh->nh_nextoff = uiop->uio_offset;
FREE((caddr_t)iv, M_TEMP);
NFSEXITCODE(error);
return (error);
}
/*
* Common code for creating a regular file (plus special files for V2).
*/
int
nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp)
{
u_quad_t tempsize;
int error;
error = nd->nd_repstat;
if (!error && ndp->ni_vp == NULL) {
if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!error) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
error = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, nd->nd_cred);
if (error != 0) {
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
error = NFSERR_NOTSUPP;
}
}
}
/*
* NFS V2 Only. nfsrvd_mknod() does this for V3.
* (This implies we can just get out on an error.)
*/
} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
nvap->na_type == VFIFO) {
if (nvap->na_type == VCHR && rdev == 0xffffffff)
nvap->na_type = VFIFO;
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(nd->nd_cred,
PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
nvap->na_rdev = rdev;
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
if (error)
goto out;
} else {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = ENXIO;
goto out;
}
*vpp = ndp->ni_vp;
} else {
/*
* Handle cases where error is already set and/or
* the file exists.
* 1 - clean up the lookup
* 2 - iff !error and na_size set, truncate it
*/
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
*vpp = ndp->ni_vp;
if (ndp->ni_dvp == *vpp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (!error && nvap->na_size != VNOVAL) {
error = nfsvno_accchk(*vpp, VWRITE,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (!error) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
error = VOP_SETATTR(*vpp,
&nvap->na_vattr, nd->nd_cred);
}
}
if (error)
vput(*vpp);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Do a mknod vnode op.
*/
int
nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p)
{
int error = 0;
enum vtype vtyp;
vtyp = nvap->na_type;
/*
* Iff doesn't exist, create it.
*/
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = NFSERR_BADTYPE;
goto out;
}
if (vtyp == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
} else {
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
/*
* Since VOP_MKNOD returns the ni_vp, I can't
* see any reason to do the lookup.
*/
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Mkdir vnode op.
*/
int
nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp != NULL) {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
nfsvno_relpathbuf(ndp);
error = EEXIST;
goto out;
}
error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* symlink vnode op.
*/
int
nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr, pathcp);
vput(ndp->ni_dvp);
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
/*
* Although FreeBSD still had the lookup code in
* it for 7/current, there doesn't seem to be any
* point, since VOP_SYMLINK() returns the ni_vp.
* Just vput it for v2.
*/
if (!not_v2 && !error)
vput(ndp->ni_vp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Parse symbolic link arguments.
* This function has an ugly side effect. It will MALLOC() an area for
* the symlink and set iov_base to point to it, only if it succeeds.
* So, if it returns with uiop->uio_iov->iov_base != NULL, that must
* be FREE'd later.
*/
int
nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
struct thread *p, char **pathcpp, int *lenp)
{
u_int32_t *tl;
char *pathcp = NULL;
int error = 0, len;
struct nfsv2_sattr *sp;
*pathcpp = NULL;
*lenp = 0;
if ((nd->nd_flag & ND_NFSV3) &&
(error = nfsrv_sattr(nd, NULL, nvap, NULL, NULL, p)))
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len > NFS_MAXPATHLEN || len <= 0) {
error = EBADRPC;
goto nfsmout;
}
MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, pathcp, len);
if (error)
goto nfsmout;
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
}
*pathcpp = pathcp;
*lenp = len;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (pathcp)
free(pathcp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Remove a non-directory object.
*/
int
nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type == VDIR)
error = NFSERR_ISDIR;
else if (is_v4)
error = nfsrv_checkremove(vp, 1, p);
if (!error)
error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Remove a directory.
*/
int
nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (ndp->ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
out:
if (!error)
error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Rename vnode op.
*/
int
nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
{
struct vnode *fvp, *tvp, *tdvp;
int error = 0;
fvp = fromndp->ni_vp;
if (ndstat) {
vrele(fromndp->ni_dvp);
vrele(fvp);
error = ndstat;
goto out1;
}
tdvp = tondp->ni_dvp;
tvp = tondp->ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
goto out;
}
if (tvp->v_type == VDIR && tvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
/*
* A rename to '.' or '..' results in a prematurely
* unlocked vnode on FreeBSD5, so I'm just going to fail that
* here.
*/
if ((tondp->ni_cnd.cn_namelen == 1 &&
tondp->ni_cnd.cn_nameptr[0] == '.') ||
(tondp->ni_cnd.cn_namelen == 2 &&
tondp->ni_cnd.cn_nameptr[0] == '.' &&
tondp->ni_cnd.cn_nameptr[1] == '.')) {
error = EINVAL;
goto out;
}
}
if (fvp->v_type == VDIR && fvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp->v_mount != tdvp->v_mount) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp == tdvp) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
goto out;
}
if (fvp == tvp) {
/*
* If source and destination are the same, there is nothing to
* do. Set error to -1 to indicate this.
*/
error = -1;
goto out;
}
if (ndflag & ND_NFSV4) {
if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(fvp, 0, p);
NFSVOPUNLOCK(fvp, 0);
} else
error = EPERM;
if (tvp && !error)
error = nfsrv_checkremove(tvp, 1, p);
} else {
/*
* For NFSv2 and NFSv3, try to get rid of the delegation, so
* that the NFSv4 client won't be confused by the rename.
* Since nfsd_recalldelegation() can only be called on an
* unlocked vnode at this point and fvp is the file that will
* still exist after the rename, just do fvp.
*/
nfsd_recalldelegation(fvp, p);
}
out:
if (!error) {
error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
&fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
&tondp->ni_cnd);
} else {
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
vrele(fromndp->ni_dvp);
vrele(fvp);
if (error == -1)
error = 0;
}
vrele(tondp->ni_startdir);
nfsvno_relpathbuf(tondp);
out1:
vrele(fromndp->ni_startdir);
nfsvno_relpathbuf(fromndp);
NFSEXITCODE(error);
return (error);
}
/*
* Link vnode op.
*/
int
nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *xp;
int error = 0;
xp = ndp->ni_vp;
if (xp != NULL) {
error = EEXIST;
} else {
xp = ndp->ni_dvp;
if (vp->v_mount != xp->v_mount)
error = EXDEV;
}
if (!error) {
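/*
 * Relock vp exclusively for VOP_LINK() and make sure it was not
 * doomed (forcibly unmounted or recycled) while it was unlocked.
 */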
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) == 0)
error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
else
error = EPERM;
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
NFSVOPUNLOCK(vp, 0);
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vrele(ndp->ni_vp);
}
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Do the fsync() appropriate for the commit.
*/
int
nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
struct thread *td)
{
int error = 0;
/*
* RFC 1813 3.3.21: if count is 0, a flush from offset to the end of
* the file is done. At this time VOP_FSYNC does not accept offset and
* byte count parameters, so call VOP_FSYNC() on the whole file for now.
* The same is true for NFSv4: RFC 3530 Sec. 14.2.3.
* File systems that do not use the buffer cache (as indicated
* by MNTK_USES_BCACHE not being set) must use VOP_FSYNC().
*/
if (cnt == 0 || cnt > MAX_COMMIT_COUNT ||
(vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) {
/*
* Give up and do the whole thing
*/
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
} else {
/*
* Locate and synchronously write any buffers that fall
* into the requested range. Note: we are assuming that
* f_iosize is a power of 2.
*/
int iosize = vp->v_mount->mnt_stat.f_iosize;
int iomask = iosize - 1;
struct bufobj *bo;
daddr_t lblkno;
/*
* Align to iosize boundary, super-align to page boundary.
*/
if (off & iomask) {
cnt += off & iomask;
off &= ~(u_quad_t)iomask;
}
if (off & PAGE_MASK) {
cnt += off & PAGE_MASK;
off &= ~(u_quad_t)PAGE_MASK;
}
lblkno = off / iosize;
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, off, off + cnt,
OBJPC_SYNC);
VM_OBJECT_WUNLOCK(vp->v_object);
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
while (cnt > 0) {
struct buf *bp;
/*
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*
* gbincore() can return invalid buffers now so we
* have to check that bit as well (though B_DELWRI
* should not be set if B_INVAL is set there could be
* a race here since we haven't locked the buffer).
*/
if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) {
BO_LOCK(bo);
continue; /* retry */
}
if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
B_DELWRI) {
bremfree(bp);
bp->b_flags &= ~B_ASYNC;
bwrite(bp);
++nfs_commit_miss;
} else
BUF_UNLOCK(bp);
BO_LOCK(bo);
}
++nfs_commit_blks;
if (cnt < iosize)
break;
cnt -= iosize;
++lblkno;
}
BO_UNLOCK(bo);
}
NFSEXITCODE(error);
return (error);
}
/*
* Statfs vnode op.
*/
int
nfsvno_statfs(struct vnode *vp, struct statfs *sf)
{
int error;
error = VFS_STATFS(vp->v_mount, sf);
if (error == 0) {
/*
* Since NFS handles these values as unsigned on the
* wire, there is no way to represent negative values,
* so set them to 0. Without this, they will appear
* to be very large positive values for clients like
* Solaris10.
*/
if (sf->f_bavail < 0)
sf->f_bavail = 0;
if (sf->f_ffree < 0)
sf->f_ffree = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
* must handle nfsrv_opencheck() calls after any other access checks.
*/
void
nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp, struct vnode **vpp)
{
struct vnode *vp = NULL;
u_quad_t tempsize;
struct nfsexstuff nes;
if (ndp->ni_vp == NULL)
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, NULL, nd, p, nd->nd_repstat);
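/*
 * If the open state checks passed, either create the file (it does
 * not exist yet) or apply the size/attribute handling and a second
 * nfsrv_opencheck() against the existing vnode.
 */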
if (!nd->nd_repstat) {
if (ndp->ni_vp == NULL) {
vrele(ndp->ni_startdir);
nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!nd->nd_repstat) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, cred);
if (nd->nd_repstat != 0) {
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
nd->nd_repstat = NFSERR_NOTSUPP;
} else
NFSSETBIT_ATTRBIT(attrbitp,
NFSATTRBIT_TIMEACCESS);
} else {
nfsrv_fixattr(nd, ndp->ni_vp, nvap,
aclp, p, attrbitp, exp);
}
}
vp = ndp->ni_vp;
} else {
if (ndp->ni_startdir)
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vp = ndp->ni_vp;
if (create == NFSV4OPEN_CREATE) {
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
}
if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
if (ndp->ni_cnd.cn_flags & RDONLY)
NFSVNO_SETEXRDONLY(&nes);
else
NFSVNO_EXINIT(&nes);
nd->nd_repstat = nfsvno_accchk(vp,
VWRITE, cred, &nes, p,
NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
nd->nd_repstat = VOP_SETATTR(vp,
&nvap->na_vattr, cred);
}
} else if (vp->v_type == VREG) {
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
}
}
} else {
if (ndp->ni_cnd.cn_flags & HASBUF)
nfsvno_relpathbuf(ndp);
if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
vrele(ndp->ni_startdir);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vput(ndp->ni_vp);
}
}
*vpp = vp;
NFSEXITCODE2(0, nd);
}
/*
* Updates the file rev and sets the mtime and ctime
* to the current clock time, returning the va_filerev and va_Xtime
* values.
* Return ESTALE to indicate the vnode is VI_DOOMED.
*/
int
nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
struct ucred *cred, struct thread *p)
{
struct vattr va;
VATTR_NULL(&va);
vfs_timestamp(&va.va_mtime);
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0)
return (ESTALE);
}
(void) VOP_SETATTR(vp, &va, cred);
(void) nfsvno_getattr(vp, nvap, cred, p, 1);
return (0);
}
/*
* Glue routine to nfsv4_fillattr().
*/
int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int error;
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
mounted_on_fileno);
NFSEXITCODE2(0, nd);
return (error);
}
/* Since the Readdir vnode ops vary, put the entire functions in here. */
/*
* nfs readdir service
* - mallocs what it thinks is enough to read
* count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
* - calls VOP_READDIR()
* - loops around building the reply
* if the output generated exceeds count break out of loop
* The NFSM_CLGET macro is used here so that the reply will be packed
* tightly in mbuf clusters.
* - it trims out records with d_fileno == 0
* this doesn't matter for Unix clients, but they might confuse clients
* for other OSes.
* - it trims out records with d_type == DT_WHT
* these cannot be seen through NFS (unless we extend the protocol)
* The alternate call nfsrvd_readdirplus() does lookups as well.
* PS: The NFS protocol spec does not clarify what the "count" byte
* argument is a count of: just name strings and file ids, the
* entire reply RPC, or something else.
* I tried just file name and id sizes and it confused the Sun client,
* so I am using the full RPC size now. The "paranoia.." comment refers
* to including the status longwords that are not a part of the dir.
* "entry" structures, but are in the RPC.
*/
int
nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct nfsvattr at;
int nlen, error = 0, getret = 1;
int siz, cnt, fullsiz, eofflag, ncookies;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
struct uio io;
struct iovec iv;
int is_ufs;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
off = fxdr_unsigned(u_quad_t, *tl++);
} else {
NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
}
toff = off;
cnt = fxdr_unsigned(int, *tl);
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
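/* Round the VOP_READDIR() request size up to a multiple of DIRBLKSIZ. */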
siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
fullsiz = siz;
if (nd->nd_flag & ND_NFSV3) {
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred,
p, 1);
#if 0
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
if (!nd->nd_repstat && toff && verf != at.na_filerev)
nd->nd_repstat = NFSERR_BAD_COOKIE;
#endif
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (nd->nd_repstat == 0 && cnt == 0) {
if (nd->nd_flag & ND_NFSV2)
/* NFSv2 does not have NFSERR_TOOSMALL */
nd->nd_repstat = EPERM;
else
nd->nd_repstat = NFSERR_TOOSMALL;
}
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (nd->nd_flag & ND_NFSV3) {
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat)
nd->nd_repstat = getret;
}
/*
* Handles the failed cases. nd->nd_repstat == 0 past here.
*/
if (nd->nd_repstat) {
vput(vp);
free((caddr_t)rbuf, M_TEMP);
if (cookies)
free((caddr_t)cookies, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV2) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
} else {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
}
*tl++ = newnfs_false;
*tl = newnfs_true;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
vput(vp);
/*
* dirlen is the size of the reply, including all XDR and must
* not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
* if the XDR should be included in "count", but to be safe, we do.
* (Include the two booleans at the end of the reply in dirlen now.)
*/
if (nd->nd_flag & ND_NFSV3) {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
} else {
dirlen = 2 * NFSX_UNSIGNED;
}
/* Loop through the records and build reply */
while (cpos < cend && ncookies > 0) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN) {
if (nd->nd_flag & ND_NFSV3)
dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
else
dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
if (dirlen > cnt) {
eofflag = 0;
break;
}
/*
* Build the directory record xdr from
* the dirent entry.
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
} else {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
}
*tl = txdr_unsigned(dp->d_fileno);
(void) nfsm_strtom(nd, dp->d_name, nlen);
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
} else
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(*cookiep);
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos < cend)
eofflag = 0;
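/* Terminate the entry list (value-follows == FALSE) and add the eof flag. */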
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Readdirplus for V3 and Readdir for V4.
*/
int
nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct vnode *nvp;
fhandle_t nfh;
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
int nlen, r, error = 0, getret = 1, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
nfsattrbit_t attrbits, rderrbits, savbits;
struct uio io;
struct iovec iv;
struct componentname cn;
int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls;
struct mount *mp, *new_mp;
uint64_t mounted_on_fileno;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
toff = off;
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
siz = fxdr_unsigned(int, *tl++);
cnt = fxdr_unsigned(int, *tl);
/*
* Use the server's maximum data transfer size as the upper bound
* on reply datalen.
*/
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
/*
* siz is a "hint" of how much directory information (name, fileid,
* cookie) should be in the reply. At least one client "hints" 0,
* so I set it to cnt for that case. I also round it up to the
* next multiple of DIRBLKSIZ.
*/
if (siz <= 0)
siz = cnt;
siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
if (nd->nd_flag & ND_NFSV4) {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
if (error)
goto nfsmout;
NFSSET_ATTRBIT(&savbits, &attrbits);
NFSCLRNOTFILLABLE_ATTRBIT(&attrbits);
NFSZERO_ATTRBIT(&rderrbits);
NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
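/*
 * savbits keeps the client's original request, attrbits is trimmed
 * to the attributes that can actually be filled in, and rderrbits
 * carries only rdattr_error for entries whose attributes cannot be
 * retrieved.
 */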
} else {
NFSZERO_ATTRBIT(&attrbits);
}
fullsiz = siz;
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat) {
if (off && verf != at.na_filerev) {
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
#if 0
if (nd->nd_flag & ND_NFSV4) {
nd->nd_repstat = NFSERR_NOTSAME;
} else {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
#endif
} else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (!nd->nd_repstat && cnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0;
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (!nd->nd_repstat)
nd->nd_repstat = getret;
if (nd->nd_repstat) {
vput(vp);
if (cookies)
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
*tl++ = newnfs_false;
*tl = newnfs_true;
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) ||
((nd->nd_flag & ND_NFSV4) &&
((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
/*
* Busy the file system so that the mount point won't go away
* and, as such, VFS_VGET() can be used safely.
*/
mp = vp->v_mount;
vfs_ref(mp);
NFSVOPUNLOCK(vp, 0);
nd->nd_repstat = vfs_busy(mp, 0);
vfs_rel(mp);
if (nd->nd_repstat != 0) {
vrele(vp);
free(cookies, M_TEMP);
free(rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* Check to see if entries in this directory can be safely acquired
* via VFS_VGET() or if a switch to VOP_LOOKUP() is required.
* ZFS snapshot directories need VOP_LOOKUP(), so that any
* automount of the snapshot directory that is required will
* be done.
* This needs to be done here for NFSv4, since NFSv4 never does
* a VFS_VGET() for "." or "..".
*/
if (is_zfs == 1) {
r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp);
if (r == EOPNOTSUPP) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags = LK_SHARED | LK_RETRY;
cn.cn_cred = nd->nd_cred;
cn.cn_thread = p;
} else if (r == 0)
vput(nvp);
}
/*
* Save this position, in case there is an error before one entry
* is created.
*/
mb0 = nd->nd_mb;
bpos0 = nd->nd_bpos;
/*
* Fill in the first part of the reply.
* dirlen is the reply length in bytes and cannot exceed cnt.
* (Include the two booleans at the end of the reply in dirlen now,
* so we recognize when we have exceeded cnt.)
*/
if (nd->nd_flag & ND_NFSV3) {
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
nfsrv_postopattr(nd, getret, &at);
} else {
dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
txdr_hyper(at.na_filerev, tl);
/*
* Save this position, in case there is an empty reply needed.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/* Loop through the records and build reply */
entrycnt = 0;
while (cpos < cend && ncookies > 0 && dirlen < cnt) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN &&
((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
(nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
|| (nlen == 1 && dp->d_name[0] != '.'))) {
/*
* Save the current position in the reply, in case
* this entry exceeds cnt.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/*
* For readdir_and_lookup get the vnode using
* the file number.
*/
nvp = NULL;
refp = NULL;
r = 0;
at_root = 0;
needs_unbusy = 0;
new_mp = mp;
mounted_on_fileno = (uint64_t)dp->d_fileno;
if ((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&savbits)) {
if (nd->nd_flag & ND_NFSV4)
refp = nfsv4root_getreferral(NULL,
vp, dp->d_fileno);
if (refp == NULL) {
if (usevget)
r = VFS_VGET(mp, dp->d_fileno,
LK_SHARED, &nvp);
else
r = EOPNOTSUPP;
if (r == EOPNOTSUPP) {
if (usevget) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags =
LK_SHARED |
LK_RETRY;
cn.cn_cred =
nd->nd_cred;
cn.cn_thread = p;
}
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = nlen;
cn.cn_flags = ISLASTCN |
NOFOLLOW | LOCKLEAF;
if (nlen == 2 &&
dp->d_name[0] == '.' &&
dp->d_name[1] == '.')
cn.cn_flags |=
ISDOTDOT;
if (NFSVOPLOCK(vp, LK_SHARED)
!= 0) {
nd->nd_repstat = EPERM;
break;
}
if ((vp->v_vflag & VV_ROOT) != 0
&& (cn.cn_flags & ISDOTDOT)
!= 0) {
vref(vp);
nvp = vp;
r = 0;
} else {
r = VOP_LOOKUP(vp, &nvp,
&cn);
if (vp != nvp)
NFSVOPUNLOCK(vp,
0);
}
}
/*
* For NFSv4, check to see if nvp is
* a mount point and get the mount
* point vnode, as required.
*/
if (r == 0 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
nvp->v_mountedhere != NULL) {
new_mp = nvp->v_mountedhere;
r = vfs_busy(new_mp, 0);
vput(nvp);
nvp = NULL;
if (r == 0) {
r = VFS_ROOT(new_mp,
LK_SHARED, &nvp);
needs_unbusy = 1;
if (r == 0)
at_root = 1;
}
}
}
if (!r) {
if (refp == NULL &&
((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&attrbits))) {
r = nfsvno_getfh(nvp, &nfh, p);
if (!r)
r = nfsvno_getattr(nvp, nvap,
nd->nd_cred, p, 1);
if (r == 0 && is_zfs == 1 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
vp->v_mount != nvp->v_mount) {
/*
* For a ZFS snapshot, there is a
* pseudo mount that does not set
* v_mountedhere, so it needs to
* be detected via a different
* mount structure.
*/
at_root = 1;
if (new_mp == mp)
new_mp = nvp->v_mount;
}
}
} else {
nvp = NULL;
}
if (r) {
if (!NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_RDATTRERROR)) {
if (nvp != NULL)
vput(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
nd->nd_repstat = r;
break;
}
}
}
/*
* Build the directory record xdr
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(dp->d_fileno);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
nfsrv_postopattr(nd, 0, nvap);
dirlen += nfsm_fhtom(nd, (u_int8_t *)&nfh, 0, 1);
dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
if (nvp != NULL)
vput(nvp);
} else {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
NFSVOPUNLOCK(nvp, 0);
} else
supports_nfsv4acls = 0;
if (refp != NULL) {
dirlen += nfsrv_putreferralattr(nd,
&savbits, refp, 0,
&nd->nd_repstat);
if (nd->nd_repstat) {
if (nvp != NULL)
vrele(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
break;
}
} else if (r) {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
}
if (nvp != NULL)
vrele(nvp);
dirlen += (3 * NFSX_UNSIGNED);
}
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
if (dirlen <= cnt)
entrycnt++;
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
vrele(vp);
vfs_unbusy(mp);
/*
* If dirlen > cnt, we must strip off the last entry. If that
* results in an empty reply, report NFSERR_TOOSMALL.
*/
if (dirlen > cnt || nd->nd_repstat) {
if (!nd->nd_repstat && entrycnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (nd->nd_repstat) {
newnfs_trimtrailing(nd, mb0, bpos0);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
} else
newnfs_trimtrailing(nd, mb1, bpos1);
eofflag = 0;
} else if (cpos < cend)
eofflag = 0;
if (!nd->nd_repstat) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
}
FREE((caddr_t)cookies, M_TEMP);
FREE((caddr_t)rbuf, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Get the settable attributes out of the mbuf list.
* (Return 0 or EBADRPC)
*/
int
nfsrv_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
struct nfsv2_sattr *sp;
int error = 0, toclient = 0;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
/*
* Some old clients didn't fill in the high order 16 bits.
* --> check the low order 2 bytes for 0xffff
*/
if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
nvap->na_mode = nfstov_mode(sp->sa_mode);
if (sp->sa_uid != newnfs_xdrneg1)
nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
if (sp->sa_gid != newnfs_xdrneg1)
nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
if (sp->sa_size != newnfs_xdrneg1)
nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
#ifdef notyet
fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
#else
nvap->na_atime.tv_sec =
fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
nvap->na_atime.tv_nsec = 0;
#endif
}
if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
break;
case ND_NFSV3:
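/*
 * Each NFSv3 sattr3 field is preceded by a boolean (or a time
 * discriminant) saying whether the client supplied a value.
 */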
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_uid = fxdr_unsigned(uid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_gid = fxdr_unsigned(gid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
nvap->na_size = fxdr_hyper(tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_atime);
toclient = 1;
break;
case NFSV3SATTRTIME_TOSERVER:
vfs_timestamp(&nvap->na_atime);
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
break;
case NFSV3SATTRTIME_TOSERVER:
vfs_timestamp(&nvap->na_mtime);
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
}
break;
case ND_NFSV4:
error = nfsv4_sattr(nd, vp, nvap, attrbitp, aclp, p);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Handle the settable attributes for V4.
* Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
*/
int
nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
int attrsum = 0;
int i, j;
int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
int toclient = 0;
u_char *cp, namestr[NFSV4_SMALLSTR + 1];
uid_t uid;
gid_t gid;
error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
/*
* Loop around getting the settable attributes. If an unsupported
* one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return.
*/
if (retnotsup) {
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
bitpos = NFSATTRBIT_MAX;
} else {
bitpos = 0;
}
for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(attrbitp, bitpos))
switch (bitpos) {
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (vp != NULL && vp->v_type != VREG) {
error = (vp->v_type == VDIR) ? NFSERR_ISDIR :
NFSERR_INVAL;
goto nfsmout;
}
nvap->na_size = fxdr_hyper(tl);
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_ACL:
error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
p);
if (error)
goto nfsmout;
if (aceerr && !nd->nd_repstat)
nd->nd_repstat = aceerr;
attrsum += aclsize;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtouid(nd, cp, j, &uid,
p);
if (!nd->nd_repstat)
nvap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtogid(nd, cp, j, &gid,
p);
if (!nd->nd_repstat)
nvap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_atime);
toclient = 1;
attrsum += NFSX_V4TIME;
} else {
vfs_timestamp(&nvap->na_atime);
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
attrsum += NFSX_V4TIME;
} else {
vfs_timestamp(&nvap->na_mtime);
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
default:
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
/*
* set bitpos so we drop out of the loop.
*/
bitpos = NFSATTRBIT_MAX;
break;
}
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check/setup export credentials.
*/
int
nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
struct ucred *credanon)
{
int error = 0;
/*
* Check/setup credentials.
*/
if (nd->nd_flag & ND_GSS)
exp->nes_exflag &= ~MNT_EXPORTANON;
/*
* Check to see if the operation is allowed for this security flavor.
* RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to
* AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
* Also, allow Secinfo, so that it can acquire the correct flavor(s).
*/
if (nfsvno_testexp(nd, exp) &&
nd->nd_procnum != NFSV4OP_SECINFO &&
nd->nd_procnum != NFSPROC_FSINFO) {
if (nd->nd_flag & ND_NFSV4)
error = NFSERR_WRONGSEC;
else
error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
goto out;
}
/*
* Check to see if the file system is exported V4 only.
*/
if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
error = NFSERR_PROGNOTV4;
goto out;
}
/*
* Now, map the user credentials.
* (Note that ND_AUTHNONE will only be set for an NFSv3
* Fsinfo RPC. If set for anything else, this code might need
* to change.)
*/
if (NFSVNO_EXPORTED(exp)) {
if (((nd->nd_flag & ND_GSS) == 0 && nd->nd_cred->cr_uid == 0) ||
NFSVNO_EXPORTANON(exp) ||
(nd->nd_flag & ND_AUTHNONE) != 0) {
nd->nd_cred->cr_uid = credanon->cr_uid;
nd->nd_cred->cr_gid = credanon->cr_gid;
crsetgroups(nd->nd_cred, credanon->cr_ngroups,
credanon->cr_groups);
} else if ((nd->nd_flag & ND_GSS) == 0) {
/*
* If using AUTH_SYS, call nfsrv_getgrpscred() to see
* if there is a replacement credential with a group
* list set up by "nfsuserd -manage-gids".
* If there is no replacement, nfsrv_getgrpscred()
* simply returns its argument.
*/
nd->nd_cred = nfsrv_getgrpscred(nd->nd_cred);
}
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
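/*
 * Illustrative user-space sketch (not part of the server sources): the
 * credential mapping in nfsd_excred() above squashes three kinds of request
 * to the export's anonymous credential: AUTH_SYS requests from uid 0,
 * requests to exports that map everyone to the anonymous user, and the
 * AUTH_NONE case. The structs and field names below are invented for the
 * example and only mirror that decision.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdbool.h>
#include <stdio.h>

struct fake_req {
	unsigned int uid;
	bool uses_gss;		/* authenticated with RPCSEC_GSS */
	bool auth_none;		/* AUTH_NONE (NFSv3 Fsinfo only) */
};

struct fake_export {
	bool map_all_to_anon;	/* export maps everyone to the anon user */
};

/* Should this request run as the anonymous credential? */
static bool
use_anon_cred(const struct fake_req *rq, const struct fake_export *ex)
{

	return ((!rq->uses_gss && rq->uid == 0) ||
	    ex->map_all_to_anon || rq->auth_none);
}

int
main(void)
{
	struct fake_req rq = { .uid = 0, .uses_gss = false,
	    .auth_none = false };
	struct fake_export ex = { .map_all_to_anon = false };

	printf("root over AUTH_SYS squashed: %d\n", use_anon_cred(&rq, &ex));
	return (0);
}
#endif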
/*
* Check exports.
*/
int
nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
NFSEXITCODE(error);
return (error);
}
/*
* Get a vnode for a file handle and export stuff.
*/
int
nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
int lktype, struct vnode **vpp, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
*credp = NULL;
exp->nes_numsecflavor = 0;
error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp);
if (error != 0)
/* Make sure the server replies ESTALE to the client. */
error = ESTALE;
if (nam && !error) {
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
} else {
vput(*vpp);
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
}
NFSEXITCODE(error);
return (error);
}
/*
* nfsd_fhtovp() - convert a fh to a vnode ptr
* - look up fsid in mount list (if not found ret error)
* - get vp and export rights by calling nfsvno_fhtovp()
* - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
* for AUTH_SYS
* - if mpp != NULL, return the mount point so that it can
* be used for vn_finished_write() by the caller
*/
void
nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
struct vnode **vpp, struct nfsexstuff *exp,
struct mount **mpp, int startwrite, struct thread *p)
{
struct mount *mp;
struct ucred *credanon;
fhandle_t *fhp;
fhp = (fhandle_t *)nfp->nfsrvfh_data;
/*
* Check for the special case of the nfsv4root_fh.
*/
mp = vfs_busyfs(&fhp->fh_fsid);
if (mpp != NULL)
*mpp = mp;
if (mp == NULL) {
*vpp = NULL;
nd->nd_repstat = ESTALE;
goto out;
}
if (startwrite) {
vn_start_write(NULL, mpp, V_WAIT);
if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp)))
lktype = LK_EXCLUSIVE;
}
nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
&credanon);
vfs_unbusy(mp);
/*
* For NFSv4 without a pseudo root fs, unexported file handles
* can be returned, so that Lookup works everywhere.
*/
if (!nd->nd_repstat && exp->nes_exflag == 0 &&
!(nd->nd_flag & ND_NFSV4)) {
vput(*vpp);
nd->nd_repstat = EACCES;
}
/*
* Personally, I've never seen any point in requiring a
* reserved port#, since only in the rare case where the
* clients are all boxes with secure system privileges,
* does it provide any enhanced security, but... some people
* believe it to be useful and keep putting this code back in.
* (There is also some "security checker" out there that
* complains if the nfs server doesn't enforce this.)
* However, note the following:
* RFC3530 (NFSv4) specifies that a reserved port# not be
* required.
* RFC2623 recommends that, if a reserved port# is checked for,
* that there be a way to turn that off--> ifdef'd.
*/
#ifdef NFS_REQRSVPORT
if (!nd->nd_repstat) {
struct sockaddr_in *saddr;
struct sockaddr_in6 *saddr6;
saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
if (!(nd->nd_flag & ND_NFSV4) &&
((saddr->sin_family == AF_INET &&
ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
(saddr6->sin6_family == AF_INET6 &&
ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
vput(*vpp);
nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
}
}
#endif /* NFS_REQRSVPORT */
/*
* Check/setup credentials.
*/
if (!nd->nd_repstat) {
nd->nd_saveduid = nd->nd_cred->cr_uid;
nd->nd_repstat = nfsd_excred(nd, exp, credanon);
if (nd->nd_repstat)
vput(*vpp);
}
if (credanon != NULL)
crfree(credanon);
if (nd->nd_repstat) {
if (startwrite)
vn_finished_write(mp);
*vpp = NULL;
if (mpp != NULL)
*mpp = NULL;
}
out:
NFSEXITCODE2(0, nd);
}
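/*
 * Illustrative user-space sketch (not part of the server sources): the
 * NFS_REQRSVPORT block above rejects non-NFSv4 requests whose source port
 * is not reserved (>= IPPORT_RESERVED, i.e. >= 1024). port_too_weak() is a
 * made-up name; IPPORT_RESERVED and the sockaddr handling are standard.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

/* Non-zero when the client did not bind a reserved (< 1024) port. */
static int
port_too_weak(const struct sockaddr_in *sin)
{

	return (sin->sin_family == AF_INET &&
	    ntohs(sin->sin_port) >= IPPORT_RESERVED);
}

int
main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET };

	sin.sin_port = htons(1023);
	printf("reserved port accepted: %d\n", !port_too_weak(&sin));
	sin.sin_port = htons(4096);
	printf("unprivileged port rejected: %d\n", port_too_weak(&sin));
	return (0);
}
#endif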
/*
* glue for fp.
*/
static int
fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
{
struct filedesc *fdp;
struct file *fp;
int error = 0;
fdp = p->td_proc->p_fd;
if (fd < 0 || fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd].fde_file) == NULL) {
error = EBADF;
goto out;
}
*fpp = fp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Called from nfssvc() to update the exports list. Just call
* vfs_export(). This has to be done, since the v4 root fake fs isn't
* in the mount list.
*/
int
nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
{
struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
int error = 0;
struct nameidata nd;
fhandle_t fh;
error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
nfs_rootfhset = 0;
else if (error == 0) {
if (nfsexargp->fspec == NULL) {
error = EPERM;
goto out;
}
/*
* If fspec != NULL, this is the v4root path.
*/
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE,
nfsexargp->fspec, p);
if ((error = namei(&nd)) != 0)
goto out;
error = nfsvno_getfh(nd.ni_vp, &fh, p);
vrele(nd.ni_vp);
if (!error) {
nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
NFSBCOPY((caddr_t)&fh,
nfs_rootfh.nfsrvfh_data,
sizeof (fhandle_t));
nfs_rootfhset = 1;
}
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function needs to test to see if the system is near its limit
* for memory allocation via malloc() or mget() and return True iff
* either of these resources is near its limit.
* XXX (For now, this is just a stub.)
*/
int nfsrv_testmalloclimit = 0;
int
nfsrv_mallocmget_limit(void)
{
static int printmesg = 0;
static int testval = 1;
if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) {
if ((printmesg++ % 100) == 0)
printf("nfsd: malloc/mget near limit\n");
return (1);
}
return (0);
}
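/*
 * Illustrative user-space sketch (not part of the server sources): the stub
 * above shows a sampled-logging pattern, where only every Nth call even
 * evaluates the condition and only every Mth positive result prints, so a
 * hot path cannot flood the console. The counters and message below are
 * invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdio.h>

static int
near_limit_sampled(void)
{
	static unsigned int testval, printmesg;

	if ((++testval % 1000) != 0)
		return (0);
	if ((printmesg++ % 100) == 0)
		printf("resource near limit\n");
	return (1);
}

int
main(void)
{
	int i, hits = 0;

	for (i = 0; i < 5000; i++)
		hits += near_limit_sampled();
	printf("sampled hits: %d\n", hits);	/* 5 of 5000 calls */
	return (0);
}
#endif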
/*
* BSD specific initialization of a mount point.
*/
void
nfsd_mntinit(void)
{
static int inited = 0;
if (inited)
return;
inited = 1;
nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
nfsv4root_mnt.mnt_export = NULL;
TAILQ_INIT(&nfsv4root_opt);
TAILQ_INIT(&nfsv4root_newopt);
nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
nfsv4root_mnt.mnt_nvnodelistsize = 0;
nfsv4root_mnt.mnt_activevnodelistsize = 0;
}
/*
* Get a vnode for a file handle, without checking exports, etc.
*/
struct vnode *
nfsvno_getvp(fhandle_t *fhp)
{
struct mount *mp;
struct vnode *vp;
int error;
mp = vfs_busyfs(&fhp->fh_fsid);
if (mp == NULL)
return (NULL);
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
return (NULL);
return (vp);
}
/*
* Do a local VOP_ADVLOCK().
*/
int
nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
u_int64_t end, struct thread *td)
{
int error = 0;
struct flock fl;
u_int64_t tlen;
if (nfsrv_dolocallocks == 0)
goto out;
ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked");
fl.l_whence = SEEK_SET;
fl.l_type = ftype;
fl.l_start = (off_t)first;
if (end == NFS64BITSSET) {
fl.l_len = 0;
} else {
tlen = end - first;
fl.l_len = (off_t)tlen;
}
/*
* For FreeBSD8, the l_pid and l_sysid must be set to the same
* values for all calls, so that all locks will be held by the
* nfsd server. (The nfsd server handles conflicts between the
* various clients.)
* Since an NFSv4 lockowner is a ClientID plus an array of up to 1024
* bytes, so it can't be put in l_sysid.
*/
if (nfsv4_sysid == 0)
nfsv4_sysid = nlm_acquire_next_sysid();
fl.l_pid = (pid_t)0;
fl.l_sysid = (int)nfsv4_sysid;
if (ftype == F_UNLCK)
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
(F_POSIX | F_REMOTE));
else
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
(F_POSIX | F_REMOTE));
out:
NFSEXITCODE(error);
return (error);
}
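/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsvno_advlock() above converts an NFS byte range into struct flock,
 * treating an end value of NFS64BITSSET as "lock to EOF" (l_len = 0) and
 * otherwise using end - first as the length. RANGE_TO_EOF and
 * range_to_flock() are made-up names standing in for that convention.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

#define RANGE_TO_EOF	UINT64_MAX	/* stand-in for NFS64BITSSET */

static void
range_to_flock(uint64_t first, uint64_t end, struct flock *fl)
{

	fl->l_whence = SEEK_SET;
	fl->l_start = (off_t)first;
	fl->l_len = (end == RANGE_TO_EOF) ? 0 : (off_t)(end - first);
}

int
main(void)
{
	struct flock fl;

	range_to_flock(100, RANGE_TO_EOF, &fl);
	printf("lock to EOF: start=%jd len=%jd\n",
	    (intmax_t)fl.l_start, (intmax_t)fl.l_len);
	return (0);
}
#endif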
/*
* Check the nfsv4 root exports.
*/
int
nfsvno_v4rootexport(struct nfsrv_descript *nd)
{
struct ucred *credanon;
int exflags, error = 0, numsecflavor, *secflavors, i;
error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
&credanon, &numsecflavor, &secflavors);
if (error) {
error = NFSERR_PROGUNAVAIL;
goto out;
}
if (credanon != NULL)
crfree(credanon);
for (i = 0; i < numsecflavor; i++) {
if (secflavors[i] == AUTH_SYS)
nd->nd_flag |= ND_EXAUTHSYS;
else if (secflavors[i] == RPCSEC_GSS_KRB5)
nd->nd_flag |= ND_EXGSS;
else if (secflavors[i] == RPCSEC_GSS_KRB5I)
nd->nd_flag |= ND_EXGSSINTEGRITY;
else if (secflavors[i] == RPCSEC_GSS_KRB5P)
nd->nd_flag |= ND_EXGSSPRIVACY;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Nfs server pseudo system call for the nfsd's
*/
/*
* MPSAFE
*/
static int
nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
{
struct file *fp;
struct nfsd_addsock_args sockarg;
struct nfsd_nfsd_args nfsdarg;
cap_rights_t rights;
int error;
if (uap->flag & NFSSVC_NFSDADDSOCK) {
error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
if (error)
goto out;
/*
* Since we don't know what rights might be required,
* pretend that we need them all. It is better to be too
* careful than too reckless.
*/
error = fget(td, sockarg.sock,
cap_rights_init(&rights, CAP_SOCK_SERVER), &fp);
if (error != 0)
goto out;
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, td);
error = EPERM;
goto out;
}
error = nfsrvd_addsock(fp);
fdrop(fp, td);
} else if (uap->flag & NFSSVC_NFSDNFSD) {
if (uap->argp == NULL) {
error = EINVAL;
goto out;
}
error = copyin(uap->argp, (caddr_t)&nfsdarg,
sizeof (nfsdarg));
if (error)
goto out;
error = nfsrvd_nfsd(td, &nfsdarg);
} else {
error = nfssvc_srvcall(td, uap, td->td_ucred);
}
out:
NFSEXITCODE(error);
return (error);
}
static int
nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
{
struct nfsex_args export;
struct file *fp = NULL;
int stablefd, len;
struct nfsd_clid adminrevoke;
struct nfsd_dumplist dumplist;
struct nfsd_dumpclients *dumpclients;
struct nfsd_dumplocklist dumplocklist;
struct nfsd_dumplocks *dumplocks;
struct nameidata nd;
vnode_t vp;
int error = EINVAL, igotlock;
struct proc *procp;
static int suspend_nfsd = 0;
if (uap->flag & NFSSVC_PUBLICFH) {
NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
sizeof (fhandle_t));
error = copyin(uap->argp,
&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
if (!error)
nfs_pubfhset = 1;
} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
error = copyin(uap->argp,(caddr_t)&export,
sizeof (struct nfsex_args));
if (!error)
error = nfsrv_v4rootexport(&export, cred, p);
} else if (uap->flag & NFSSVC_NOPUBLICFH) {
nfs_pubfhset = 0;
error = 0;
} else if (uap->flag & NFSSVC_STABLERESTART) {
error = copyin(uap->argp, (caddr_t)&stablefd,
sizeof (int));
if (!error)
error = fp_getfvp(p, stablefd, &fp, &vp);
if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
error = EBADF;
if (!error && newnfs_numnfsd != 0)
error = EPERM;
if (!error) {
nfsrv_stablefirst.nsf_fp = fp;
nfsrv_setupstable(p);
}
} else if (uap->flag & NFSSVC_ADMINREVOKE) {
error = copyin(uap->argp, (caddr_t)&adminrevoke,
sizeof (struct nfsd_clid));
if (!error)
error = nfsrv_adminrevoke(&adminrevoke, p);
} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
error = copyin(uap->argp, (caddr_t)&dumplist,
sizeof (struct nfsd_dumplist));
if (!error && (dumplist.ndl_size < 1 ||
dumplist.ndl_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error) {
len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
dumpclients = (struct nfsd_dumpclients *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
error = copyout(dumpclients,
CAST_USER_ADDR_T(dumplist.ndl_list), len);
free((caddr_t)dumpclients, M_TEMP);
}
} else if (uap->flag & NFSSVC_DUMPLOCKS) {
error = copyin(uap->argp, (caddr_t)&dumplocklist,
sizeof (struct nfsd_dumplocklist));
if (!error && (dumplocklist.ndllck_size < 1 ||
dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error)
error = nfsrv_lookupfilename(&nd,
dumplocklist.ndllck_fname, p);
if (!error) {
len = sizeof (struct nfsd_dumplocks) *
dumplocklist.ndllck_size;
dumplocks = (struct nfsd_dumplocks *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumplocks(nd.ni_vp, dumplocks,
dumplocklist.ndllck_size, p);
vput(nd.ni_vp);
error = copyout(dumplocks,
CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
free((caddr_t)dumplocks, M_TEMP);
}
} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
procp = p->td_proc;
PROC_LOCK(procp);
nfsd_master_pid = procp->p_pid;
bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
nfsd_master_start = procp->p_stats->p_start;
nfsd_master_proc = procp;
PROC_UNLOCK(procp);
} else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) {
NFSLOCKV4ROOTMUTEX();
if (suspend_nfsd == 0) {
/* Lock out all nfsd threads */
do {
igotlock = nfsv4_lock(&nfsd_suspend_lock, 1,
NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (igotlock == 0 && suspend_nfsd == 0);
suspend_nfsd = 1;
}
NFSUNLOCKV4ROOTMUTEX();
error = 0;
} else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) {
NFSLOCKV4ROOTMUTEX();
if (suspend_nfsd != 0) {
nfsv4_unlock(&nfsd_suspend_lock, 0);
suspend_nfsd = 0;
}
NFSUNLOCKV4ROOTMUTEX();
error = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Check exports.
* Returns 0 if ok, 1 otherwise.
*/
int
nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
{
int i;
/*
* This seems odd, but allow the case where the security flavor
* list is empty. This happens when NFSv4 is traversing non-exported
* file systems. Exported file systems should always have a non-empty
* security flavor list.
*/
if (exp->nes_numsecflavor == 0)
return (0);
for (i = 0; i < exp->nes_numsecflavor; i++) {
/*
* The tests for privacy and integrity must be first,
* since ND_GSS is set for everything but AUTH_SYS.
*/
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
(nd->nd_flag & ND_GSSPRIVACY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
(nd->nd_flag & ND_GSSINTEGRITY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
(nd->nd_flag & ND_GSS))
return (0);
if (exp->nes_secflavors[i] == AUTH_SYS &&
(nd->nd_flag & ND_GSS) == 0)
return (0);
}
return (1);
}
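/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsvno_testexp() above accepts a request if any flavor on the export's
 * list matches what the request used, testing the stronger GSS variants
 * first because the plain GSS flag is set for every Kerberos request. The
 * enum, flag bits and function below are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdio.h>

enum flavor { X_AUTH_SYS, X_KRB5, X_KRB5I, X_KRB5P };
#define F_GSS		0x1	/* any RPCSEC_GSS */
#define F_GSSINTEGRITY	0x2
#define F_GSSPRIVACY	0x4

/* Return 0 when an allowed flavor matches the request, 1 otherwise. */
static int
flavor_mismatch(const enum flavor *allowed, int n, int reqflags)
{
	int i;

	if (n == 0)
		return (0);	/* empty list: treat as allowed */
	for (i = 0; i < n; i++) {
		if (allowed[i] == X_KRB5P && (reqflags & F_GSSPRIVACY))
			return (0);
		if (allowed[i] == X_KRB5I && (reqflags & F_GSSINTEGRITY))
			return (0);
		if (allowed[i] == X_KRB5 && (reqflags & F_GSS))
			return (0);
		if (allowed[i] == X_AUTH_SYS && (reqflags & F_GSS) == 0)
			return (0);
	}
	return (1);
}

int
main(void)
{
	enum flavor only_krb5[] = { X_KRB5 };

	printf("AUTH_SYS against a krb5-only export rejected: %d\n",
	    flavor_mismatch(only_krb5, 1, 0));
	return (0);
}
#endif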
/*
* Calculate a hash value for the fid in a file handle.
*/
uint32_t
nfsrv_hashfh(fhandle_t *fhp)
{
uint32_t hashval;
hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
return (hashval);
}
/*
* Calculate a hash value for the sessionid.
*/
uint32_t
nfsrv_hashsessionid(uint8_t *sessionid)
{
uint32_t hashval;
hashval = hash32_buf(sessionid, NFSX_V4SESSIONID, 0);
return (hashval);
}
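/*
 * Illustrative user-space sketch (not part of the server sources): the two
 * helpers above hash a fixed-size buffer with the kernel's hash32_buf() and
 * callers then reduce the value to a table index. The rolling hash below is
 * a djb2-style function that is similar in spirit only; its constants and
 * names are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
buf_hash(const void *buf, size_t len, uint32_t seed)
{
	const unsigned char *p = buf;
	uint32_t h = (seed != 0) ? seed : 5381;

	while (len-- > 0)
		h = (h << 5) + h + *p++;
	return (h);
}

int
main(void)
{
	char fid[16] = "example-file-id";
	uint32_t h = buf_hash(fid, sizeof(fid), 0);

	/* Reduce to a bucket index, as the hash table users do. */
	printf("bucket %u of 512\n", h % 512);
	return (0);
}
#endif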
/*
* Signal the userland master nfsd to backup the stable restart file.
*/
void
nfsrv_backupstable(void)
{
struct proc *procp;
if (nfsd_master_proc != NULL) {
procp = pfind(nfsd_master_pid);
/* Try to make sure it is the correct process. */
if (procp == nfsd_master_proc &&
procp->p_stats->p_start.tv_sec ==
nfsd_master_start.tv_sec &&
procp->p_stats->p_start.tv_usec ==
nfsd_master_start.tv_usec &&
strcmp(procp->p_comm, nfsd_master_comm) == 0)
kern_psignal(procp, SIGUSR2);
else
nfsd_master_proc = NULL;
if (procp != NULL)
PROC_UNLOCK(procp);
}
}
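/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsrv_backupstable() above guards against pid reuse by only signalling
 * the recorded master nfsd with SIGUSR2 when the pid, process start time
 * and command name all still match. The struct and function below are
 * invented and only restate that identity check.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

struct proc_ident {
	pid_t pid;
	struct timespec start;
	char comm[32];
};

static bool
same_process(const struct proc_ident *want, const struct proc_ident *found)
{

	return (want->pid == found->pid &&
	    want->start.tv_sec == found->start.tv_sec &&
	    want->start.tv_nsec == found->start.tv_nsec &&
	    strcmp(want->comm, found->comm) == 0);
}

int
main(void)
{
	struct proc_ident want = { .pid = 100, .comm = "nfsd" };
	struct proc_ident stale = want;

	stale.start.tv_sec = 42;	/* pid reused by a later process */
	printf("still the registered master: %d\n",
	    same_process(&want, &stale));
	return (0);
}
#endif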
extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
/*
* Called once to initialize data structures...
*/
static int
nfsd_modevent(module_t mod, int type, void *data)
{
int error = 0, i;
static int loaded = 0;
switch (type) {
case MOD_LOAD:
if (loaded)
goto out;
newnfs_portinit();
for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL,
MTX_DEF);
mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL,
MTX_DEF);
}
mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF);
mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF);
mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF);
lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
nfsrvd_initcache();
nfsd_init();
NFSD_LOCK();
nfsrvd_init(0);
NFSD_UNLOCK();
nfsd_mntinit();
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
#endif
nfsd_call_servertimer = nfsrv_servertimer;
nfsd_call_nfsd = nfssvc_nfsd;
loaded = 1;
break;
case MOD_UNLOAD:
if (newnfs_numnfsd != 0) {
error = EBUSY;
break;
}
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = NULL;
vn_deleg_ops.vndeleg_disable = NULL;
#endif
nfsd_call_servertimer = NULL;
nfsd_call_nfsd = NULL;
/* Clean out all NFSv4 state. */
nfsrv_throwawayallstate(curthread);
/* Clean the NFS server reply cache */
nfsrvd_cleancache();
/* Free up the krpc server pool. */
if (nfsrvd_pool != NULL)
svcpool_destroy(nfsrvd_pool);
/* and get rid of the locks */
for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
mtx_destroy(&nfsrchash_table[i].mtx);
mtx_destroy(&nfsrcahash_table[i].mtx);
}
mtx_destroy(&nfsrc_udpmtx);
mtx_destroy(&nfs_v4root_mutex);
mtx_destroy(&nfsv4root_mnt.mnt_mtx);
for (i = 0; i < nfsrv_sessionhashsize; i++)
mtx_destroy(&nfssessionhash[i].mtx);
lockdestroy(&nfsv4root_mnt.mnt_explock);
free(nfsclienthash, M_NFSDCLIENT);
free(nfslockhash, M_NFSDLOCKFILE);
free(nfssessionhash, M_NFSDSESSION);
loaded = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
out:
NFSEXITCODE(error);
return (error);
}
static moduledata_t nfsd_mod = {
"nfsd",
nfsd_modevent,
NULL,
};
DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfsd, 1);
MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
Index: head/sys/fs/nfsserver/nfs_nfsdstate.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdstate.c (revision 327172)
+++ head/sys/fs/nfsserver/nfs_nfsdstate.c (revision 327173)
@@ -1,6141 +1,6140 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Rick Macklem, University of Guelph
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>
struct nfsrv_stablefirst nfsrv_stablefirst;
int nfsrv_issuedelegs = 0;
int nfsrv_dolocallocks = 0;
struct nfsv4lock nfsv4rootfs_lock;
extern int newnfs_numnfsd;
extern struct nfsstatsv1 nfsstatsv1;
extern int nfsrv_lease;
extern struct timeval nfsboottime;
extern u_int32_t newnfs_true, newnfs_false;
NFSV4ROOTLOCKMUTEX;
NFSSTATESPINLOCK;
SYSCTL_DECL(_vfs_nfsd);
int nfsrv_statehashsize = NFSSTATEHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, statehashsize, CTLFLAG_RDTUN,
&nfsrv_statehashsize, 0,
"Size of state hash table set via loader.conf");
int nfsrv_clienthashsize = NFSCLIENTHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, clienthashsize, CTLFLAG_RDTUN,
&nfsrv_clienthashsize, 0,
"Size of client hash table set via loader.conf");
int nfsrv_lockhashsize = NFSLOCKHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, fhhashsize, CTLFLAG_RDTUN,
&nfsrv_lockhashsize, 0,
"Size of file handle hash table set via loader.conf");
int nfsrv_sessionhashsize = NFSSESSIONHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, sessionhashsize, CTLFLAG_RDTUN,
&nfsrv_sessionhashsize, 0,
"Size of session hash table set via loader.conf");
static int nfsrv_v4statelimit = NFSRV_V4STATELIMIT;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, v4statelimit, CTLFLAG_RWTUN,
&nfsrv_v4statelimit, 0,
"High water limit for NFSv4 opens+locks+delegations");
static int nfsrv_writedelegifpos = 0;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, writedelegifpos, CTLFLAG_RW,
&nfsrv_writedelegifpos, 0,
"Issue a write delegation for read opens if possible");
static int nfsrv_allowreadforwriteopen = 1;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, allowreadforwriteopen, CTLFLAG_RW,
&nfsrv_allowreadforwriteopen, 0,
"Allow Reads to be done with Write Access StateIDs");
/*
* Hash lists for nfs V4.
*/
struct nfsclienthashhead *nfsclienthash;
struct nfslockhashhead *nfslockhash;
struct nfssessionhash *nfssessionhash;
#endif /* !APPLEKEXT */
static u_int32_t nfsrv_openpluslock = 0, nfsrv_delegatecnt = 0;
static time_t nfsrvboottime;
static int nfsrv_returnoldstateid = 0, nfsrv_clients = 0;
static int nfsrv_clienthighwater = NFSRV_CLIENTHIGHWATER;
static int nfsrv_nogsscallback = 0;
static volatile int nfsrv_writedelegcnt = 0;
/* local functions */
static void nfsrv_dumpaclient(struct nfsclient *clp,
struct nfsd_dumpclients *dumpp);
static void nfsrv_freeopenowner(struct nfsstate *stp, int cansleep,
NFSPROC_T *p);
static int nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p);
static void nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p);
static void nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp,
int cansleep, NFSPROC_T *p);
static void nfsrv_freenfslock(struct nfslock *lop);
static void nfsrv_freenfslockfile(struct nfslockfile *lfp);
static void nfsrv_freedeleg(struct nfsstate *);
static int nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp,
u_int32_t flags, struct nfsstate **stpp);
static void nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
struct nfsstate **stpp);
static int nfsrv_getlockfh(vnode_t vp, u_short flags,
struct nfslockfile *new_lfp, fhandle_t *nfhp, NFSPROC_T *p);
static int nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit);
static void nfsrv_insertlock(struct nfslock *new_lop,
struct nfslock *insert_lop, struct nfsstate *stp, struct nfslockfile *lfp);
static void nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
struct nfslock **other_lopp, struct nfslockfile *lfp);
static int nfsrv_getipnumber(u_char *cp);
static int nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
nfsv4stateid_t *stateidp, int specialid);
static int nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
u_int32_t flags);
static int nfsrv_docallback(struct nfsclient *clp, int procnum,
nfsv4stateid_t *stateidp, int trunc, fhandle_t *fhp,
struct nfsvattr *nap, nfsattrbit_t *attrbitp, NFSPROC_T *p);
static int nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
uint32_t callback, int op, const char *optag, struct nfsdsession **sepp);
static u_int32_t nfsrv_nextclientindex(void);
static u_int32_t nfsrv_nextstateindex(struct nfsclient *clp);
static void nfsrv_markstable(struct nfsclient *clp);
static int nfsrv_checkstable(struct nfsclient *clp);
static int nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, struct
vnode *vp, NFSPROC_T *p);
static int nfsrv_delegconflict(struct nfsstate *stp, int *haslockp,
NFSPROC_T *p, vnode_t vp);
static int nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
struct nfsclient *clp, int *haslockp, NFSPROC_T *p);
static int nfsrv_notsamecredname(struct nfsrv_descript *nd,
struct nfsclient *clp);
static time_t nfsrv_leaseexpiry(void);
static void nfsrv_delaydelegtimeout(struct nfsstate *stp);
static int nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
struct nfsstate *stp, struct nfsrvcache *op);
static int nfsrv_nootherstate(struct nfsstate *stp);
static int nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p);
static void nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp,
uint64_t init_first, uint64_t init_end, NFSPROC_T *p);
static int nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags,
int oldflags, uint64_t first, uint64_t end, struct nfslockconflict *cfp,
NFSPROC_T *p);
static void nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp,
NFSPROC_T *p);
static void nfsrv_locallock_commit(struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end);
static void nfsrv_locklf(struct nfslockfile *lfp);
static void nfsrv_unlocklf(struct nfslockfile *lfp);
static struct nfsdsession *nfsrv_findsession(uint8_t *sessionid);
static int nfsrv_freesession(struct nfsdsession *sep, uint8_t *sessionid);
static int nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
int dont_replycache, struct nfsdsession **sepp);
static int nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp);
/*
* Scan the client list for a match and either return the current one,
* create a new entry or return an error.
* If returning a non-error, the clp structure must either be linked into
* the client list or free'd.
*/
APPLESTATIC int
nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp,
nfsquad_t *clientidp, nfsquad_t *confirmp, NFSPROC_T *p)
{
struct nfsclient *clp = NULL, *new_clp = *new_clpp;
int i, error = 0;
struct nfsstate *stp, *tstp;
struct sockaddr_in *sad, *rad;
int zapit = 0, gotit, hasstate = 0, igotlock;
static u_int64_t confirm_index = 0;
/*
* Check for state resource limit exceeded.
*/
if (nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
if (nfsrv_issuedelegs == 0 ||
((nd->nd_flag & ND_GSS) != 0 && nfsrv_nogsscallback != 0))
/*
* Don't do callbacks when delegations are disabled, or
* for AUTH_GSS when callbacks are disabled via nfsrv_nogsscallback.
* If establishing a callback connection is attempted
* when a firewall is blocking the callback path, the
* server may wait too long for the connect attempt to
* succeed during the Open. Some clients, such as Linux,
* may time out and give up on the Open before the server
* replies. Also, since AUTH_GSS callbacks are not
* yet interoperability tested, they might cause the
* server to crap out, if they get past the Init call to
* the client.
*/
new_clp->lc_program = 0;
/* Lock out other nfsd threads */
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKV4ROOTMUTEX();
/*
* Search for a match in the client list.
*/
gotit = i = 0;
while (i < nfsrv_clienthashsize && !gotit) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
if (new_clp->lc_idlen == clp->lc_idlen &&
!NFSBCMP(new_clp->lc_id, clp->lc_id, clp->lc_idlen)) {
gotit = 1;
break;
}
}
if (gotit == 0)
i++;
}
if (!gotit ||
(clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_ADMINREVOKED))) {
if ((nd->nd_flag & ND_NFSV41) != 0 && confirmp->lval[1] != 0) {
/*
* For NFSv4.1, if confirmp->lval[1] is non-zero, the
* client is trying to update a confirmed clientid.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
confirmp->lval[1] = 0;
error = NFSERR_NOENT;
goto out;
}
/*
* Get rid of the old one.
*/
if (i != nfsrv_clienthashsize) {
LIST_REMOVE(clp, lc_hash);
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
zapit = 1;
}
/*
* Add it after assigning a client id to it.
*/
new_clp->lc_flags |= LCL_NEEDSCONFIRM;
if ((nd->nd_flag & ND_NFSV41) != 0)
new_clp->lc_confirm.lval[0] = confirmp->lval[0] =
++confirm_index;
else
confirmp->qval = new_clp->lc_confirm.qval =
++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
(u_int32_t)nfsrvboottime;
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
nfsrv_nextclientindex();
new_clp->lc_stateindex = 0;
new_clp->lc_statemaxindex = 0;
new_clp->lc_cbref = 0;
new_clp->lc_expiry = nfsrv_leaseexpiry();
LIST_INIT(&new_clp->lc_open);
LIST_INIT(&new_clp->lc_deleg);
LIST_INIT(&new_clp->lc_olddeleg);
LIST_INIT(&new_clp->lc_session);
for (i = 0; i < nfsrv_statehashsize; i++)
LIST_INIT(&new_clp->lc_stateid[i]);
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
if (zapit)
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
goto out;
}
/*
* Now, handle the cases where the id is already issued.
*/
if (nfsrv_notsamecredname(nd, clp)) {
/*
* Check to see if there is expired state that should go away.
*/
if (clp->lc_expiry < NFSD_MONOSEC &&
(!LIST_EMPTY(&clp->lc_open) || !LIST_EMPTY(&clp->lc_deleg))) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
}
/*
* If there is outstanding state, then reply NFSERR_CLIDINUSE per
* RFC3530 Sec. 8.1.2 last para.
*/
if (!LIST_EMPTY(&clp->lc_deleg)) {
hasstate = 1;
} else if (LIST_EMPTY(&clp->lc_open)) {
hasstate = 0;
} else {
hasstate = 0;
/* Look for an Open on the OpenOwner */
LIST_FOREACH(stp, &clp->lc_open, ls_list) {
if (!LIST_EMPTY(&stp->ls_open)) {
hasstate = 1;
break;
}
}
}
if (hasstate) {
/*
* If the uid doesn't match, return NFSERR_CLIDINUSE after
* filling out the correct ipaddr and portnum.
*/
sad = NFSSOCKADDR(new_clp->lc_req.nr_nam, struct sockaddr_in *);
rad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr_in *);
sad->sin_addr.s_addr = rad->sin_addr.s_addr;
sad->sin_port = rad->sin_port;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIDINUSE;
goto out;
}
}
if (NFSBCMP(new_clp->lc_verf, clp->lc_verf, NFSX_VERF)) {
/*
* If the verifier has changed, the client has rebooted
* and a new client id is issued. The old state info
* can be thrown away once the SETCLIENTID_CONFIRM occurs.
*/
LIST_REMOVE(clp, lc_hash);
new_clp->lc_flags |= LCL_NEEDSCONFIRM;
if ((nd->nd_flag & ND_NFSV41) != 0)
new_clp->lc_confirm.lval[0] = confirmp->lval[0] =
++confirm_index;
else
confirmp->qval = new_clp->lc_confirm.qval =
++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
nfsrvboottime;
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
nfsrv_nextclientindex();
new_clp->lc_stateindex = 0;
new_clp->lc_statemaxindex = 0;
new_clp->lc_cbref = 0;
new_clp->lc_expiry = nfsrv_leaseexpiry();
/*
* Save the state until confirmed.
*/
LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg,
ls_list);
LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
tstp->ls_clp = new_clp;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_NEWHEAD(&new_clp->lc_stateid[i],
&clp->lc_stateid[i], ls_hash);
LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
tstp->ls_clp = new_clp;
}
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
/*
* Must wait until any outstanding callback on the old clp
* completes.
*/
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
"nfsd clp", 10 * hz);
}
NFSUNLOCKSTATE();
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
goto out;
}
/* For NFSv4.1, mark that we found a confirmed clientid. */
if ((nd->nd_flag & ND_NFSV41) != 0) {
clientidp->lval[0] = clp->lc_clientid.lval[0];
clientidp->lval[1] = clp->lc_clientid.lval[1];
confirmp->lval[0] = 0; /* Ignored by client */
confirmp->lval[1] = 1;
} else {
/*
* id and verifier match, so update the net address info
* and get rid of any existing callback authentication
* handle, so a new one will be acquired.
*/
LIST_REMOVE(clp, lc_hash);
new_clp->lc_flags |= (LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
new_clp->lc_expiry = nfsrv_leaseexpiry();
confirmp->qval = new_clp->lc_confirm.qval = ++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
clp->lc_clientid.lval[0];
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
clp->lc_clientid.lval[1];
new_clp->lc_delegtime = clp->lc_delegtime;
new_clp->lc_stateindex = clp->lc_stateindex;
new_clp->lc_statemaxindex = clp->lc_statemaxindex;
new_clp->lc_cbref = 0;
LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
tstp->ls_clp = new_clp;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_NEWHEAD(&new_clp->lc_stateid[i],
&clp->lc_stateid[i], ls_hash);
LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
tstp->ls_clp = new_clp;
}
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
}
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
if ((nd->nd_flag & ND_NFSV41) == 0) {
/*
* Must wait until any outstanding callback on the old clp
* completes.
*/
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
"nfsdclp", 10 * hz);
}
NFSUNLOCKSTATE();
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
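/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsrv_setclient() above builds a clientid from the server boot time in
 * one 32-bit word and a per-boot counter in the other, which is what lets
 * nfsrv_getclient() detect a stale clientid after a reboot. The packing
 * into a single 64-bit value below is illustrative; the server keeps the
 * two words in a union, and make_clientid() is a made-up name.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t
make_clientid(uint32_t boottime, uint32_t *nextindex)
{

	return (((uint64_t)boottime << 32) | (*nextindex)++);
}

int
main(void)
{
	uint32_t boottime = (uint32_t)time(NULL);
	uint32_t idx = 1;
	uint64_t cid = make_clientid(boottime, &idx);

	/* The stale-clientid test: does the embedded boot time still match? */
	printf("stale: %d\n", (uint32_t)(cid >> 32) != boottime);
	return (0);
}
#endif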
/*
* Check to see if the client id exists and optionally confirm it.
*/
APPLESTATIC int
nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp,
struct nfsdsession *nsep, nfsquad_t confirm, uint32_t cbprogram,
struct nfsrv_descript *nd, NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsstate *stp;
int i;
struct nfsclienthashhead *hp;
int error = 0, igotlock, doneok;
struct nfssessionhash *shp;
struct nfsdsession *sep;
uint64_t sessid[2];
static uint64_t next_sess = 0;
if (clpp)
*clpp = NULL;
if ((nd == NULL || (nd->nd_flag & ND_NFSV41) == 0 ||
opflags != CLOPS_RENEW) && nfsrvboottime != clientid.lval[0]) {
error = NFSERR_STALECLIENTID;
goto out;
}
/*
* If called with opflags == CLOPS_RENEW, the State Lock is
* already held. Otherwise, we need to get either that or,
* for the case of Confirm, lock out the nfsd threads.
*/
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
/*
* Create a new sessionid here, since we need to do it where
* there is a mutex held to serialize update of next_sess.
*/
if ((nd->nd_flag & ND_NFSV41) != 0) {
sessid[0] = ++next_sess;
sessid[1] = clientid.qval;
}
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSLOCKSTATE();
}
/* For NFSv4.1, the clp is acquired from the associated session. */
if (nd != NULL && (nd->nd_flag & ND_NFSV41) != 0 &&
opflags == CLOPS_RENEW) {
clp = NULL;
if ((nd->nd_flag & ND_HASSEQUENCE) != 0) {
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep != NULL)
clp = sep->sess_clp;
NFSUNLOCKSESSION(shp);
}
} else {
hp = NFSCLIENTHASH(clientid);
LIST_FOREACH(clp, hp, lc_hash) {
if (clp->lc_clientid.lval[1] == clientid.lval[1])
break;
}
}
if (clp == NULL) {
if (opflags & CLOPS_CONFIRM)
error = NFSERR_STALECLIENTID;
else
error = NFSERR_EXPIRED;
} else if (clp->lc_flags & LCL_ADMINREVOKED) {
/*
* If marked admin revoked, just return the error.
*/
error = NFSERR_ADMINREVOKED;
}
if (error) {
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSUNLOCKSTATE();
}
goto out;
}
/*
* Perform any operations specified by the opflags.
*/
if (opflags & CLOPS_CONFIRM) {
if (((nd->nd_flag & ND_NFSV41) != 0 &&
clp->lc_confirm.lval[0] != confirm.lval[0]) ||
((nd->nd_flag & ND_NFSV41) == 0 &&
clp->lc_confirm.qval != confirm.qval))
error = NFSERR_STALECLIENTID;
else if (nfsrv_notsamecredname(nd, clp))
error = NFSERR_CLIDINUSE;
if (!error) {
if ((clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_DONTCLEAN)) ==
LCL_NEEDSCONFIRM) {
/*
* Hang onto the delegations (as old delegations)
* for an Open with CLAIM_DELEGATE_PREV unless in
* grace, but get rid of the rest of the state.
*/
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_olddeleg);
if (nfsrv_checkgrace(nd, clp, 0)) {
/* In grace, so just delete delegations */
nfsrv_freedeleglist(&clp->lc_deleg);
} else {
LIST_FOREACH(stp, &clp->lc_deleg, ls_list)
stp->ls_flags |= NFSLCK_OLDDELEG;
clp->lc_delegtime = NFSD_MONOSEC +
nfsrv_lease + NFSRV_LEASEDELTA;
LIST_NEWHEAD(&clp->lc_olddeleg, &clp->lc_deleg,
ls_list);
}
if ((nd->nd_flag & ND_NFSV41) != 0)
clp->lc_program = cbprogram;
}
clp->lc_flags &= ~(LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
if (clp->lc_program)
clp->lc_flags |= LCL_NEEDSCBNULL;
/* For NFSv4.1, link the session onto the client. */
if (nsep != NULL) {
/* Hold a reference on the xprt for a backchannel. */
if ((nsep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN)
!= 0 && clp->lc_req.nr_client == NULL) {
clp->lc_req.nr_client = (struct __rpc_client *)
clnt_bck_create(nd->nd_xprt->xp_socket,
cbprogram, NFSV4_CBVERS);
if (clp->lc_req.nr_client != NULL) {
SVC_ACQUIRE(nd->nd_xprt);
nd->nd_xprt->xp_p2 =
clp->lc_req.nr_client->cl_private;
/* Disable idle timeout. */
nd->nd_xprt->xp_idletimeout = 0;
nsep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
} else
nsep->sess_crflags &= ~NFSV4CRSESS_CONNBACKCHAN;
}
NFSBCOPY(sessid, nsep->sess_sessionid,
NFSX_V4SESSIONID);
NFSBCOPY(sessid, nsep->sess_cbsess.nfsess_sessionid,
NFSX_V4SESSIONID);
shp = NFSSESSIONHASH(nsep->sess_sessionid);
NFSLOCKSTATE();
NFSLOCKSESSION(shp);
LIST_INSERT_HEAD(&shp->list, nsep, sess_hash);
LIST_INSERT_HEAD(&clp->lc_session, nsep, sess_list);
nsep->sess_clp = clp;
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
}
}
} else if (clp->lc_flags & LCL_NEEDSCONFIRM) {
error = NFSERR_EXPIRED;
}
/*
* If called by the Renew Op, we must check the principal.
*/
if (!error && (opflags & CLOPS_RENEWOP)) {
if (nfsrv_notsamecredname(nd, clp)) {
doneok = 0;
for (i = 0; i < nfsrv_statehashsize && doneok == 0; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if ((stp->ls_flags & NFSLCK_OPEN) &&
stp->ls_uid == nd->nd_cred->cr_uid) {
doneok = 1;
break;
}
}
}
if (!doneok)
error = NFSERR_ACCES;
}
if (!error && (clp->lc_flags & LCL_CBDOWN))
error = NFSERR_CBPATHDOWN;
}
if ((!error || error == NFSERR_CBPATHDOWN) &&
(opflags & CLOPS_RENEW)) {
clp->lc_expiry = nfsrv_leaseexpiry();
}
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSUNLOCKSTATE();
}
if (clpp)
*clpp = clp;
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Perform the NFSv4.1 destroy clientid.
*/
int
nfsrv_destroyclient(nfsquad_t clientid, NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsclienthashhead *hp;
int error = 0, i, igotlock;
if (nfsrvboottime != clientid.lval[0]) {
error = NFSERR_STALECLIENTID;
goto out;
}
/* Lock out other nfsd threads */
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (igotlock == 0);
NFSUNLOCKV4ROOTMUTEX();
hp = NFSCLIENTHASH(clientid);
LIST_FOREACH(clp, hp, lc_hash) {
if (clp->lc_clientid.lval[1] == clientid.lval[1])
break;
}
if (clp == NULL) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
/* Just return ok, since it is gone. */
goto out;
}
/* Scan for state on the clientid. */
for (i = 0; i < nfsrv_statehashsize; i++)
if (!LIST_EMPTY(&clp->lc_stateid[i])) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIENTIDBUSY;
goto out;
}
if (!LIST_EMPTY(&clp->lc_session) || !LIST_EMPTY(&clp->lc_deleg)) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIENTIDBUSY;
goto out;
}
/* Destroy the clientid and return ok. */
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
nfsrv_zapclient(clp, p);
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Called from the new nfssvc syscall to admin revoke a clientid.
* Returns 0 for success, error otherwise.
*/
APPLESTATIC int
nfsrv_adminrevoke(struct nfsd_clid *revokep, NFSPROC_T *p)
{
struct nfsclient *clp = NULL;
int i, error = 0;
int gotit, igotlock;
/*
* First, lock out the nfsd so that state won't change while the
* revocation record is being written to the stable storage restart
* file.
*/
NFSLOCKV4ROOTMUTEX();
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKV4ROOTMUTEX();
/*
* Search for a match in the client list.
*/
gotit = i = 0;
while (i < nfsrv_clienthashsize && !gotit) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
if (revokep->nclid_idlen == clp->lc_idlen &&
!NFSBCMP(revokep->nclid_id, clp->lc_id, clp->lc_idlen)) {
gotit = 1;
break;
}
}
i++;
}
if (!gotit) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 0);
NFSUNLOCKV4ROOTMUTEX();
error = EPERM;
goto out;
}
/*
* Now, write out the revocation record
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
/*
* and clear out the state, marking the clientid revoked.
*/
clp->lc_flags &= ~LCL_CALLBACKSON;
clp->lc_flags |= LCL_ADMINREVOKED;
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 0);
NFSUNLOCKV4ROOTMUTEX();
out:
NFSEXITCODE(error);
return (error);
}
/*
* Dump out stats for all clients. Called from nfssvc(2), which is the
* interface used by the userland nfs utilities.
*/
APPLESTATIC void
nfsrv_dumpclients(struct nfsd_dumpclients *dumpp, int maxcnt)
{
struct nfsclient *clp;
int i = 0, cnt = 0;
/*
* First, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock cannot be acquired while dumping the clients.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
NFSLOCKSTATE();
/*
* Rattle through the client lists until done.
*/
while (i < nfsrv_clienthashsize && cnt < maxcnt) {
clp = LIST_FIRST(&nfsclienthash[i]);
while (clp != LIST_END(&nfsclienthash[i]) && cnt < maxcnt) {
nfsrv_dumpaclient(clp, &dumpp[cnt]);
cnt++;
clp = LIST_NEXT(clp, lc_hash);
}
i++;
}
if (cnt < maxcnt)
dumpp[cnt].ndcl_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
/*
* Dump stats for a client. Must be called with the NFSSTATELOCK and spl'd.
*/
static void
nfsrv_dumpaclient(struct nfsclient *clp, struct nfsd_dumpclients *dumpp)
{
struct nfsstate *stp, *openstp, *lckownstp;
struct nfslock *lop;
struct sockaddr *sad;
struct sockaddr_in *rad;
struct sockaddr_in6 *rad6;
dumpp->ndcl_nopenowners = dumpp->ndcl_nlockowners = 0;
dumpp->ndcl_nopens = dumpp->ndcl_nlocks = 0;
dumpp->ndcl_ndelegs = dumpp->ndcl_nolddelegs = 0;
dumpp->ndcl_flags = clp->lc_flags;
dumpp->ndcl_clid.nclid_idlen = clp->lc_idlen;
NFSBCOPY(clp->lc_id, dumpp->ndcl_clid.nclid_id, clp->lc_idlen);
sad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr *);
dumpp->ndcl_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
dumpp->ndcl_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
dumpp->ndcl_cbaddr.sin6_addr = rad6->sin6_addr;
}
/*
* Now, scan the state lists and total up the opens and locks.
*/
LIST_FOREACH(stp, &clp->lc_open, ls_list) {
dumpp->ndcl_nopenowners++;
LIST_FOREACH(openstp, &stp->ls_open, ls_list) {
dumpp->ndcl_nopens++;
LIST_FOREACH(lckownstp, &openstp->ls_open, ls_list) {
dumpp->ndcl_nlockowners++;
LIST_FOREACH(lop, &lckownstp->ls_lock, lo_lckowner) {
dumpp->ndcl_nlocks++;
}
}
}
}
/*
* and the delegation lists.
*/
LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
dumpp->ndcl_ndelegs++;
}
LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
dumpp->ndcl_nolddelegs++;
}
}
/*
* Dump out lock stats for a file.
*/
APPLESTATIC void
nfsrv_dumplocks(vnode_t vp, struct nfsd_dumplocks *ldumpp, int maxcnt,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslock *lop;
int cnt = 0;
struct nfslockfile *lfp;
struct sockaddr *sad;
struct sockaddr_in *rad;
struct sockaddr_in6 *rad6;
int ret;
fhandle_t nfh;
ret = nfsrv_getlockfh(vp, 0, NULL, &nfh, p);
/*
* First, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock on it cannot be acquired while dumping the locks.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
NFSLOCKSTATE();
if (!ret)
ret = nfsrv_getlockfile(0, NULL, &lfp, &nfh, 0);
if (ret) {
ldumpp[0].ndlck_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
return;
}
/*
* For each open share on file, dump it out.
*/
stp = LIST_FIRST(&lfp->lf_open);
while (stp != LIST_END(&lfp->lf_open) && cnt < maxcnt) {
ldumpp[cnt].ndlck_flags = stp->ls_flags;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen =
stp->ls_openowner->ls_ownerlen;
NFSBCOPY(stp->ls_openowner->ls_owner,
ldumpp[cnt].ndlck_owner.nclid_id,
stp->ls_openowner->ls_ownerlen);
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
stp = LIST_NEXT(stp, ls_file);
cnt++;
}
/*
* and all locks.
*/
lop = LIST_FIRST(&lfp->lf_lock);
while (lop != LIST_END(&lfp->lf_lock) && cnt < maxcnt) {
stp = lop->lo_stp;
ldumpp[cnt].ndlck_flags = lop->lo_flags;
ldumpp[cnt].ndlck_first = lop->lo_first;
ldumpp[cnt].ndlck_end = lop->lo_end;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen = stp->ls_ownerlen;
NFSBCOPY(stp->ls_owner, ldumpp[cnt].ndlck_owner.nclid_id,
stp->ls_ownerlen);
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
lop = LIST_NEXT(lop, lo_lckfile);
cnt++;
}
/*
* and the delegations.
*/
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg) && cnt < maxcnt) {
ldumpp[cnt].ndlck_flags = stp->ls_flags;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen = 0;
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
stp = LIST_NEXT(stp, ls_file);
cnt++;
}
/*
* If list isn't full, mark end of list by setting the client name
* to zero length.
*/
if (cnt < maxcnt)
ldumpp[cnt].ndlck_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
/*
* Server timer routine. It can scan any linked list, so long
* as it holds the spin/mutex lock and there is no exclusive lock on
* nfsv4rootfs_lock.
* (For OpenBSD, a kthread is ok. For FreeBSD, I think it is ok
* to do this from a callout, since the spin locks work. For
* Darwin, I'm not sure what will work correctly yet.)
* Should be called once per second.
*/
APPLESTATIC void
nfsrv_servertimer(void)
{
struct nfsclient *clp, *nclp;
struct nfsstate *stp, *nstp;
int got_ref, i;
/*
* Make sure nfsboottime is set. This is used by V3 as well
* as V4. Note that nfsboottime is not nfsrvboottime, which is
* only used by the V4 server for leases.
*/
if (nfsboottime.tv_sec == 0)
NFSSETBOOTTIME(nfsboottime);
/*
* If server hasn't started yet, just return.
*/
NFSLOCKSTATE();
if (nfsrv_stablefirst.nsf_eograce == 0) {
NFSUNLOCKSTATE();
return;
}
if (!(nfsrv_stablefirst.nsf_flags & NFSNSF_UPDATEDONE)) {
if (!(nfsrv_stablefirst.nsf_flags & NFSNSF_GRACEOVER) &&
NFSD_MONOSEC > nfsrv_stablefirst.nsf_eograce)
nfsrv_stablefirst.nsf_flags |=
(NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
NFSUNLOCKSTATE();
return;
}
/*
* Try and get a reference count on the nfsv4rootfs_lock so that
* no nfsd thread can acquire an exclusive lock on it before this
* call is done. If it is already exclusively locked, just return.
*/
NFSLOCKV4ROOTMUTEX();
got_ref = nfsv4_getref_nonblock(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
if (got_ref == 0) {
NFSUNLOCKSTATE();
return;
}
/*
* For each client...
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
clp = LIST_FIRST(&nfsclienthash[i]);
while (clp != LIST_END(&nfsclienthash[i])) {
nclp = LIST_NEXT(clp, lc_hash);
if (!(clp->lc_flags & LCL_EXPIREIT)) {
if (((clp->lc_expiry + NFSRV_STALELEASE) < NFSD_MONOSEC
&& ((LIST_EMPTY(&clp->lc_deleg)
&& LIST_EMPTY(&clp->lc_open)) ||
nfsrv_clients > nfsrv_clienthighwater)) ||
(clp->lc_expiry + NFSRV_MOULDYLEASE) < NFSD_MONOSEC ||
(clp->lc_expiry < NFSD_MONOSEC &&
(nfsrv_openpluslock * 10 / 9) > nfsrv_v4statelimit)) {
/*
* Lease has expired several nfsrv_lease times ago:
* PLUS
* - no state is associated with it
* OR
* - above high water mark for number of clients
* (nfsrv_clienthighwater should be large enough
* that this only occurs when clients fail to
* use the same nfs_client_id4.id. Maybe somewhat
* higher than the maximum number of clients that
* will mount this server?)
* OR
* Lease has expired a very long time ago
* OR
* Lease has expired PLUS the number of opens + locks
* has exceeded 90% of capacity
*
* --> Mark for expiry. The actual expiry will be done
* by an nfsd sometime soon.
*/
clp->lc_flags |= LCL_EXPIREIT;
nfsrv_stablefirst.nsf_flags |=
(NFSNSF_NEEDLOCK | NFSNSF_EXPIREDCLIENT);
} else {
/*
* If the openowner has no opens, increment its no-open tick
* count; once that count exceeds NFSNOOPEN, mark it to be
* thrown away. Otherwise, if there is an open, reset the count.
* Hopefully, this will avoid excessive re-creation
* of open owners and subsequent open confirms.
*/
stp = LIST_FIRST(&clp->lc_open);
while (stp != LIST_END(&clp->lc_open)) {
nstp = LIST_NEXT(stp, ls_list);
if (LIST_EMPTY(&stp->ls_open)) {
stp->ls_noopens++;
if (stp->ls_noopens > NFSNOOPEN ||
(nfsrv_openpluslock * 2) >
nfsrv_v4statelimit)
nfsrv_stablefirst.nsf_flags |=
NFSNSF_NOOPENS;
} else {
stp->ls_noopens = 0;
}
stp = nstp;
}
}
}
clp = nclp;
}
}
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
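/*
 * Illustrative user-space sketch (not part of the server sources): the
 * expiry policy described in the long comment inside nfsrv_servertimer()
 * above, restated as one predicate: expire a client whose lease is long
 * stale and which either holds no state or pushes the client count over
 * the high-water mark, or whose lease is very old, or whose lease has
 * expired while opens+locks are above 90% of the state limit. All names
 * and values below are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdbool.h>
#include <stdio.h>

struct lease_view {
	long now, expiry, stale_lease, mouldy_lease;
	bool has_state;			/* any opens or delegations */
	int clients, client_highwater;
	int openpluslock, statelimit;
};

static bool
should_expire(const struct lease_view *v)
{

	return (((v->expiry + v->stale_lease) < v->now &&
	    (!v->has_state || v->clients > v->client_highwater)) ||
	    (v->expiry + v->mouldy_lease) < v->now ||
	    (v->expiry < v->now &&
	    (v->openpluslock * 10 / 9) > v->statelimit));
}

int
main(void)
{
	struct lease_view v = { .now = 1000, .expiry = 100,
	    .stale_lease = 60, .mouldy_lease = 600, .has_state = false,
	    .clients = 10, .client_highwater = 1000, .openpluslock = 5,
	    .statelimit = 1000 };

	printf("expire idle client with a long-stale lease: %d\n",
	    should_expire(&v));
	return (0);
}
#endif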
/*
* The following set of functions free up the various data structures.
*/
/*
* Clear out all open/lock state related to this nfsclient.
* Caller must hold an exclusive lock on nfsv4rootfs_lock, so that
* there are no other active nfsd threads.
*/
APPLESTATIC void
nfsrv_cleanclient(struct nfsclient *clp, NFSPROC_T *p)
{
struct nfsstate *stp, *nstp;
struct nfsdsession *sep, *nsep;
LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp)
nfsrv_freeopenowner(stp, 1, p);
if ((clp->lc_flags & LCL_ADMINREVOKED) == 0)
LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep)
(void)nfsrv_freesession(sep, NULL);
}
/*
* Free a client that has been cleaned. It should also already have been
* removed from the lists.
* (Just to be safe w.r.t. newnfs_disconnect(), call this function when
* softclock interrupts are enabled.)
*/
APPLESTATIC void
nfsrv_zapclient(struct nfsclient *clp, NFSPROC_T *p)
{
#ifdef notyet
if ((clp->lc_flags & (LCL_GSS | LCL_CALLBACKSON)) ==
(LCL_GSS | LCL_CALLBACKSON) &&
(clp->lc_hand.nfsh_flag & NFSG_COMPLETE) &&
clp->lc_handlelen > 0) {
clp->lc_hand.nfsh_flag &= ~NFSG_COMPLETE;
clp->lc_hand.nfsh_flag |= NFSG_DESTROYED;
(void) nfsrv_docallback(clp, NFSV4PROC_CBNULL,
NULL, 0, NULL, NULL, NULL, p);
}
#endif
newnfs_disconnect(&clp->lc_req);
NFSSOCKADDRFREE(clp->lc_req.nr_nam);
NFSFREEMUTEX(&clp->lc_req.nr_mtx);
free(clp->lc_stateid, M_NFSDCLIENT);
free(clp, M_NFSDCLIENT);
NFSLOCKSTATE();
nfsstatsv1.srvclients--;
nfsrv_openpluslock--;
nfsrv_clients--;
NFSUNLOCKSTATE();
}
/*
* Free a list of delegation state structures.
* (This function will also free all nfslockfile structures that no
* longer have associated state.)
*/
APPLESTATIC void
nfsrv_freedeleglist(struct nfsstatehead *sthp)
{
struct nfsstate *stp, *nstp;
LIST_FOREACH_SAFE(stp, sthp, ls_list, nstp) {
nfsrv_freedeleg(stp);
}
LIST_INIT(sthp);
}
/*
* Free up a delegation.
*/
static void
nfsrv_freedeleg(struct nfsstate *stp)
{
struct nfslockfile *lfp;
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_file);
if ((stp->ls_flags & NFSLCK_DELEGWRITE) != 0)
nfsrv_writedelegcnt--;
lfp = stp->ls_lfp;
if (LIST_EMPTY(&lfp->lf_open) &&
LIST_EMPTY(&lfp->lf_lock) && LIST_EMPTY(&lfp->lf_deleg) &&
LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
lfp->lf_usecount == 0 &&
nfsv4_testlock(&lfp->lf_locallock_lck) == 0)
nfsrv_freenfslockfile(lfp);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvdelegates--;
nfsrv_openpluslock--;
nfsrv_delegatecnt--;
}
/*
* This function frees an open owner and all associated opens.
*/
static void
nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, NFSPROC_T *p)
{
struct nfsstate *nstp, *tstp;
LIST_REMOVE(stp, ls_list);
/*
* Now, free all associated opens.
*/
nstp = LIST_FIRST(&stp->ls_open);
while (nstp != LIST_END(&stp->ls_open)) {
tstp = nstp;
nstp = LIST_NEXT(nstp, ls_list);
(void) nfsrv_freeopen(tstp, NULL, cansleep, p);
}
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvopenowners--;
nfsrv_openpluslock--;
}
/*
* This function frees an open (nfsstate open structure) with all associated
* lock_owners and locks. It also frees the nfslockfile structure iff there
* are no other opens on the file.
* Returns 1 if it free'd the nfslockfile, 0 otherwise.
*/
static int
nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p)
{
struct nfsstate *nstp, *tstp;
struct nfslockfile *lfp;
int ret;
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_file);
lfp = stp->ls_lfp;
/*
* Now, free all lockowners associated with this open.
*/
LIST_FOREACH_SAFE(tstp, &stp->ls_open, ls_list, nstp)
nfsrv_freelockowner(tstp, vp, cansleep, p);
/*
* The nfslockfile is freed here if there are no locks
* associated with the open.
* If there are locks associated with the open, the
* nfslockfile structure can be freed via nfsrv_freelockowner().
* Acquire the state mutex to avoid races with calls to
* nfsrv_getlockfile().
*/
if (cansleep != 0)
NFSLOCKSTATE();
if (lfp != NULL && LIST_EMPTY(&lfp->lf_open) &&
LIST_EMPTY(&lfp->lf_deleg) && LIST_EMPTY(&lfp->lf_lock) &&
LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
lfp->lf_usecount == 0 &&
(cansleep != 0 || nfsv4_testlock(&lfp->lf_locallock_lck) == 0)) {
nfsrv_freenfslockfile(lfp);
ret = 1;
} else
ret = 0;
if (cansleep != 0)
NFSUNLOCKSTATE();
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvopens--;
nfsrv_openpluslock--;
return (ret);
}
/*
* Frees a lockowner and all associated locks.
*/
static void
nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p)
{
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
nfsrv_freeallnfslocks(stp, vp, cansleep, p);
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvlockowners--;
nfsrv_openpluslock--;
}
/*
* Free all the nfs locks on a lockowner.
*/
static void
nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p)
{
struct nfslock *lop, *nlop;
struct nfsrollback *rlp, *nrlp;
struct nfslockfile *lfp = NULL;
int gottvp = 0;
vnode_t tvp = NULL;
uint64_t first, end;
if (vp != NULL)
ASSERT_VOP_UNLOCKED(vp, "nfsrv_freeallnfslocks: vnode locked");
lop = LIST_FIRST(&stp->ls_lock);
while (lop != LIST_END(&stp->ls_lock)) {
nlop = LIST_NEXT(lop, lo_lckowner);
/*
* Since all locks should be for the same file, lfp should
* not change.
*/
if (lfp == NULL)
lfp = lop->lo_lfp;
else if (lfp != lop->lo_lfp)
panic("allnfslocks");
/*
* If vp is NULL and cansleep != 0, a vnode must be acquired
* from the file handle. This only occurs when called from
* nfsrv_cleanclient().
*/
if (gottvp == 0) {
if (nfsrv_dolocallocks == 0)
tvp = NULL;
else if (vp == NULL && cansleep != 0) {
tvp = nfsvno_getvp(&lfp->lf_fh);
NFSVOPUNLOCK(tvp, 0);
} else
tvp = vp;
gottvp = 1;
}
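/*
* With a vnode available, also release the local advisory lock
* for the byte range and discard any rollback records for it.
*/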
if (tvp != NULL) {
if (cansleep == 0)
panic("allnfs2");
first = lop->lo_first;
end = lop->lo_end;
nfsrv_freenfslock(lop);
nfsrv_localunlock(tvp, lfp, first, end, p);
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list,
nrlp)
free(rlp, M_NFSDROLLBACK);
LIST_INIT(&lfp->lf_rollback);
} else
nfsrv_freenfslock(lop);
lop = nlop;
}
if (vp == NULL && tvp != NULL)
vrele(tvp);
}
/*
* Free an nfslock structure.
*/
static void
nfsrv_freenfslock(struct nfslock *lop)
{
if (lop->lo_lckfile.le_prev != NULL) {
LIST_REMOVE(lop, lo_lckfile);
nfsstatsv1.srvlocks--;
nfsrv_openpluslock--;
}
LIST_REMOVE(lop, lo_lckowner);
FREE((caddr_t)lop, M_NFSDLOCK);
}
/*
* This function frees an nfslockfile structure.
*/
static void
nfsrv_freenfslockfile(struct nfslockfile *lfp)
{
LIST_REMOVE(lfp, lf_hash);
FREE((caddr_t)lfp, M_NFSDLOCKFILE);
}
/*
* This function looks up an nfsstate structure via stateid.
*/
static int
nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp, __unused u_int32_t flags,
struct nfsstate **stpp)
{
struct nfsstate *stp;
struct nfsstatehead *hp;
int error = 0;
*stpp = NULL;
hp = NFSSTATEHASH(clp, *stateidp);
LIST_FOREACH(stp, hp, ls_hash) {
if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
NFSX_STATEIDOTHER))
break;
}
/*
* If no state id in list, return NFSERR_BADSTATEID.
*/
if (stp == LIST_END(hp)) {
error = NFSERR_BADSTATEID;
goto out;
}
*stpp = stp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function gets an nfsstate structure via owner string.
*/
static void
nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
struct nfsstate **stpp)
{
struct nfsstate *stp;
*stpp = NULL;
LIST_FOREACH(stp, hp, ls_list) {
if (new_stp->ls_ownerlen == stp->ls_ownerlen &&
!NFSBCMP(new_stp->ls_owner,stp->ls_owner,stp->ls_ownerlen)) {
*stpp = stp;
return;
}
}
}
/*
* Lock control function called to update lock status.
* Returns 0 upon success, -1 if there is no lock and the flags indicate
* that one isn't to be created, or an NFSERR_xxx for other errors.
* The structures new_stp and new_lop are passed in as pointers that should
* be set to NULL if the structure is used and shouldn't be free'd.
* For the NFSLCK_TEST and NFSLCK_CHECK cases, the structures are
* never used and can safely be allocated on the stack. For all other
* cases, *new_stpp and *new_lopp should be malloc'd before the call,
* in case they are used.
*/
APPLESTATIC int
nfsrv_lockctrl(vnode_t vp, struct nfsstate **new_stpp,
struct nfslock **new_lopp, struct nfslockconflict *cfp,
nfsquad_t clientid, nfsv4stateid_t *stateidp,
__unused struct nfsexstuff *exp,
struct nfsrv_descript *nd, NFSPROC_T *p)
{
struct nfslock *lop;
struct nfsstate *new_stp = *new_stpp;
struct nfslock *new_lop = *new_lopp;
struct nfsstate *tstp, *mystp, *nstp;
int specialid = 0;
struct nfslockfile *lfp;
struct nfslock *other_lop = NULL;
struct nfsstate *stp, *lckstp = NULL;
struct nfsclient *clp = NULL;
u_int32_t bits;
int error = 0, haslock = 0, ret, reterr;
int getlckret, delegation = 0, filestruct_locked, vnode_unlocked = 0;
fhandle_t nfh;
uint64_t first, end;
uint32_t lock_flags;
if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_SETATTR)) {
/*
* Note the special cases of "all 1s" or "all 0s" stateids and
* let reads with all 1s go ahead.
*/
if (new_stp->ls_stateid.seqid == 0x0 &&
new_stp->ls_stateid.other[0] == 0x0 &&
new_stp->ls_stateid.other[1] == 0x0 &&
new_stp->ls_stateid.other[2] == 0x0)
specialid = 1;
else if (new_stp->ls_stateid.seqid == 0xffffffff &&
new_stp->ls_stateid.other[0] == 0xffffffff &&
new_stp->ls_stateid.other[1] == 0xffffffff &&
new_stp->ls_stateid.other[2] == 0xffffffff)
specialid = 2;
}
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, specialid);
if (error)
goto out;
/*
* Check for state resource limit exceeded.
*/
if ((new_stp->ls_flags & NFSLCK_LOCK) &&
nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
/*
* For the lock case, get another nfslock structure,
* just in case we need it.
* Malloc now, before we start sifting through the linked lists,
* in case we have to wait for memory.
*/
tryagain:
if (new_stp->ls_flags & NFSLCK_LOCK)
MALLOC(other_lop, struct nfslock *, sizeof (struct nfslock),
M_NFSDLOCK, M_WAITOK);
filestruct_locked = 0;
reterr = 0;
lfp = NULL;
/*
* Get the lockfile structure for CFH now, so we can do a sanity
* check against the stateid, before incrementing the seqid#, since
* we want to return NFSERR_BADSTATEID on failure and the seqid#
* shouldn't be incremented for this case.
* If nfsrv_getlockfile() returns -1, it means "not found", which
* will be handled later.
* If we are doing Lock/LockU and local locking is enabled, sleep
* lock the nfslockfile structure.
*/
getlckret = nfsrv_getlockfh(vp, new_stp->ls_flags, NULL, &nfh, p);
NFSLOCKSTATE();
if (getlckret == 0) {
if ((new_stp->ls_flags & (NFSLCK_LOCK | NFSLCK_UNLOCK)) != 0 &&
nfsrv_dolocallocks != 0 && nd->nd_repstat == 0) {
getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
&lfp, &nfh, 1);
if (getlckret == 0)
filestruct_locked = 1;
} else
getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
&lfp, &nfh, 0);
}
if (getlckret != 0 && getlckret != -1)
reterr = getlckret;
if (filestruct_locked != 0) {
LIST_INIT(&lfp->lf_rollback);
if ((new_stp->ls_flags & NFSLCK_LOCK)) {
/*
* For local locking, do the advisory locking now, so
* that any conflict can be detected. A failure later
* can be rolled back locally. If an error is returned,
* struct nfslockfile has been unlocked and any local
* locking rolled back.
*/
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl1");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
reterr = nfsrv_locallock(vp, lfp,
(new_lop->lo_flags & (NFSLCK_READ | NFSLCK_WRITE)),
new_lop->lo_first, new_lop->lo_end, cfp, p);
NFSLOCKSTATE();
}
}
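/*
* For a regular stateid, look up the client and the state it
* refers to and check the seqid#. The special stateids skip
* these checks.
*/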
if (specialid == 0) {
if (new_stp->ls_flags & NFSLCK_TEST) {
/*
* RFC 3530 does not list LockT as an op that renews a
* lease, but the consensus seems to be that it is ok
* for a server to do so.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
/*
* Since NFSERR_EXPIRED and NFSERR_ADMINREVOKED are not valid
* error returns for LockT, just go ahead and test for a lock;
* there are no locks for this client, but locks held by other
* clients can still conflict. (i.e. the same-client check will
* always be false)
*/
if (error == NFSERR_EXPIRED || error == NFSERR_ADMINREVOKED)
error = 0;
lckstp = new_stp;
} else {
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error == 0)
/*
* Look up the stateid
*/
error = nfsrv_getstate(clp, &new_stp->ls_stateid,
new_stp->ls_flags, &stp);
/*
* For an open stateid, do some sanity checks: the open must be
* confirmed and the stateid must refer to this file
*/
if (error == 0 && (stp->ls_flags & NFSLCK_OPEN) &&
((stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM) ||
(getlckret == 0 && stp->ls_lfp != lfp))){
/*
* NFSLCK_SETATTR should return OK rather than NFSERR_BADSTATEID.
* The only exception is using SETATTR with SIZE.
*/
if ((new_stp->ls_flags &
(NFSLCK_SETATTR | NFSLCK_CHECK)) != NFSLCK_SETATTR)
error = NFSERR_BADSTATEID;
}
if (error == 0 &&
(stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) &&
getlckret == 0 && stp->ls_lfp != lfp)
error = NFSERR_BADSTATEID;
/*
* If the lockowner stateid doesn't refer to the same file,
* I believe that is considered ok, since some clients will
* only create a single lockowner and use that for all locks
* on all files.
* For now, log it as a diagnostic, instead of considering it
* a BadStateid.
*/
if (error == 0 && (stp->ls_flags &
(NFSLCK_OPEN | NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) == 0 &&
getlckret == 0 && stp->ls_lfp != lfp) {
#ifdef DIAGNOSTIC
printf("Got a lock stateid for different file open\n");
#endif
/*
error = NFSERR_BADSTATEID;
*/
}
if (error == 0) {
if (new_stp->ls_flags & NFSLCK_OPENTOLOCK) {
/*
* If haslock set, we've already checked the seqid.
*/
if (!haslock) {
if (stp->ls_flags & NFSLCK_OPEN)
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp->ls_openowner, new_stp->ls_op);
else
error = NFSERR_BADSTATEID;
}
if (!error)
nfsrv_getowner(&stp->ls_open, new_stp, &lckstp);
if (lckstp)
/*
* I believe this should be an error, but it
* isn't obvious what NFSERR_xxx would be
* appropriate, so I'll use NFSERR_INVAL for now.
*/
error = NFSERR_INVAL;
else
lckstp = new_stp;
} else if (new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK)) {
/*
* If haslock set, ditto above.
*/
if (!haslock) {
if (stp->ls_flags & NFSLCK_OPEN)
error = NFSERR_BADSTATEID;
else
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp, new_stp->ls_op);
}
lckstp = stp;
} else {
lckstp = stp;
}
}
/*
* If the seqid part of the stateid isn't the same, return
* NFSERR_OLDSTATEID for cases other than I/O Ops.
* For I/O Ops, only return NFSERR_OLDSTATEID if
* nfsrv_returnoldstateid is set. (The consensus on the email
* list was that most clients would prefer to not receive
* NFSERR_OLDSTATEID for I/O Ops, but the RFC suggests that that
* is what will happen, so I use the nfsrv_returnoldstateid to
* allow for either server configuration.)
*/
if (!error && stp->ls_stateid.seqid!=new_stp->ls_stateid.seqid &&
(((nd->nd_flag & ND_NFSV41) == 0 &&
(!(new_stp->ls_flags & NFSLCK_CHECK) ||
nfsrv_returnoldstateid)) ||
((nd->nd_flag & ND_NFSV41) != 0 &&
new_stp->ls_stateid.seqid != 0)))
error = NFSERR_OLDSTATEID;
}
}
/*
* Now we can check for grace.
*/
if (!error)
error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
nfsrv_checkstable(clp))
error = NFSERR_NOGRACE;
/*
* If we successfully Reclaimed state, note that.
*/
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error)
nfsrv_markstable(clp);
/*
* At this point, either error == NFSERR_BADSTATEID or the
* seqid# has been updated, so we can return any error.
* If error == 0, there may be an error in:
* nd_repstat - Set by the calling function.
* reterr - Set above, if getting the nfslockfile structure
* or acquiring the local lock failed.
* (If both of these are set, nd_repstat should probably be
* returned, since that error was detected before this
* function call.)
*/
if (error != 0 || nd->nd_repstat != 0 || reterr != 0) {
if (error == 0) {
if (nd->nd_repstat != 0)
error = nd->nd_repstat;
else
error = reterr;
}
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl2");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
goto out;
}
/*
* Check the nfsrv_getlockfile return.
* Returned -1 if no structure found.
*/
if (getlckret == -1) {
error = NFSERR_EXPIRED;
/*
* Called from lockt, so no lock is OK.
*/
if (new_stp->ls_flags & NFSLCK_TEST) {
error = 0;
} else if (new_stp->ls_flags &
(NFSLCK_CHECK | NFSLCK_SETATTR)) {
/*
* Called to check for a lock, OK if the stateid is all
* 1s or all 0s, but there should be an nfsstate
* otherwise.
* (ie. If there is no open, I'll assume no share
* deny bits.)
*/
if (specialid)
error = 0;
else
error = NFSERR_BADSTATEID;
}
NFSUNLOCKSTATE();
goto out;
}
/*
* For NFSLCK_CHECK and NFSLCK_LOCK, test for a share conflict.
* For NFSLCK_CHECK, allow a read if write access is granted,
* but check for a deny. For NFSLCK_LOCK, require correct access,
* which implies a conflicting deny can't exist.
*/
if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_LOCK)) {
/*
* Four kinds of state id:
* - specialid (all 0s or all 1s), only for NFSLCK_CHECK
* - stateid for an open
* - stateid for a delegation
* - stateid for a lock owner
*/
if (!specialid) {
if (stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
delegation = 1;
mystp = stp;
nfsrv_delaydelegtimeout(stp);
} else if (stp->ls_flags & NFSLCK_OPEN) {
mystp = stp;
} else {
mystp = stp->ls_openstp;
}
/*
* If locking or checking, require correct access
* bit set.
*/
if (((new_stp->ls_flags & NFSLCK_LOCK) &&
!((new_lop->lo_flags >> NFSLCK_LOCKSHIFT) &
mystp->ls_flags & NFSLCK_ACCESSBITS)) ||
((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_READACCESS)) ==
(NFSLCK_CHECK | NFSLCK_READACCESS) &&
!(mystp->ls_flags & NFSLCK_READACCESS) &&
nfsrv_allowreadforwriteopen == 0) ||
((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_WRITEACCESS)) ==
(NFSLCK_CHECK | NFSLCK_WRITEACCESS) &&
!(mystp->ls_flags & NFSLCK_WRITEACCESS))) {
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp,
"nfsrv_lockctrl3");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
error = NFSERR_OPENMODE;
goto out;
}
} else
mystp = NULL;
if ((new_stp->ls_flags & NFSLCK_CHECK) && !delegation) {
/*
* Check for a conflicting deny bit.
*/
LIST_FOREACH(tstp, &lfp->lf_open, ls_file) {
if (tstp != mystp) {
bits = tstp->ls_flags;
bits >>= NFSLCK_SHIFT;
if (new_stp->ls_flags & bits & NFSLCK_ACCESSBITS) {
KASSERT(vnode_unlocked == 0,
("nfsrv_lockctrl: vnode unlocked1"));
ret = nfsrv_clientconflict(tstp->ls_clp, &haslock,
vp, p);
if (ret == 1) {
/*
* nfsrv_clientconflict unlocks state
* when it returns non-zero.
*/
lckstp = NULL;
goto tryagain;
}
if (ret == 0)
NFSUNLOCKSTATE();
if (ret == 2)
error = NFSERR_PERM;
else
error = NFSERR_OPENMODE;
goto out;
}
}
}
/* We're outta here */
NFSUNLOCKSTATE();
goto out;
}
}
/*
* For setattr, just get rid of all the Delegations for other clients.
*/
if (new_stp->ls_flags & NFSLCK_SETATTR) {
KASSERT(vnode_unlocked == 0,
("nfsrv_lockctrl: vnode unlocked2"));
ret = nfsrv_cleandeleg(vp, lfp, clp, &haslock, p);
if (ret) {
/*
* nfsrv_cleandeleg() unlocks state when it
* returns non-zero.
*/
if (ret == -1) {
lckstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
if (!(new_stp->ls_flags & NFSLCK_CHECK) ||
(LIST_EMPTY(&lfp->lf_open) && LIST_EMPTY(&lfp->lf_lock) &&
LIST_EMPTY(&lfp->lf_deleg))) {
NFSUNLOCKSTATE();
goto out;
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* I currently believe the conflict algorithm to be:
* For Lock Ops (Lock/LockT/LockU)
* - there is a conflict iff a different client has a write delegation
* For Reading (Read Op)
* - there is a conflict iff a different client has a write delegation
* (the specialids are always a different client)
* For Writing (Write/Setattr of size)
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (I don't understand why this isn't allowed, but that seems to be
* the current consensus?)
*/
tstp = LIST_FIRST(&lfp->lf_deleg);
while (tstp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(tstp, ls_file);
if ((((new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK|NFSLCK_TEST))||
((new_stp->ls_flags & NFSLCK_CHECK) &&
(new_lop->lo_flags & NFSLCK_READ))) &&
clp != tstp->ls_clp &&
(tstp->ls_flags & NFSLCK_DELEGWRITE)) ||
((new_stp->ls_flags & NFSLCK_CHECK) &&
(new_lop->lo_flags & NFSLCK_WRITE) &&
(clp != tstp->ls_clp ||
(tstp->ls_flags & NFSLCK_DELEGREAD)))) {
ret = 0;
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl4");
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
vnode_unlocked = 0;
if ((vp->v_iflag & VI_DOOMED) != 0)
ret = NFSERR_SERVERFAULT;
NFSLOCKSTATE();
}
if (ret == 0)
ret = nfsrv_delegconflict(tstp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict unlocks state when it
* returns non-zero, which it always does.
*/
if (other_lop) {
FREE((caddr_t)other_lop, M_NFSDLOCK);
other_lop = NULL;
}
if (ret == -1) {
lckstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
/* Never gets here. */
}
tstp = nstp;
}
/*
* Handle the unlock case by calling nfsrv_updatelock().
* (Should I have done some access checking above for unlock? For now,
* just let it happen.)
*/
if (new_stp->ls_flags & NFSLCK_UNLOCK) {
first = new_lop->lo_first;
end = new_lop->lo_end;
nfsrv_updatelock(stp, new_lopp, &other_lop, lfp);
stateidp->seqid = ++(stp->ls_stateid.seqid);
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = stp->ls_stateid.seqid = 1;
stateidp->other[0] = stp->ls_stateid.other[0];
stateidp->other[1] = stp->ls_stateid.other[1];
stateidp->other[2] = stp->ls_stateid.other[2];
if (filestruct_locked != 0) {
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl5");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
/* Update the local locks. */
nfsrv_localunlock(vp, lfp, first, end, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
goto out;
}
/*
* Search for a conflicting lock. A lock conflicts if:
* - the lock range overlaps and
* - at least one lock is a write lock and
* - it is not owned by the same lock owner
*/
if (!delegation) {
LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
if (new_lop->lo_end > lop->lo_first &&
new_lop->lo_first < lop->lo_end &&
(new_lop->lo_flags == NFSLCK_WRITE ||
lop->lo_flags == NFSLCK_WRITE) &&
lckstp != lop->lo_stp &&
(clp != lop->lo_stp->ls_clp ||
lckstp->ls_ownerlen != lop->lo_stp->ls_ownerlen ||
NFSBCMP(lckstp->ls_owner, lop->lo_stp->ls_owner,
lckstp->ls_ownerlen))) {
if (other_lop) {
FREE((caddr_t)other_lop, M_NFSDLOCK);
other_lop = NULL;
}
if (vnode_unlocked != 0)
ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
NULL, p);
else
ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
vp, p);
if (ret == 1) {
if (filestruct_locked != 0) {
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl6");
NFSVOPUNLOCK(vp, 0);
}
/* Roll back local locks. */
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
vnode_unlocked = 0;
if ((vp->v_iflag & VI_DOOMED) != 0) {
error = NFSERR_SERVERFAULT;
goto out;
}
}
/*
* nfsrv_clientconflict() unlocks state when it
* returns non-zero.
*/
lckstp = NULL;
goto tryagain;
}
/*
* Found a conflicting lock, so record the conflict and
* return the error.
*/
if (cfp != NULL && ret == 0) {
cfp->cl_clientid.lval[0]=lop->lo_stp->ls_stateid.other[0];
cfp->cl_clientid.lval[1]=lop->lo_stp->ls_stateid.other[1];
cfp->cl_first = lop->lo_first;
cfp->cl_end = lop->lo_end;
cfp->cl_flags = lop->lo_flags;
cfp->cl_ownerlen = lop->lo_stp->ls_ownerlen;
NFSBCOPY(lop->lo_stp->ls_owner, cfp->cl_owner,
cfp->cl_ownerlen);
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else if (new_stp->ls_flags & NFSLCK_CHECK)
error = NFSERR_LOCKED;
else
error = NFSERR_DENIED;
if (filestruct_locked != 0 && ret == 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl7");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
if (ret == 0)
NFSUNLOCKSTATE();
goto out;
}
}
}
/*
* We only get here if there was no lock that conflicted.
*/
if (new_stp->ls_flags & (NFSLCK_TEST | NFSLCK_CHECK)) {
NFSUNLOCKSTATE();
goto out;
}
/*
* We only get here when we are creating or modifying a lock.
* There are two variants:
* - exist_lock_owner where lock_owner exists
* - open_to_lock_owner with new lock_owner
*/
first = new_lop->lo_first;
end = new_lop->lo_end;
lock_flags = new_lop->lo_flags;
if (!(new_stp->ls_flags & NFSLCK_OPENTOLOCK)) {
nfsrv_updatelock(lckstp, new_lopp, &other_lop, lfp);
stateidp->seqid = ++(lckstp->ls_stateid.seqid);
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = lckstp->ls_stateid.seqid = 1;
stateidp->other[0] = lckstp->ls_stateid.other[0];
stateidp->other[1] = lckstp->ls_stateid.other[1];
stateidp->other[2] = lckstp->ls_stateid.other[2];
} else {
/*
* The new open_to_lock_owner case.
* Link the new nfsstate into the lists.
*/
new_stp->ls_seq = new_stp->ls_opentolockseq;
nfsrvd_refcache(new_stp->ls_op);
stateidp->seqid = new_stp->ls_stateid.seqid = 1;
stateidp->other[0] = new_stp->ls_stateid.other[0] =
clp->lc_clientid.lval[0];
stateidp->other[1] = new_stp->ls_stateid.other[1] =
clp->lc_clientid.lval[1];
stateidp->other[2] = new_stp->ls_stateid.other[2] =
nfsrv_nextstateindex(clp);
new_stp->ls_clp = clp;
LIST_INIT(&new_stp->ls_lock);
new_stp->ls_openstp = stp;
new_stp->ls_lfp = lfp;
nfsrv_insertlock(new_lop, (struct nfslock *)new_stp, new_stp,
lfp);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_stp->ls_stateid),
new_stp, ls_hash);
LIST_INSERT_HEAD(&stp->ls_open, new_stp, ls_list);
*new_lopp = NULL;
*new_stpp = NULL;
nfsstatsv1.srvlockowners++;
nfsrv_openpluslock++;
}
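/*
* Commit the local advisory locks now that the NFSv4 lock
* state has been updated.
*/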
if (filestruct_locked != 0) {
NFSUNLOCKSTATE();
nfsrv_locallock_commit(lfp, lock_flags, first, end);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
out:
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
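/*
* If the vnode was unlocked for local locking, relock it and
* check that it has not been doomed in the meantime.
*/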
if (vnode_unlocked != 0) {
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
error = NFSERR_SERVERFAULT;
}
if (other_lop)
FREE((caddr_t)other_lop, M_NFSDLOCK);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check for state errors for Open.
* repstat is passed back out as an error if more critical errors
* are not detected.
*/
APPLESTATIC int
nfsrv_opencheck(nfsquad_t clientid, nfsv4stateid_t *stateidp,
struct nfsstate *new_stp, vnode_t vp, struct nfsrv_descript *nd,
NFSPROC_T *p, int repstat)
{
struct nfsstate *stp, *nstp;
struct nfsclient *clp;
struct nfsstate *ownerstp;
struct nfslockfile *lfp, *new_lfp;
int error = 0, haslock = 0, ret, readonly = 0, getfhret = 0;
if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
readonly = 1;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
/*
* Check for state resource limit exceeded.
* Technically this should be SMP protected, but the worst
* case error is "out by one or two" on the count when it
* returns NFSERR_RESOURCE and the limit is just a rather
* arbitrary high water mark, so no harm is done.
*/
if (nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
tryagain:
MALLOC(new_lfp, struct nfslockfile *, sizeof (struct nfslockfile),
M_NFSDLOCKFILE, M_WAITOK);
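/*
* Get the file handle before acquiring the state lock. The
* new_lfp just allocated is freed below if nfsrv_getlockfile()
* does not consume it.
*/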
if (vp)
getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
NULL, p);
NFSLOCKSTATE();
/*
* Get the nfsclient structure.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
/*
* Look up the open owner. See if it needs confirmation and
* check the seq#, as required.
*/
if (!error)
nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
if (!error && ownerstp) {
error = nfsrv_checkseqid(nd, new_stp->ls_seq, ownerstp,
new_stp->ls_op);
/*
* If the OpenOwner hasn't been confirmed, assume the
* old one was a replay and this one is ok.
* See: RFC3530 Sec. 14.2.18.
*/
if (error == NFSERR_BADSEQID &&
(ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM))
error = 0;
}
/*
* Check for grace.
*/
if (!error)
error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
nfsrv_checkstable(clp))
error = NFSERR_NOGRACE;
/*
* If none of the above errors occurred, let repstat be
* returned.
*/
if (repstat && !error)
error = repstat;
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
free((caddr_t)new_lfp, M_NFSDLOCKFILE);
goto out;
}
/*
* If vp == NULL, the file doesn't exist yet, so return ok.
* (This always happens on the first pass, so haslock must be 0.)
*/
if (vp == NULL) {
NFSUNLOCKSTATE();
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
goto out;
}
/*
* Get the structure for the underlying file.
*/
if (getfhret)
error = getfhret;
else
error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
NULL, 0);
if (new_lfp)
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Search for a conflicting open/share.
*/
if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
/*
* For Delegate_Cur, search for the matching Delegation,
* which indicates no conflict.
* An old delegation should have been recovered by the
* client doing a Claim_DELEGATE_Prev, so I won't let
* it match and return NFSERR_EXPIRED. Should I let it
* match?
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
(((nd->nd_flag & ND_NFSV41) != 0 &&
stateidp->seqid == 0) ||
stateidp->seqid == stp->ls_stateid.seqid) &&
!NFSBCMP(stateidp->other, stp->ls_stateid.other,
NFSX_STATEIDOTHER))
break;
}
if (stp == LIST_END(&lfp->lf_deleg) ||
((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
(stp->ls_flags & NFSLCK_DELEGREAD))) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
}
/*
* Check for access/deny bit conflicts. I check for the same
* owner as well, in case the client didn't bother.
*/
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (!(new_stp->ls_flags & NFSLCK_DELEGCUR) &&
(((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
((stp->ls_flags & NFSLCK_ACCESSBITS) &
((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS)))){
ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
if (ret == 1) {
/*
* nfsrv_clientconflict() unlocks
* state when it returns non-zero.
*/
goto tryagain;
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else
error = NFSERR_SHAREDENIED;
if (ret == 0)
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* (If NFSLCK_DELEGCUR is set, it has a delegation, so there
* isn't a conflict.)
* I currently believe the conflict algorithm to be:
* For Open with Read Access and Deny None
* - there is a conflict iff a different client has a write delegation
* For Open with other Write Access or any Deny except None
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (The current consensus is that this last case should be
* considered a conflict since the client with a read delegation
* could have done an Open with ReadAccess and WriteDeny
* locally and then not have checked for the WriteDeny.)
* Don't check for a Reclaim, since that will be dealt with
* by nfsrv_openctrl().
*/
if (!(new_stp->ls_flags &
(NFSLCK_DELEGPREV | NFSLCK_DELEGCUR | NFSLCK_RECLAIM))) {
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if ((readonly && stp->ls_clp != clp &&
(stp->ls_flags & NFSLCK_DELEGWRITE)) ||
(!readonly && (stp->ls_clp != clp ||
(stp->ls_flags & NFSLCK_DELEGREAD)))) {
ret = nfsrv_delegconflict(stp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
if (ret == -1)
goto tryagain;
error = ret;
goto out;
}
}
stp = nstp;
}
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Open control function to create/update open state for an open.
*/
APPLESTATIC int
nfsrv_openctrl(struct nfsrv_descript *nd, vnode_t vp,
struct nfsstate **new_stpp, nfsquad_t clientid, nfsv4stateid_t *stateidp,
nfsv4stateid_t *delegstateidp, u_int32_t *rflagsp, struct nfsexstuff *exp,
NFSPROC_T *p, u_quad_t filerev)
{
struct nfsstate *new_stp = *new_stpp;
struct nfsstate *stp, *nstp;
struct nfsstate *openstp = NULL, *new_open, *ownerstp, *new_deleg;
struct nfslockfile *lfp, *new_lfp;
struct nfsclient *clp;
int error = 0, haslock = 0, ret, delegate = 1, writedeleg = 1;
int readonly = 0, cbret = 1, getfhret = 0;
int gotstate = 0, len = 0;
u_char *clidp = NULL;
if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
readonly = 1;
/*
* Check for restart conditions (client and server).
* (Paranoia, should have been detected by nfsrv_opencheck().)
* If an error does show up, return NFSERR_EXPIRED, since the
* seqid# has already been incremented.
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error) {
printf("Nfsd: openctrl unexpected restart err=%d\n",
error);
error = NFSERR_EXPIRED;
goto out;
}
clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
tryagain:
MALLOC(new_lfp, struct nfslockfile *, sizeof (struct nfslockfile),
M_NFSDLOCKFILE, M_WAITOK);
MALLOC(new_open, struct nfsstate *, sizeof (struct nfsstate),
M_NFSDSTATE, M_WAITOK);
MALLOC(new_deleg, struct nfsstate *, sizeof (struct nfsstate),
M_NFSDSTATE, M_WAITOK);
getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
NULL, p);
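/*
* All of the above allocations are done before the state lock
* is acquired, since M_WAITOK allocations may sleep.
*/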
NFSLOCKSTATE();
/*
* Get the client structure. Since the linked lists could be changed
* by other nfsd processes if this process does a tsleep(), one of
* two things must be done.
* 1 - don't tsleep()
* or
* 2 - get the nfsv4_lock() { indicated by haslock == 1 }
* before using the lists, since this lock stops the other
* nfsd. This should only be used for rare cases, since it
* essentially single threads the nfsd.
* At this time, it is only done for cases where the stable
* storage file must be written prior to completion of state
* expiration.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (!error && (clp->lc_flags & LCL_NEEDSCBNULL) &&
clp->lc_program) {
/*
* This happens on the first open for a client
* that supports callbacks.
*/
NFSUNLOCKSTATE();
/*
* Although nfsrv_docallback() will sleep, clp won't
* go away, since they are only removed when the
* nfsv4_lock() has blocked the nfsd threads. The
* fields in clp can change, but having multiple
* threads do this Null callback RPC should be
* harmless.
*/
cbret = nfsrv_docallback(clp, NFSV4PROC_CBNULL,
NULL, 0, NULL, NULL, NULL, p);
NFSLOCKSTATE();
clp->lc_flags &= ~LCL_NEEDSCBNULL;
if (!cbret)
clp->lc_flags |= LCL_CALLBACKSON;
}
/*
* Look up the open owner. See if it needs confirmation and
* check the seq#, as required.
*/
if (!error)
nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
if (error) {
NFSUNLOCKSTATE();
printf("Nfsd: openctrl unexpected state err=%d\n",
error);
free((caddr_t)new_lfp, M_NFSDLOCKFILE);
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
if (new_stp->ls_flags & NFSLCK_RECLAIM)
nfsrv_markstable(clp);
/*
* Get the structure for the underlying file.
*/
if (getfhret)
error = getfhret;
else
error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
NULL, 0);
if (new_lfp)
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
if (error) {
NFSUNLOCKSTATE();
printf("Nfsd openctrl unexpected getlockfile err=%d\n",
error);
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Search for a conflicting open/share.
*/
if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
/*
* For Delegate_Cur, search for the matching Delegation,
* which indicates no conflict.
* An old delegation should have been recovered by the
* client doing a Claim_DELEGATE_Prev, so I won't let
* it match and return NFSERR_EXPIRED. Should I let it
* match?
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
(((nd->nd_flag & ND_NFSV41) != 0 &&
stateidp->seqid == 0) ||
stateidp->seqid == stp->ls_stateid.seqid) &&
!NFSBCMP(stateidp->other, stp->ls_stateid.other,
NFSX_STATEIDOTHER))
break;
}
if (stp == LIST_END(&lfp->lf_deleg) ||
((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
(stp->ls_flags & NFSLCK_DELEGREAD))) {
NFSUNLOCKSTATE();
printf("Nfsd openctrl unexpected expiry\n");
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
/*
* Don't issue a Delegation, since one already exists, and
* delay the delegation timeout, as required.
*/
delegate = 0;
nfsrv_delaydelegtimeout(stp);
}
/*
* Check for access/deny bit conflicts. I also check for the
* same owner, since the client might not have bothered to check.
* Also, note an open for the same file and owner, if found,
* which is all we do here for Delegate_Cur, since conflict
* checking is already done.
*/
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (ownerstp && stp->ls_openowner == ownerstp)
openstp = stp;
if (!(new_stp->ls_flags & NFSLCK_DELEGCUR)) {
/*
* If another client has the file open, the only
* delegation that can be issued is a Read delegation
* and only if it is a Read open with Deny none.
*/
if (clp != stp->ls_clp) {
if ((stp->ls_flags & NFSLCK_SHAREBITS) ==
NFSLCK_READACCESS)
writedeleg = 0;
else
delegate = 0;
}
if(((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
((stp->ls_flags & NFSLCK_ACCESSBITS) &
((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS))){
ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
if (ret == 1) {
/*
* nfsrv_clientconflict() unlocks state
* when it returns non-zero.
*/
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
openstp = NULL;
goto tryagain;
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else
error = NFSERR_SHAREDENIED;
if (ret == 0)
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
printf("nfsd openctrl unexpected client cnfl\n");
goto out;
}
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* (If NFSLCK_DELEGCUR is set, it has a delegation, so there
* isn't a conflict.)
* I currently believe the conflict algorithm to be:
* For Open with Read Access and Deny None
* - there is a conflict iff a different client has a write delegation
* For Open with other Write Access or any Deny except None
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (The current consensus is that this last case should be
* considered a conflict since the client with a read delegation
* could have done an Open with ReadAccess and WriteDeny
* locally and then not have checked for the WriteDeny.)
*/
if (!(new_stp->ls_flags & (NFSLCK_DELEGPREV | NFSLCK_DELEGCUR))) {
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if (stp->ls_clp != clp && (stp->ls_flags & NFSLCK_DELEGREAD))
writedeleg = 0;
else
delegate = 0;
if ((readonly && stp->ls_clp != clp &&
(stp->ls_flags & NFSLCK_DELEGWRITE)) ||
(!readonly && (stp->ls_clp != clp ||
(stp->ls_flags & NFSLCK_DELEGREAD)))) {
if (new_stp->ls_flags & NFSLCK_RECLAIM) {
delegate = 2;
} else {
ret = nfsrv_delegconflict(stp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
printf("Nfsd openctrl unexpected deleg cnfl\n");
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (ret == -1) {
openstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
}
}
stp = nstp;
}
}
/*
* We only get here if there was no open that conflicted.
* If an open for this owner already exists, merge the new
* access/deny bits into it.
* Otherwise it is a new open. If the open_owner hasn't been
* confirmed, replace the open with the new one needing confirmation,
* otherwise add the open.
*/
if (new_stp->ls_flags & NFSLCK_DELEGPREV) {
/*
* Handle NFSLCK_DELEGPREV by searching the old delegations for
* a match. If found, just move the old delegation to the current
* delegation list and issue open. If not found, return
* NFSERR_EXPIRED.
*/
LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
if (stp->ls_lfp == lfp) {
/* Found it */
if (stp->ls_clp != clp)
panic("olddeleg clp");
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_hash);
stp->ls_flags &= ~NFSLCK_OLDDELEG;
stp->ls_stateid.seqid = delegstateidp->seqid = 1;
stp->ls_stateid.other[0] = delegstateidp->other[0] =
clp->lc_clientid.lval[0];
stp->ls_stateid.other[1] = delegstateidp->other[1] =
clp->lc_clientid.lval[1];
stp->ls_stateid.other[2] = delegstateidp->other[2] =
nfsrv_nextstateindex(clp);
stp->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&clp->lc_deleg, stp, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
stp->ls_stateid), stp, ls_hash);
if (stp->ls_flags & NFSLCK_DELEGWRITE)
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
else
*rflagsp |= NFSV4OPEN_READDELEGATE;
clp->lc_delegtime = NFSD_MONOSEC +
nfsrv_lease + NFSRV_LEASEDELTA;
/*
* Now, do the associated open.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags&NFSLCK_DENYBITS)|
NFSLCK_OPEN;
if (stp->ls_flags & NFSLCK_DELEGWRITE)
new_open->ls_flags |= (NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
else
new_open->ls_flags |= NFSLCK_READACCESS;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
/*
* and handle the open owner
*/
if (ownerstp) {
new_open->ls_openowner = ownerstp;
LIST_INSERT_HEAD(&ownerstp->ls_open,new_open,ls_list);
} else {
new_open->ls_openowner = new_stp;
new_stp->ls_flags = 0;
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
*new_stpp = NULL;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
break;
}
}
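/* If no matching old delegation was found, the reclaim fails. */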
if (stp == LIST_END(&clp->lc_olddeleg))
error = NFSERR_EXPIRED;
} else if (new_stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
/*
* Scan to make sure that no delegation for this client and file
* already exists.
* There also shouldn't yet be an Open for this file and
* openowner.
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (stp->ls_clp == clp)
break;
}
if (stp == LIST_END(&lfp->lf_deleg) && openstp == NULL) {
/*
* This is the Claim_Previous case with a delegation
* type != Delegate_None.
*/
/*
* First, add the delegation. (Although we must issue the
* delegation, we can also ask for an immediate return.)
*/
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0] =
clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1] =
clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2] =
nfsrv_nextstateindex(clp);
if (new_stp->ls_flags & NFSLCK_DELEGWRITE) {
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags = (NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
if (delegate == 2 || nfsrv_issuedelegs == 0 ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) ||
!NFSVNO_DELEGOK(vp))
*rflagsp |= NFSV4OPEN_RECALL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
/*
* Now, do the associated open.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_DENYBITS) |
NFSLCK_OPEN;
if (new_stp->ls_flags & NFSLCK_DELEGWRITE)
new_open->ls_flags |= (NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
else
new_open->ls_flags |= NFSLCK_READACCESS;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
/*
* and handle the open owner
*/
if (ownerstp) {
new_open->ls_openowner = ownerstp;
LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
} else {
new_open->ls_openowner = new_stp;
new_stp->ls_flags = 0;
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
*new_stpp = NULL;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
} else {
error = NFSERR_RECLAIMCONFLICT;
}
} else if (ownerstp) {
if (ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM) {
/* Replace the open */
if (ownerstp->ls_op)
nfsrvd_derefcache(ownerstp->ls_op);
ownerstp->ls_op = new_stp->ls_op;
nfsrvd_refcache(ownerstp->ls_op);
ownerstp->ls_seq = new_stp->ls_seq;
*rflagsp |= NFSV4OPEN_RESULTCONFIRM;
stp = LIST_FIRST(&ownerstp->ls_open);
stp->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
NFSLCK_OPEN;
stp->ls_stateid.seqid = 1;
stp->ls_uid = new_stp->ls_uid;
if (lfp != stp->ls_lfp) {
LIST_REMOVE(stp, ls_file);
LIST_INSERT_HEAD(&lfp->lf_open, stp, ls_file);
stp->ls_lfp = lfp;
}
openstp = stp;
} else if (openstp) {
openstp->ls_flags |= (new_stp->ls_flags & NFSLCK_SHAREBITS);
openstp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
openstp->ls_stateid.seqid == 0)
openstp->ls_stateid.seqid = 1;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate == 0 || writedeleg == 0 ||
NFSVNO_EXRDONLY(exp) || (readonly != 0 &&
nfsrv_writedelegifpos == 0) ||
!NFSVNO_DELEGOK(vp) ||
(new_stp->ls_flags & NFSLCK_WANTRDELEG) != 0 ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON)
*rflagsp |= NFSV4OPEN_WDCONTENTION;
else if (nfsrv_issuedelegs == 0 ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
*rflagsp |= NFSV4OPEN_WDRESOURCE;
else if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
*rflagsp |= NFSV4OPEN_WDNOTWANTED;
else {
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
nfsrv_writedelegcnt++;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
} else {
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS)|
NFSLCK_OPEN;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_openowner = ownerstp;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate == 0 || (writedeleg == 0 && readonly == 0) ||
!NFSVNO_DELEGOK(vp) ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON)
*rflagsp |= NFSV4OPEN_WDCONTENTION;
else if (nfsrv_issuedelegs == 0 ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
*rflagsp |= NFSV4OPEN_WDRESOURCE;
else if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
*rflagsp |= NFSV4OPEN_WDNOTWANTED;
else {
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
(nfsrv_writedelegifpos || !readonly) &&
(new_stp->ls_flags & NFSLCK_WANTRDELEG) == 0) {
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags = (NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
}
} else {
/*
* New owner case. Start the open_owner sequence with a
* Needs confirmation (unless a reclaim) and hang the
* new open off it.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
NFSLCK_OPEN;
new_open->ls_uid = new_stp->ls_uid;
LIST_INIT(&new_open->ls_open);
new_open->ls_openowner = new_stp;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
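/*
* Decide whether the new open_owner needs confirmation.
* Reclaims and NFSv4.1 opens never do.
*/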
if (new_stp->ls_flags & NFSLCK_RECLAIM) {
new_stp->ls_flags = 0;
} else if ((nd->nd_flag & ND_NFSV41) != 0) {
/* NFSv4.1 never needs confirmation. */
new_stp->ls_flags = 0;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate && nfsrv_issuedelegs &&
(writedeleg || readonly) &&
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) ==
LCL_CALLBACKSON &&
!NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) &&
NFSVNO_DELEGOK(vp) &&
((nd->nd_flag & ND_NFSV41) == 0 ||
(new_stp->ls_flags & NFSLCK_WANTNODELEG) == 0)) {
new_deleg->ls_stateid.seqid =
delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] =
delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] =
delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] =
delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
(nfsrv_writedelegifpos || !readonly) &&
((nd->nd_flag & ND_NFSV41) == 0 ||
(new_stp->ls_flags & NFSLCK_WANTRDELEG) ==
0)) {
new_deleg->ls_flags =
(NFSLCK_DELEGWRITE |
NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags =
(NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg,
ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg,
ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
/*
* Since NFSv4.1 never does an OpenConfirm, the first
* open state will be acquired here.
*/
if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
clp->lc_flags |= LCL_STAMPEDSTABLE;
len = clp->lc_idlen;
NFSBCOPY(clp->lc_id, clidp, len);
gotstate = 1;
}
} else {
*rflagsp |= NFSV4OPEN_RESULTCONFIRM;
new_stp->ls_flags = NFSLCK_NEEDSCONFIRM;
}
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
openstp = new_open;
new_open = NULL;
*new_stpp = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
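/* Return the stateid of the open that was added or updated. */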
if (!error) {
stateidp->seqid = openstp->ls_stateid.seqid;
stateidp->other[0] = openstp->ls_stateid.other[0];
stateidp->other[1] = openstp->ls_stateid.other[1];
stateidp->other[2] = openstp->ls_stateid.other[2];
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
if (new_open)
FREE((caddr_t)new_open, M_NFSDSTATE);
if (new_deleg)
FREE((caddr_t)new_deleg, M_NFSDSTATE);
/*
* If the NFSv4.1 client just acquired its first open, write a timestamp
* to the stable storage file.
*/
if (gotstate != 0) {
nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
nfsrv_backupstable();
}
out:
free(clidp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Open update. Does the confirm, downgrade and close.
*/
APPLESTATIC int
nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid,
nfsv4stateid_t *stateidp, struct nfsrv_descript *nd, NFSPROC_T *p)
{
- struct nfsstate *stp, *ownerstp;
+ struct nfsstate *stp;
struct nfsclient *clp;
struct nfslockfile *lfp;
u_int32_t bits;
int error = 0, gotstate = 0, len = 0;
u_char *clidp = NULL;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
NFSLOCKSTATE();
/*
* Get the open structure via clientid and stateid.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (!error)
error = nfsrv_getstate(clp, &new_stp->ls_stateid,
new_stp->ls_flags, &stp);
/*
* Sanity check the open.
*/
if (!error && (!(stp->ls_flags & NFSLCK_OPEN) ||
(!(new_stp->ls_flags & NFSLCK_CONFIRM) &&
(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)) ||
((new_stp->ls_flags & NFSLCK_CONFIRM) &&
(!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)))))
error = NFSERR_BADSTATEID;
if (!error)
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp->ls_openowner, new_stp->ls_op);
if (!error && stp->ls_stateid.seqid != new_stp->ls_stateid.seqid &&
(((nd->nd_flag & ND_NFSV41) == 0 &&
!(new_stp->ls_flags & NFSLCK_CONFIRM)) ||
((nd->nd_flag & ND_NFSV41) != 0 &&
new_stp->ls_stateid.seqid != 0)))
error = NFSERR_OLDSTATEID;
if (!error && vnode_vtype(vp) != VREG) {
if (vnode_vtype(vp) == VDIR)
error = NFSERR_ISDIR;
else
error = NFSERR_INVAL;
}
if (error) {
/*
* If a client tries to confirm an Open with a bad
* seqid# and there are no byte range locks or other Opens
* on the openowner, just throw it away, so the next use of the
* openowner will start a fresh seq#.
*/
if (error == NFSERR_BADSEQID &&
(new_stp->ls_flags & NFSLCK_CONFIRM) &&
nfsrv_nootherstate(stp))
nfsrv_freeopenowner(stp->ls_openowner, 0, p);
NFSUNLOCKSTATE();
goto out;
}
/*
* Set the return stateid.
*/
stateidp->seqid = stp->ls_stateid.seqid + 1;
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = 1;
stateidp->other[0] = stp->ls_stateid.other[0];
stateidp->other[1] = stp->ls_stateid.other[1];
stateidp->other[2] = stp->ls_stateid.other[2];
/*
* Now, handle the three cases.
*/
if (new_stp->ls_flags & NFSLCK_CONFIRM) {
/*
* If the open doesn't need confirmation, it seems to me that
* there is a client error, but I'll just log it and keep going?
*/
if (!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM))
printf("Nfsv4d: stray open confirm\n");
stp->ls_openowner->ls_flags = 0;
stp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
stp->ls_stateid.seqid == 0)
stp->ls_stateid.seqid = 1;
if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
clp->lc_flags |= LCL_STAMPEDSTABLE;
len = clp->lc_idlen;
NFSBCOPY(clp->lc_id, clidp, len);
gotstate = 1;
}
NFSUNLOCKSTATE();
} else if (new_stp->ls_flags & NFSLCK_CLOSE) {
- ownerstp = stp->ls_openowner;
lfp = stp->ls_lfp;
if (nfsrv_dolocallocks != 0 && !LIST_EMPTY(&stp->ls_open)) {
/* Get the lf lock */
nfsrv_locklf(lfp);
NFSUNLOCKSTATE();
ASSERT_VOP_ELOCKED(vp, "nfsrv_openupdate");
NFSVOPUNLOCK(vp, 0);
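/*
* nfsrv_freeopen() returns non-zero when it also frees the
* nfslockfile, in which case the lf lock no longer needs to be
* released.
*/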
if (nfsrv_freeopen(stp, vp, 1, p) == 0) {
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
}
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
} else {
(void) nfsrv_freeopen(stp, NULL, 0, p);
NFSUNLOCKSTATE();
}
} else {
/*
* Update the share bits, making sure that the new set is a
* subset of the old ones.
*/
bits = (new_stp->ls_flags & NFSLCK_SHAREBITS);
if (~(stp->ls_flags) & bits) {
NFSUNLOCKSTATE();
error = NFSERR_INVAL;
goto out;
}
stp->ls_flags = (bits | NFSLCK_OPEN);
stp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
stp->ls_stateid.seqid == 0)
stp->ls_stateid.seqid = 1;
NFSUNLOCKSTATE();
}
/*
* If the client just confirmed its first open, write a timestamp
* to the stable storage file.
*/
if (gotstate != 0) {
nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
nfsrv_backupstable();
}
out:
free(clidp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Delegation update. Does the purge and return.
*/
APPLESTATIC int
nfsrv_delegupdate(struct nfsrv_descript *nd, nfsquad_t clientid,
nfsv4stateid_t *stateidp, vnode_t vp, int op, struct ucred *cred,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfsclient *clp;
int error = 0;
fhandle_t fh;
/*
* Do a sanity check against the file handle for DelegReturn.
*/
if (vp) {
error = nfsvno_getfh(vp, &fh, p);
if (error)
goto out;
}
/*
* Check for restart conditions (client and server).
*/
if (op == NFSV4OP_DELEGRETURN)
error = nfsrv_checkrestart(clientid, NFSLCK_DELEGRETURN,
stateidp, 0);
else
error = nfsrv_checkrestart(clientid, NFSLCK_DELEGPURGE,
stateidp, 0);
NFSLOCKSTATE();
/*
* Get the open structure via clientid and stateid.
*/
if (!error)
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error) {
if (error == NFSERR_CBPATHDOWN)
error = 0;
if (error == NFSERR_STALECLIENTID && op == NFSV4OP_DELEGRETURN)
error = NFSERR_STALESTATEID;
}
if (!error && op == NFSV4OP_DELEGRETURN) {
error = nfsrv_getstate(clp, stateidp, NFSLCK_DELEGRETURN, &stp);
if (!error && stp->ls_stateid.seqid != stateidp->seqid &&
((nd->nd_flag & ND_NFSV41) == 0 || stateidp->seqid != 0))
error = NFSERR_OLDSTATEID;
}
/*
* NFSERR_EXPIRED means that the state has gone away,
* so Delegations have been purged. Just return ok.
*/
if (error == NFSERR_EXPIRED && op == NFSV4OP_DELEGPURGE) {
NFSUNLOCKSTATE();
error = 0;
goto out;
}
if (error) {
NFSUNLOCKSTATE();
goto out;
}
if (op == NFSV4OP_DELEGRETURN) {
if (NFSBCMP((caddr_t)&fh, (caddr_t)&stp->ls_lfp->lf_fh,
sizeof (fhandle_t))) {
NFSUNLOCKSTATE();
error = NFSERR_BADSTATEID;
goto out;
}
nfsrv_freedeleg(stp);
} else {
nfsrv_freedeleglist(&clp->lc_olddeleg);
}
NFSUNLOCKSTATE();
error = 0;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Release lock owner.
*/
APPLESTATIC int
nfsrv_releaselckown(struct nfsstate *new_stp, nfsquad_t clientid,
NFSPROC_T *p)
{
struct nfsstate *stp, *nstp, *openstp, *ownstp;
struct nfsclient *clp;
int error = 0;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
NFSLOCKSTATE();
/*
* Get the lock owner by name.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, NULL, p);
if (error) {
NFSUNLOCKSTATE();
goto out;
}
LIST_FOREACH(ownstp, &clp->lc_open, ls_list) {
LIST_FOREACH(openstp, &ownstp->ls_open, ls_list) {
stp = LIST_FIRST(&openstp->ls_open);
while (stp != LIST_END(&openstp->ls_open)) {
nstp = LIST_NEXT(stp, ls_list);
/*
* If the owner matches, check for locks and
* then free or return an error.
*/
if (stp->ls_ownerlen == new_stp->ls_ownerlen &&
!NFSBCMP(stp->ls_owner, new_stp->ls_owner,
stp->ls_ownerlen)){
if (LIST_EMPTY(&stp->ls_lock)) {
nfsrv_freelockowner(stp, NULL, 0, p);
} else {
NFSUNLOCKSTATE();
error = NFSERR_LOCKSHELD;
goto out;
}
}
stp = nstp;
}
}
}
NFSUNLOCKSTATE();
out:
NFSEXITCODE(error);
return (error);
}
/*
* Get the file handle for a lock structure.
*/
static int
nfsrv_getlockfh(vnode_t vp, u_short flags, struct nfslockfile *new_lfp,
fhandle_t *nfhp, NFSPROC_T *p)
{
fhandle_t *fhp = NULL;
int error;
/*
* For an Open, store the file handle in the new nfslockfile structure;
* otherwise use the fhandle_t supplied by the caller.
*/
if (flags & NFSLCK_OPEN) {
KASSERT(new_lfp != NULL, ("nfsrv_getlockfh: new_lfp NULL"));
fhp = &new_lfp->lf_fh;
} else if (nfhp) {
fhp = nfhp;
} else {
panic("nfsrv_getlockfh");
}
error = nfsvno_getfh(vp, fhp, p);
NFSEXITCODE(error);
return (error);
}
/*
* Get an nfslockfile structure. Allocate one, as required, and return a
* pointer to it via *lfpp.
* Returns -1 to indicate that no matching lock file currently exists.
*/
static int
nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit)
{
struct nfslockfile *lfp;
fhandle_t *fhp = NULL, *tfhp;
struct nfslockhashhead *hp;
struct nfslockfile *new_lfp = NULL;
/*
* For an Open, use the file handle in the new nfslockfile structure;
* otherwise use the fhandle_t supplied by the caller.
*/
if (flags & NFSLCK_OPEN) {
new_lfp = *new_lfpp;
fhp = &new_lfp->lf_fh;
} else if (nfhp) {
fhp = nfhp;
} else {
panic("nfsrv_getlockfile");
}
hp = NFSLOCKHASH(fhp);
LIST_FOREACH(lfp, hp, lf_hash) {
tfhp = &lfp->lf_fh;
if (NFSVNO_CMPFH(fhp, tfhp)) {
if (lockit)
nfsrv_locklf(lfp);
*lfpp = lfp;
return (0);
}
}
if (!(flags & NFSLCK_OPEN))
return (-1);
/*
* No match, so chain the new one into the list.
*/
LIST_INIT(&new_lfp->lf_open);
LIST_INIT(&new_lfp->lf_lock);
LIST_INIT(&new_lfp->lf_deleg);
LIST_INIT(&new_lfp->lf_locallock);
LIST_INIT(&new_lfp->lf_rollback);
new_lfp->lf_locallock_lck.nfslock_usecnt = 0;
new_lfp->lf_locallock_lck.nfslock_lock = 0;
new_lfp->lf_usecount = 0;
LIST_INSERT_HEAD(hp, new_lfp, lf_hash);
*lfpp = new_lfp;
*new_lfpp = NULL;
return (0);
}
/*
* This function adds an nfslock lock structure to the list for the associated
* nfsstate and nfslockfile structures. It will be inserted after the
* entry pointed at by insert_lop.
*/
static void
nfsrv_insertlock(struct nfslock *new_lop, struct nfslock *insert_lop,
struct nfsstate *stp, struct nfslockfile *lfp)
{
struct nfslock *lop, *nlop;
new_lop->lo_stp = stp;
new_lop->lo_lfp = lfp;
if (stp != NULL) {
/* Insert in increasing lo_first order */
lop = LIST_FIRST(&lfp->lf_lock);
if (lop == LIST_END(&lfp->lf_lock) ||
new_lop->lo_first <= lop->lo_first) {
LIST_INSERT_HEAD(&lfp->lf_lock, new_lop, lo_lckfile);
} else {
nlop = LIST_NEXT(lop, lo_lckfile);
while (nlop != LIST_END(&lfp->lf_lock) &&
nlop->lo_first < new_lop->lo_first) {
lop = nlop;
nlop = LIST_NEXT(lop, lo_lckfile);
}
LIST_INSERT_AFTER(lop, new_lop, lo_lckfile);
}
} else {
new_lop->lo_lckfile.le_prev = NULL; /* list not used */
}
/*
* Insert after insert_lop, which is overloaded as stp or lfp for
* an empty list.
*/
if (stp == NULL && (struct nfslockfile *)insert_lop == lfp)
LIST_INSERT_HEAD(&lfp->lf_locallock, new_lop, lo_lckowner);
else if ((struct nfsstate *)insert_lop == stp)
LIST_INSERT_HEAD(&stp->ls_lock, new_lop, lo_lckowner);
else
LIST_INSERT_AFTER(insert_lop, new_lop, lo_lckowner);
if (stp != NULL) {
nfsstatsv1.srvlocks++;
nfsrv_openpluslock++;
}
}
/*
* This function updates the locking for a lock owner and given file. It
* maintains a list of lock ranges ordered on increasing file offset that
* are NFSLCK_READ or NFSLCK_WRITE and non-overlapping (aka POSIX style).
* It always adds new_lop to the list and sometimes uses the one pointed
* at by other_lopp.
*/
static void
nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
struct nfslock **other_lopp, struct nfslockfile *lfp)
{
struct nfslock *new_lop = *new_lopp;
struct nfslock *lop, *tlop, *ilop;
struct nfslock *other_lop = *other_lopp;
int unlock = 0, myfile = 0;
u_int64_t tmp;
/*
* Work down the list until the lock is merged.
*/
if (new_lop->lo_flags & NFSLCK_UNLOCK)
unlock = 1;
if (stp != NULL) {
ilop = (struct nfslock *)stp;
lop = LIST_FIRST(&stp->ls_lock);
} else {
ilop = (struct nfslock *)lfp;
lop = LIST_FIRST(&lfp->lf_locallock);
}
while (lop != NULL) {
/*
* Only check locks for this file that aren't before the start of
* the new lock's range.
*/
if (lop->lo_lfp == lfp) {
myfile = 1;
if (lop->lo_end >= new_lop->lo_first) {
if (new_lop->lo_end < lop->lo_first) {
/*
* If the new lock ends before the start of the
* current lock's range, no merge, just insert
* the new lock.
*/
break;
}
if (new_lop->lo_flags == lop->lo_flags ||
(new_lop->lo_first <= lop->lo_first &&
new_lop->lo_end >= lop->lo_end)) {
/*
* This lock can be absorbed by the new lock/unlock.
* This happens when it covers the entire range
* of the old lock or is contiguous
* with the old lock and is of the same type or an
* unlock.
*/
if (lop->lo_first < new_lop->lo_first)
new_lop->lo_first = lop->lo_first;
if (lop->lo_end > new_lop->lo_end)
new_lop->lo_end = lop->lo_end;
tlop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
nfsrv_freenfslock(tlop);
continue;
}
/*
* All these cases are for contiguous locks that are not the
* same type, so they can't be merged.
*/
if (new_lop->lo_first <= lop->lo_first) {
/*
* This case is where the new lock overlaps with the
* first part of the old lock. Move the start of the
* old lock to just past the end of the new lock. The
* new lock will be inserted in front of the old, since
* ilop hasn't been updated. (We are done now.)
*/
lop->lo_first = new_lop->lo_end;
break;
}
if (new_lop->lo_end >= lop->lo_end) {
/*
* This case is where the new lock overlaps with the
* end of the old lock's range. Move the old lock's
* end to just before the new lock's first and insert
* the new lock after the old lock.
* Might not be done yet, since the new lock could
* overlap further locks with higher ranges.
*/
lop->lo_end = new_lop->lo_first;
ilop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
continue;
}
/*
* The final case is where the new lock's range is in the
* middle of the current lock's and splits the current lock
* up. Use *other_lopp to handle the second part of the
* split old lock range. (We are done now.)
* For unlock, we use new_lop as other_lop and tmp, since
* other_lop and new_lop are the same for this case.
* We noted the unlock case above, so we don't need
* new_lop->lo_flags any longer.
*/
tmp = new_lop->lo_first;
if (other_lop == NULL) {
if (!unlock)
panic("nfsd srv update unlock");
other_lop = new_lop;
*new_lopp = NULL;
}
other_lop->lo_first = new_lop->lo_end;
other_lop->lo_end = lop->lo_end;
other_lop->lo_flags = lop->lo_flags;
other_lop->lo_stp = stp;
other_lop->lo_lfp = lfp;
lop->lo_end = tmp;
nfsrv_insertlock(other_lop, lop, stp, lfp);
*other_lopp = NULL;
ilop = lop;
break;
}
}
ilop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
if (myfile && (lop == NULL || lop->lo_lfp != lfp))
break;
}
/*
* Insert the new lock in the list at the appropriate place.
*/
if (!unlock) {
nfsrv_insertlock(new_lop, ilop, stp, lfp);
*new_lopp = NULL;
}
}
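/*
 * Illustrative sketch only, excluded from compilation: a minimal
 * user-space model of the invariant nfsrv_updatelock() maintains, i.e.
 * a set of half-open byte ranges [first, end) per lock owner that never
 * overlap, with new locks absorbing, trimming or splitting old ones.
 * The names here (struct range, apply_lock()) are hypothetical and this
 * toy appends the new range at the end instead of keeping the list in
 * lo_first order the way the code above does.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct range { uint64_t first, end; int write; };

/* Apply a new lock over [nf, ne); absorb, trim or split existing ranges. */
static int
apply_lock(struct range *r, int n, uint64_t nf, uint64_t ne, int write)
{
	struct range out[16];	/* toy fixed-size list */
	int i, m = 0;

	for (i = 0; i < n; i++) {
		if (r[i].end <= nf || r[i].first >= ne) {
			out[m++] = r[i];	/* no overlap, keep as is */
			continue;
		}
		if (r[i].first < nf)	/* keep the head fragment */
			out[m++] = (struct range){ r[i].first, nf, r[i].write };
		if (r[i].end > ne)	/* keep the tail fragment */
			out[m++] = (struct range){ ne, r[i].end, r[i].write };
	}
	out[m++] = (struct range){ nf, ne, write };	/* add the new lock */
	for (i = 0; i < m; i++)
		r[i] = out[i];
	return (m);
}

int
main(void)
{
	struct range r[16] = { { 0, 100, 0 }, { 200, 300, 1 } };
	int i, n = 2;

	n = apply_lock(r, n, 50, 250, 1);	/* overlaps both entries */
	for (i = 0; i < n; i++)
		printf("[%ju,%ju) %s\n", (uintmax_t)r[i].first,
		    (uintmax_t)r[i].end, r[i].write ? "write" : "read");
	return (0);
}
#endif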
/*
* This function handles sequencing of locks, etc.
* It returns an error that indicates what the caller should do.
*/
static int
nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
struct nfsstate *stp, struct nfsrvcache *op)
{
int error = 0;
if ((nd->nd_flag & ND_NFSV41) != 0)
/* NFSv4.1 ignores the open_seqid and lock_seqid. */
goto out;
if (op != nd->nd_rp)
panic("nfsrvstate checkseqid");
if (!(op->rc_flag & RC_INPROG))
panic("nfsrvstate not inprog");
if (stp->ls_op && stp->ls_op->rc_refcnt <= 0) {
printf("refcnt=%d\n", stp->ls_op->rc_refcnt);
panic("nfsrvstate op refcnt");
}
if ((stp->ls_seq + 1) == seqid) {
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
stp->ls_op = op;
nfsrvd_refcache(op);
stp->ls_seq = seqid;
goto out;
} else if (stp->ls_seq == seqid && stp->ls_op &&
op->rc_xid == stp->ls_op->rc_xid &&
op->rc_refcnt == 0 &&
op->rc_reqlen == stp->ls_op->rc_reqlen &&
op->rc_cksum == stp->ls_op->rc_cksum) {
if (stp->ls_op->rc_flag & RC_INPROG) {
error = NFSERR_DONTREPLY;
goto out;
}
nd->nd_rp = stp->ls_op;
nd->nd_rp->rc_flag |= RC_INPROG;
nfsrvd_delcache(op);
error = NFSERR_REPLYFROMCACHE;
goto out;
}
error = NFSERR_BADSEQID;
out:
NFSEXITCODE2(error, nd);
return (error);
}
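/*
 * Illustrative sketch only, excluded from compilation: the NFSv4.0
 * open/lock owner seqid rule that nfsrv_checkseqid() enforces, reduced
 * to its decision table.  check_seqid() and its arguments are
 * hypothetical names; the cache matching details (xid, checksum,
 * request length) are collapsed into the same_cached_request flag.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define	ACCEPT		0	/* new request, advance the stored seqid */
#define	REPLAY		1	/* retransmission, reply from the cache */
#define	BADSEQID	2	/* out of order, reject */

static int
check_seqid(uint32_t stored_seq, uint32_t req_seq, int same_cached_request)
{
	if (req_seq == stored_seq + 1)
		return (ACCEPT);
	if (req_seq == stored_seq && same_cached_request)
		return (REPLAY);
	return (BADSEQID);
}

int
main(void)
{
	printf("%d %d %d\n",
	    check_seqid(7, 8, 0),	/* 0: in order, accept */
	    check_seqid(7, 7, 1),	/* 1: retransmission, replay */
	    check_seqid(7, 9, 0));	/* 2: gap, NFSERR_BADSEQID */
	return (0);
}
#endif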
/*
* Get the client ip address for callbacks. If the strings can't be parsed,
* just set lc_program to 0 to indicate no callbacks are possible.
* (For cases where the address can't be parsed or is 0.0.0.0.0.0, set
* the address to the client's transport address. This won't be used
* for callbacks, but can be printed out by nfsstats for info.)
* Return error if the xdr can't be parsed, 0 otherwise.
*/
APPLESTATIC int
nfsrv_getclientipaddr(struct nfsrv_descript *nd, struct nfsclient *clp)
{
u_int32_t *tl;
u_char *cp, *cp2;
int i, j;
struct sockaddr_in *rad, *sad;
u_char protocol[5], addr[24];
int error = 0, cantparse = 0;
union {
in_addr_t ival;
u_char cval[4];
} ip;
union {
in_port_t sval;
u_char cval[2];
} port;
rad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr_in *);
rad->sin_family = AF_INET;
rad->sin_len = sizeof (struct sockaddr_in);
rad->sin_addr.s_addr = 0;
rad->sin_port = 0;
clp->lc_req.nr_client = NULL;
clp->lc_req.nr_lock = 0;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i >= 3 && i <= 4) {
error = nfsrv_mtostr(nd, protocol, i);
if (error)
goto nfsmout;
if (!strcmp(protocol, "tcp")) {
clp->lc_flags |= LCL_TCPCALLBACK;
clp->lc_req.nr_sotype = SOCK_STREAM;
clp->lc_req.nr_soproto = IPPROTO_TCP;
} else if (!strcmp(protocol, "udp")) {
clp->lc_req.nr_sotype = SOCK_DGRAM;
clp->lc_req.nr_soproto = IPPROTO_UDP;
} else {
cantparse = 1;
}
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
} else if (i == 0) {
cantparse = 1;
} else if (!cantparse && i <= 23 && i >= 11) {
error = nfsrv_mtostr(nd, addr, i);
if (error)
goto nfsmout;
/*
* Parse out the address fields. We expect 6 decimal numbers
* separated by '.'s.
*/
cp = addr;
i = 0;
while (*cp && i < 6) {
cp2 = cp;
while (*cp2 && *cp2 != '.')
cp2++;
if (*cp2)
*cp2++ = '\0';
else if (i != 5) {
cantparse = 1;
break;
}
j = nfsrv_getipnumber(cp);
if (j >= 0) {
if (i < 4)
ip.cval[3 - i] = j;
else
port.cval[5 - i] = j;
} else {
cantparse = 1;
break;
}
cp = cp2;
i++;
}
if (!cantparse) {
if (ip.ival != 0x0) {
rad->sin_addr.s_addr = htonl(ip.ival);
rad->sin_port = htons(port.sval);
} else {
cantparse = 1;
}
}
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
if (cantparse) {
sad = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
if (sad->sin_family == AF_INET) {
rad->sin_addr.s_addr = sad->sin_addr.s_addr;
rad->sin_port = 0x0;
}
clp->lc_program = 0;
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
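/*
 * Illustrative sketch only, excluded from compilation: parsing the six
 * dotted decimal fields of an NFSv4 callback address (h1.h2.h3.h4.p1.p2),
 * where the port is p1 * 256 + p2, in user space with standard libc.
 * parse_uaddr() is a hypothetical name; the code above parses the same
 * format by hand because sscanf() is not available in the kernel.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static int
parse_uaddr(const char *s, uint32_t *ipp, uint16_t *portp)
{
	unsigned int h1, h2, h3, h4, p1, p2;

	if (sscanf(s, "%u.%u.%u.%u.%u.%u", &h1, &h2, &h3, &h4, &p1, &p2) != 6)
		return (-1);
	if (h1 > 255 || h2 > 255 || h3 > 255 || h4 > 255 || p1 > 255 ||
	    p2 > 255)
		return (-1);
	*ipp = (h1 << 24) | (h2 << 16) | (h3 << 8) | h4;	/* host order */
	*portp = (uint16_t)(p1 * 256 + p2);
	return (0);
}

int
main(void)
{
	uint32_t ip;
	uint16_t port;

	if (parse_uaddr("192.0.2.7.8.1", &ip, &port) == 0)
		printf("ip 0x%08x port %u\n", ip, port);	/* port 2049 */
	return (0);
}
#endif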
/*
* Turn a string of up to three decimal digits into a number. Return -1 upon
* error.
*/
static int
nfsrv_getipnumber(u_char *cp)
{
int i = 0, j = 0;
while (*cp) {
if (j > 2 || *cp < '0' || *cp > '9')
return (-1);
i *= 10;
i += (*cp - '0');
cp++;
j++;
}
if (i < 256)
return (i);
return (-1);
}
/*
* This function checks for restart conditions.
*/
static int
nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
nfsv4stateid_t *stateidp, int specialid)
{
int ret = 0;
/*
* First check for a server restart. Open, LockT, ReleaseLockOwner
* and DelegPurge have a clientid, the rest a stateid.
*/
if (flags &
(NFSLCK_OPEN | NFSLCK_TEST | NFSLCK_RELEASE | NFSLCK_DELEGPURGE)) {
if (clientid.lval[0] != nfsrvboottime) {
ret = NFSERR_STALECLIENTID;
goto out;
}
} else if (stateidp->other[0] != nfsrvboottime &&
specialid == 0) {
ret = NFSERR_STALESTATEID;
goto out;
}
/*
* Read, Write, Setattr and LockT can return NFSERR_GRACE and do
* not use a lock/open owner seqid#, so the check can be done now.
* (The others will be checked, as required, later.)
*/
if (!(flags & (NFSLCK_CHECK | NFSLCK_TEST)))
goto out;
NFSLOCKSTATE();
ret = nfsrv_checkgrace(NULL, NULL, flags);
NFSUNLOCKSTATE();
out:
NFSEXITCODE(ret);
return (ret);
}
/*
* Check for grace.
*/
static int
nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
u_int32_t flags)
{
int error = 0;
if ((nfsrv_stablefirst.nsf_flags & NFSNSF_GRACEOVER) != 0) {
if (flags & NFSLCK_RECLAIM) {
error = NFSERR_NOGRACE;
goto out;
}
} else {
if (!(flags & NFSLCK_RECLAIM)) {
error = NFSERR_GRACE;
goto out;
}
if (nd != NULL && clp != NULL &&
(nd->nd_flag & ND_NFSV41) != 0 &&
(clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0) {
error = NFSERR_NOGRACE;
goto out;
}
/*
* If grace is almost over and we are still getting Reclaims,
* extend grace a bit.
*/
if ((NFSD_MONOSEC + NFSRV_LEASEDELTA) >
nfsrv_stablefirst.nsf_eograce)
nfsrv_stablefirst.nsf_eograce = NFSD_MONOSEC +
NFSRV_LEASEDELTA;
}
out:
NFSEXITCODE(error);
return (error);
}
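/*
 * Illustrative sketch only, excluded from compilation: the basic grace
 * period decision that nfsrv_checkgrace() applies, ignoring the NFSv4.1
 * ReclaimComplete case and the grace extension.  check_grace() is a
 * hypothetical name.
 */
#if 0
#include <stdio.h>

#define	OK		0
#define	ERR_NOGRACE	1	/* reclaim attempted after grace ended */
#define	ERR_GRACE	2	/* non-reclaim attempted during grace */

static int
check_grace(int grace_over, int is_reclaim)
{
	if (grace_over)
		return (is_reclaim ? ERR_NOGRACE : OK);
	return (is_reclaim ? OK : ERR_GRACE);
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    check_grace(1, 1),	/* 1: NFSERR_NOGRACE */
	    check_grace(1, 0),	/* 0: allowed */
	    check_grace(0, 1),	/* 0: reclaim allowed during grace */
	    check_grace(0, 0));	/* 2: NFSERR_GRACE */
	return (0);
}
#endif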
/*
* Do a server callback.
*/
static int
nfsrv_docallback(struct nfsclient *clp, int procnum,
nfsv4stateid_t *stateidp, int trunc, fhandle_t *fhp,
struct nfsvattr *nap, nfsattrbit_t *attrbitp, NFSPROC_T *p)
{
mbuf_t m;
u_int32_t *tl;
struct nfsrv_descript nfsd, *nd = &nfsd;
struct ucred *cred;
int error = 0;
u_int32_t callback;
struct nfsdsession *sep = NULL;
cred = newnfs_getcred();
NFSLOCKSTATE(); /* mostly for lc_cbref++ */
if (clp->lc_flags & LCL_NEEDSCONFIRM) {
NFSUNLOCKSTATE();
panic("docallb");
}
clp->lc_cbref++;
/*
* Fill the callback program# and version into the request
* structure for newnfs_connect() to use.
*/
clp->lc_req.nr_prog = clp->lc_program;
#ifdef notnow
if ((clp->lc_flags & LCL_NFSV41) != 0)
clp->lc_req.nr_vers = NFSV41_CBVERS;
else
#endif
clp->lc_req.nr_vers = NFSV4_CBVERS;
/*
* First, fill in some of the fields of nd and cr.
*/
nd->nd_flag = ND_NFSV4;
if (clp->lc_flags & LCL_GSS)
nd->nd_flag |= ND_KERBV;
if ((clp->lc_flags & LCL_NFSV41) != 0)
nd->nd_flag |= ND_NFSV41;
nd->nd_repstat = 0;
cred->cr_uid = clp->lc_uid;
cred->cr_gid = clp->lc_gid;
callback = clp->lc_callback;
NFSUNLOCKSTATE();
cred->cr_ngroups = 1;
/*
* Get the first mbuf for the request.
*/
MGET(m, M_WAITOK, MT_DATA);
mbuf_setlen(m, 0);
nd->nd_mreq = nd->nd_mb = m;
nd->nd_bpos = NFSMTOD(m, caddr_t);
/*
* and build the callback request.
*/
if (procnum == NFSV4OP_CBGETATTR) {
nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBGETATTR,
"CB Getattr", &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
(void)nfsm_fhtom(nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
(void)nfsrv_putattrbit(nd, attrbitp);
} else if (procnum == NFSV4OP_CBRECALL) {
nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBRECALL,
"CB Recall", &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
*tl++ = txdr_unsigned(stateidp->seqid);
NFSBCOPY((caddr_t)stateidp->other, (caddr_t)tl,
NFSX_STATEIDOTHER);
tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
if (trunc)
*tl = newnfs_true;
else
*tl = newnfs_false;
(void)nfsm_fhtom(nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
} else if (procnum == NFSV4PROC_CBNULL) {
nd->nd_procnum = NFSV4PROC_CBNULL;
if ((clp->lc_flags & LCL_NFSV41) != 0) {
error = nfsv4_getcbsession(clp, &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
}
} else {
error = NFSERR_SERVERFAULT;
mbuf_freem(nd->nd_mreq);
goto errout;
}
/*
* Call newnfs_connect(), as required, and then newnfs_request().
*/
(void) newnfs_sndlock(&clp->lc_req.nr_lock);
if (clp->lc_req.nr_client == NULL) {
if ((clp->lc_flags & LCL_NFSV41) != 0)
error = ECONNREFUSED;
else if (nd->nd_procnum == NFSV4PROC_CBNULL)
error = newnfs_connect(NULL, &clp->lc_req, cred,
NULL, 1);
else
error = newnfs_connect(NULL, &clp->lc_req, cred,
NULL, 3);
}
newnfs_sndunlock(&clp->lc_req.nr_lock);
if (!error) {
if ((nd->nd_flag & ND_NFSV41) != 0) {
KASSERT(sep != NULL, ("sep NULL"));
if (sep->sess_cbsess.nfsess_xprt != NULL)
error = newnfs_request(nd, NULL, clp,
&clp->lc_req, NULL, NULL, cred,
clp->lc_program, clp->lc_req.nr_vers, NULL,
1, NULL, &sep->sess_cbsess);
else {
/*
* This should probably never occur, but if a
* client somehow does an RPC without a
* SequenceID Op that causes a callback just
* after the nfsd threads have been terminated
* and restarted, we could conceivably get here
* without a backchannel xprt.
*/
printf("nfsrv_docallback: no xprt\n");
error = ECONNREFUSED;
}
nfsrv_freesession(sep, NULL);
} else
error = newnfs_request(nd, NULL, clp, &clp->lc_req,
NULL, NULL, cred, clp->lc_program,
clp->lc_req.nr_vers, NULL, 1, NULL, NULL);
}
errout:
NFSFREECRED(cred);
/*
* If error is set here, the Callback path isn't working
* properly, so twiddle the appropriate LCL_ flags.
* (nd_repstat != 0 indicates the Callback path is working,
* but the callback failed on the client.)
*/
if (error) {
/*
* Mark the callback pathway down, which disables issuing
* of delegations and gets Renew to return NFSERR_CBPATHDOWN.
*/
NFSLOCKSTATE();
clp->lc_flags |= LCL_CBDOWN;
NFSUNLOCKSTATE();
} else {
/*
* Callback worked. If the callback path was down, disable
* callbacks, so no more delegations will be issued. (This
* is done on the assumption that the callback pathway is
* flakey.)
*/
NFSLOCKSTATE();
if (clp->lc_flags & LCL_CBDOWN)
clp->lc_flags &= ~(LCL_CBDOWN | LCL_CALLBACKSON);
NFSUNLOCKSTATE();
if (nd->nd_repstat)
error = nd->nd_repstat;
else if (error == 0 && procnum == NFSV4OP_CBGETATTR)
error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL,
p, NULL);
mbuf_freem(nd->nd_mrep);
}
NFSLOCKSTATE();
clp->lc_cbref--;
if ((clp->lc_flags & LCL_WAKEUPWANTED) && clp->lc_cbref == 0) {
clp->lc_flags &= ~LCL_WAKEUPWANTED;
wakeup(clp);
}
NFSUNLOCKSTATE();
NFSEXITCODE(error);
return (error);
}
/*
* Set up the compound RPC for the callback.
*/
static int
nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
uint32_t callback, int op, const char *optag, struct nfsdsession **sepp)
{
uint32_t *tl;
int error, len;
len = strlen(optag);
(void)nfsm_strtom(nd, optag, len);
NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
if ((nd->nd_flag & ND_NFSV41) != 0) {
*tl++ = txdr_unsigned(NFSV41_MINORVERSION);
*tl++ = txdr_unsigned(callback);
*tl++ = txdr_unsigned(2);
*tl = txdr_unsigned(NFSV4OP_CBSEQUENCE);
error = nfsv4_setcbsequence(nd, clp, 1, sepp);
if (error != 0)
return (error);
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(op);
} else {
*tl++ = txdr_unsigned(NFSV4_MINORVERSION);
*tl++ = txdr_unsigned(callback);
*tl++ = txdr_unsigned(1);
*tl = txdr_unsigned(op);
}
return (0);
}
/*
* Return the next index# for a clientid. Mostly just increment and return
* the next one, but... if the 32bit unsigned does actually wrap around,
* the server should be rebooted.
* At an average rate of one new client per second, it will wrap around in
* approximately 136 years. (I think the server will have been shut
* down or rebooted before then.)
*/
static u_int32_t
nfsrv_nextclientindex(void)
{
static u_int32_t client_index = 0;
client_index++;
if (client_index != 0)
return (client_index);
printf("%s: out of clientids\n", __func__);
return (client_index);
}
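/*
 * Illustrative sketch only, excluded from compilation: the wrap around
 * arithmetic behind the comment above.  At one new clientid per second a
 * 32 bit counter lasts 2^32 seconds, roughly 136 years.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	double seconds = 4294967296.0;			/* 2^32 */
	double year = 365.25 * 24.0 * 60.0 * 60.0;	/* seconds per year */

	printf("%.1f years\n", seconds / year);		/* ~136.1 years */
	return (0);
}
#endif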
/*
* Return the next index# for a stateid. Mostly just increment and return
* the next one, but... if the 32bit unsigned does actually wrap around
* (will a BSD server stay up that long?), find
* new start and end values.
*/
static u_int32_t
nfsrv_nextstateindex(struct nfsclient *clp)
{
struct nfsstate *stp;
int i;
u_int32_t canuse, min_index, max_index;
if (!(clp->lc_flags & LCL_INDEXNOTOK)) {
clp->lc_stateindex++;
if (clp->lc_stateindex != clp->lc_statemaxindex)
return (clp->lc_stateindex);
}
/*
* Yuck, we've hit the end.
* Look for a new min and max.
*/
min_index = 0;
max_index = 0xffffffff;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if (stp->ls_stateid.other[2] > 0x80000000) {
if (stp->ls_stateid.other[2] < max_index)
max_index = stp->ls_stateid.other[2];
} else {
if (stp->ls_stateid.other[2] > min_index)
min_index = stp->ls_stateid.other[2];
}
}
}
/*
* Yikes, highly unlikely, but I'll handle it anyhow.
*/
if (min_index == 0x80000000 && max_index == 0x80000001) {
canuse = 0;
/*
* Loop around until we find an unused entry. Return that
* and set LCL_INDEXNOTOK, so the search will continue next time.
* (This is one of those rare cases where a goto is the
* cleanest way to code the loop.)
*/
tryagain:
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if (stp->ls_stateid.other[2] == canuse) {
canuse++;
goto tryagain;
}
}
}
clp->lc_flags |= LCL_INDEXNOTOK;
return (canuse);
}
/*
* Ok to start again from min + 1.
*/
clp->lc_stateindex = min_index + 1;
clp->lc_statemaxindex = max_index;
clp->lc_flags &= ~LCL_INDEXNOTOK;
return (clp->lc_stateindex);
}
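/*
 * Illustrative sketch only, excluded from compilation: the "find a new
 * usable window" step of nfsrv_nextstateindex() above, which on wrap
 * around looks for the largest index in use at or below 0x80000000 and
 * the smallest one above it, then hands out values strictly between the
 * two.  find_window() is a hypothetical name.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static void
find_window(const uint32_t *used, int n, uint32_t *minp, uint32_t *maxp)
{
	uint32_t min_index = 0, max_index = 0xffffffff;
	int i;

	for (i = 0; i < n; i++) {
		if (used[i] > 0x80000000) {
			if (used[i] < max_index)
				max_index = used[i];
		} else if (used[i] > min_index)
			min_index = used[i];
	}
	*minp = min_index;
	*maxp = max_index;
}

int
main(void)
{
	uint32_t used[] = { 5, 17, 0x90000000, 0xfffffff0 };
	uint32_t lo, hi;

	find_window(used, 4, &lo, &hi);
	/* New indices can be issued from lo + 1 up to, but not including, hi. */
	printf("window (%lu, %lu)\n", (unsigned long)(lo + 1),
	    (unsigned long)hi);
	return (0);
}
#endif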
/*
* The following functions handle the stable storage file that deals with
* the edge conditions described in RFC3530 Sec. 8.6.3.
* The file is as follows:
* - a single record at the beginning that has the lease time of the
* previous server instance (before the last reboot) and the nfsrvboottime
* values for the previous server boots.
* These previous boot times are used to ensure that the current
* nfsrvboottime does not, somehow, get set to a previous one.
* (This is important so that Stale ClientIDs and StateIDs can
* be recognized.)
* The number of previous nfsrvboottime values precedes the list.
* - followed by some number of appended records with:
* - client id string
* - flag that indicates it is a record revoking state via lease
* expiration or similar
* OR has successfully acquired state.
* These structures vary in length, with the client string at the end, up
* to NFSV4_OPAQUELIMIT in size.
*
* At the end of the grace period, the file is truncated, the first
* record is rewritten with updated information and any acquired state
* records for successful reclaims of state are written.
*
* Subsequent records are appended when the first state is issued to
* a client and when state is revoked for a client.
*
* When reading the file in, state issued records that come later in
* the file override older ones, since the append log is in chronological order.
* If, for some reason, the file can't be read, the grace period is
* immediately terminated and all reclaims get NFSERR_NOGRACE.
*/
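/*
 * Illustrative sketch only, excluded from compilation: a hypothetical
 * layout matching the description above, kept deliberately simple.  The
 * structure and field names here (struct ss_header, struct ss_record)
 * are made up for illustration and are not the ones the nfsd uses; the
 * real records are read with NFSD_RDWR() in nfsrv_setupstable() below,
 * using the same "header size + len - 1" arithmetic for each record.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct ss_header {		/* single record at the start of the file */
	uint32_t	lease;		/* lease time of the previous instance */
	uint32_t	numboots;	/* count of boot times that follow */
	/* followed by numboots boot time values */
};

struct ss_record {		/* appended once per client event */
	uint16_t	len;		/* length of client[] */
	uint8_t		flag;		/* state acquired vs. state revoked */
	uint8_t		client[1];	/* client id string, variable length */
};

int
main(void)
{
	/* On-disk size of a record carrying an 8 byte client id string. */
	printf("%zu bytes\n", sizeof(struct ss_record) + 8 - 1);
	return (0);
}
#endif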
/*
* Read in the stable storage file. Called by nfssvc() before the nfsd
* processes start servicing requests.
*/
APPLESTATIC void
nfsrv_setupstable(NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfsrv_stable *sp, *nsp;
struct nfst_rec *tsp;
int error, i, tryagain;
off_t off = 0;
ssize_t aresid, len;
/*
* If NFSNSF_UPDATEDONE is set, this is a restart of the nfsds without
* a reboot, so state has not been lost.
*/
if (sf->nsf_flags & NFSNSF_UPDATEDONE)
return;
/*
* Mark the grace period as over, until the file reads successfully.
*/
nfsrvboottime = time_second;
LIST_INIT(&sf->nsf_head);
sf->nsf_flags = (NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
sf->nsf_eograce = NFSD_MONOSEC + NFSRV_LEASEDELTA;
if (sf->nsf_fp == NULL)
return;
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), off, UIO_SYSSPACE,
0, NFSFPCRED(sf->nsf_fp), &aresid, p);
if (error || aresid || sf->nsf_numboots == 0 ||
sf->nsf_numboots > NFSNSF_MAXNUMBOOTS)
return;
/*
* Now, read in the boottimes.
*/
sf->nsf_bootvals = (time_t *)malloc((sf->nsf_numboots + 1) *
sizeof (time_t), M_TEMP, M_WAITOK);
off = sizeof (struct nfsf_rec);
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)sf->nsf_bootvals, sf->nsf_numboots * sizeof (time_t), off,
UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
if (error || aresid) {
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
return;
}
/*
* Make sure this nfsrvboottime is different from all recorded
* previous ones.
*/
do {
tryagain = 0;
for (i = 0; i < sf->nsf_numboots; i++) {
if (nfsrvboottime == sf->nsf_bootvals[i]) {
nfsrvboottime++;
tryagain = 1;
break;
}
}
} while (tryagain);
sf->nsf_flags |= NFSNSF_OK;
off += (sf->nsf_numboots * sizeof (time_t));
/*
* Read through the file, building a list of records for grace
* checking.
* Each record is between sizeof (struct nfst_rec) and
* sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1
* and is actually sizeof (struct nfst_rec) + nst_len - 1.
*/
tsp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
NFSV4_OPAQUELIMIT - 1, M_TEMP, M_WAITOK);
do {
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)tsp, sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1,
off, UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
len = (sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1) - aresid;
if (error || (len > 0 && (len < sizeof (struct nfst_rec) ||
len < (sizeof (struct nfst_rec) + tsp->len - 1)))) {
/*
* Yuck, the file has been corrupted, so just return
* after clearing out any restart state, which ends the
* grace period.
*/
LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
LIST_REMOVE(sp, nst_list);
free((caddr_t)sp, M_TEMP);
}
free((caddr_t)tsp, M_TEMP);
sf->nsf_flags &= ~NFSNSF_OK;
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
return;
}
if (len > 0) {
off += sizeof (struct nfst_rec) + tsp->len - 1;
/*
* Search the list for a matching client.
*/
LIST_FOREACH(sp, &sf->nsf_head, nst_list) {
if (tsp->len == sp->nst_len &&
!NFSBCMP(tsp->client, sp->nst_client, tsp->len))
break;
}
if (sp == LIST_END(&sf->nsf_head)) {
sp = (struct nfsrv_stable *)malloc(tsp->len +
sizeof (struct nfsrv_stable) - 1, M_TEMP,
M_WAITOK);
NFSBCOPY((caddr_t)tsp, (caddr_t)&sp->nst_rec,
sizeof (struct nfst_rec) + tsp->len - 1);
LIST_INSERT_HEAD(&sf->nsf_head, sp, nst_list);
} else {
if (tsp->flag == NFSNST_REVOKE)
sp->nst_flag |= NFSNST_REVOKE;
else
/*
* A subsequent timestamp indicates the client
* did a setclientid/confirm and any previous
* revoke is no longer relevant.
*/
sp->nst_flag &= ~NFSNST_REVOKE;
}
}
} while (len > 0);
free((caddr_t)tsp, M_TEMP);
sf->nsf_flags = NFSNSF_OK;
sf->nsf_eograce = NFSD_MONOSEC + sf->nsf_lease +
NFSRV_LEASEDELTA;
}
/*
* Update the stable storage file, now that the grace period is over.
*/
APPLESTATIC void
nfsrv_updatestable(NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfsrv_stable *sp, *nsp;
int i;
struct nfsvattr nva;
vnode_t vp;
#if defined(__FreeBSD_version) && (__FreeBSD_version >= 500000)
mount_t mp = NULL;
#endif
int error;
if (sf->nsf_fp == NULL || (sf->nsf_flags & NFSNSF_UPDATEDONE))
return;
sf->nsf_flags |= NFSNSF_UPDATEDONE;
/*
* Ok, we need to rewrite the stable storage file.
* - truncate to 0 length
* - write the new first structure
* - loop through the data structures, writing out any that
* have timestamps older than the old boot
*/
if (sf->nsf_bootvals) {
sf->nsf_numboots++;
for (i = sf->nsf_numboots - 2; i >= 0; i--)
sf->nsf_bootvals[i + 1] = sf->nsf_bootvals[i];
} else {
sf->nsf_numboots = 1;
sf->nsf_bootvals = (time_t *)malloc(sizeof (time_t),
M_TEMP, M_WAITOK);
}
sf->nsf_bootvals[0] = nfsrvboottime;
sf->nsf_lease = nfsrv_lease;
NFSVNO_ATTRINIT(&nva);
NFSVNO_SETATTRVAL(&nva, size, 0);
vp = NFSFPVNODE(sf->nsf_fp);
vn_start_write(vp, &mp, V_WAIT);
if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
error = nfsvno_setattr(vp, &nva, NFSFPCRED(sf->nsf_fp), p,
NULL);
NFSVOPUNLOCK(vp, 0);
} else
error = EPERM;
vn_finished_write(mp);
if (!error)
error = NFSD_RDWR(UIO_WRITE, vp,
(caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), (off_t)0,
UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
if (!error)
error = NFSD_RDWR(UIO_WRITE, vp,
(caddr_t)sf->nsf_bootvals,
sf->nsf_numboots * sizeof (time_t),
(off_t)(sizeof (struct nfsf_rec)),
UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
if (error) {
sf->nsf_flags &= ~NFSNSF_OK;
printf("EEK! Can't write NfsV4 stable storage file\n");
return;
}
sf->nsf_flags |= NFSNSF_OK;
/*
* Loop through the list and write out timestamp records for
* any clients that successfully reclaimed state.
*/
LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
if (sp->nst_flag & NFSNST_GOTSTATE) {
nfsrv_writestable(sp->nst_client, sp->nst_len,
NFSNST_NEWSTATE, p);
sp->nst_clp->lc_flags |= LCL_STAMPEDSTABLE;
}
LIST_REMOVE(sp, nst_list);
free((caddr_t)sp, M_TEMP);
}
nfsrv_backupstable();
}
/*
* Append a record to the stable storage file.
*/
APPLESTATIC void
nfsrv_writestable(u_char *client, int len, int flag, NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfst_rec *sp;
int error;
if (!(sf->nsf_flags & NFSNSF_OK) || sf->nsf_fp == NULL)
return;
sp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
len - 1, M_TEMP, M_WAITOK);
sp->len = len;
NFSBCOPY(client, sp->client, len);
sp->flag = flag;
error = NFSD_RDWR(UIO_WRITE, NFSFPVNODE(sf->nsf_fp),
(caddr_t)sp, sizeof (struct nfst_rec) + len - 1, (off_t)0,
UIO_SYSSPACE, (IO_SYNC | IO_APPEND), NFSFPCRED(sf->nsf_fp), NULL, p);
free((caddr_t)sp, M_TEMP);
if (error) {
sf->nsf_flags &= ~NFSNSF_OK;
printf("EEK! Can't write NfsV4 stable storage file\n");
}
}
/*
* This function is called during the grace period to mark a client
* that successfully reclaimed state.
*/
static void
nfsrv_markstable(struct nfsclient *clp)
{
struct nfsrv_stable *sp;
/*
* First find the client structure.
*/
LIST_FOREACH(sp, &nfsrv_stablefirst.nsf_head, nst_list) {
if (sp->nst_len == clp->lc_idlen &&
!NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
break;
}
if (sp == LIST_END(&nfsrv_stablefirst.nsf_head))
return;
/*
* Now, just mark it and set the nfsclient back pointer.
*/
sp->nst_flag |= NFSNST_GOTSTATE;
sp->nst_clp = clp;
}
/*
* This function is called for a reclaim, to see if it gets grace.
* It returns 0 if a reclaim is allowed, 1 otherwise.
*/
static int
nfsrv_checkstable(struct nfsclient *clp)
{
struct nfsrv_stable *sp;
/*
* First, find the entry for the client.
*/
LIST_FOREACH(sp, &nfsrv_stablefirst.nsf_head, nst_list) {
if (sp->nst_len == clp->lc_idlen &&
!NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
break;
}
/*
* If the client is not in the list, its state was revoked or no state was
* issued since the previous reboot, so the reclaim is denied.
*/
if (sp == LIST_END(&nfsrv_stablefirst.nsf_head) ||
(sp->nst_flag & NFSNST_REVOKE) ||
!(nfsrv_stablefirst.nsf_flags & NFSNSF_OK))
return (1);
return (0);
}
/*
* Test for and try to clear out a conflicting client. This is called by
* nfsrv_lockctrl() and nfsrv_openctrl() when conflicts with other clients
* are found.
* The trick here is that it can't revoke a conflicting client with an
* expired lease unless it holds the v4root lock, so...
* If no v4root lock, get the lock and return 1 to indicate "try again".
* Return 0 to indicate the conflict can't be revoked and 1 to indicate
* the revocation worked and the conflicting client is "bye, bye", so it
* can be tried again.
* Return 2 to indicate that the vnode is VI_DOOMED after NFSVOPLOCK().
* Unlocks State before a non-zero value is returned.
*/
static int
nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, vnode_t vp,
NFSPROC_T *p)
{
int gotlock, lktype = 0;
/*
* If lease hasn't expired, we can't fix it.
*/
if (clp->lc_expiry >= NFSD_MONOSEC ||
!(nfsrv_stablefirst.nsf_flags & NFSNSF_UPDATEDONE))
return (0);
if (*haslockp == 0) {
NFSUNLOCKSTATE();
if (vp != NULL) {
lktype = NFSVOPISLOCKED(vp);
NFSVOPUNLOCK(vp, 0);
}
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!gotlock);
NFSUNLOCKV4ROOTMUTEX();
*haslockp = 1;
if (vp != NULL) {
NFSVOPLOCK(vp, lktype | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0)
return (2);
}
return (1);
}
NFSUNLOCKSTATE();
/*
* Ok, we can expire the conflicting client.
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
nfsrv_zapclient(clp, p);
return (1);
}
/*
* Resolve a delegation conflict.
* Returns 0 to indicate the conflict was resolved without sleeping.
* Return -1 to indicate that the caller should check for conflicts again.
* Return > 0 for an error that should be returned, normally NFSERR_DELAY.
*
* Also, manipulate the nfsv4root_lock, as required. It isn't changed
* for a return of 0, since there was no sleep and it could be required
* later. It is released for a return of NFSERR_DELAY, since the caller
* will return that error. It is released when a sleep was done waiting
* for the delegation to be returned or expire (so that other nfsds can
* handle ops). Then, it must be acquired for the write to stable storage.
* (This function is somewhat similar to nfsrv_clientconflict(), but
* the semantics differ in a couple of subtle ways. The return of 0
* indicates the conflict was resolved without sleeping here, not
* that the conflict can't be resolved and the handling of nfsv4root_lock
* differs, as noted above.)
* Unlocks State before returning a non-zero value.
*/
static int
nfsrv_delegconflict(struct nfsstate *stp, int *haslockp, NFSPROC_T *p,
vnode_t vp)
{
struct nfsclient *clp = stp->ls_clp;
int gotlock, error, lktype = 0, retrycnt, zapped_clp;
nfsv4stateid_t tstateid;
fhandle_t tfh;
/*
* If the conflict is with an old delegation...
*/
if (stp->ls_flags & NFSLCK_OLDDELEG) {
/*
* You can delete it, if it has expired.
*/
if (clp->lc_delegtime < NFSD_MONOSEC) {
nfsrv_freedeleg(stp);
NFSUNLOCKSTATE();
error = -1;
goto out;
}
NFSUNLOCKSTATE();
/*
* During this delay, the old delegation could expire or it
* could be recovered by the client via an Open with
* CLAIM_DELEGATE_PREV.
* Release the nfsv4root_lock, if held.
*/
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_DELAY;
goto out;
}
/*
* It's a current delegation, so:
* - check to see if the delegation has expired
* - if so, get the v4root lock and then expire it
*/
if (!(stp->ls_flags & NFSLCK_DELEGRECALL)) {
/*
* - do a recall callback, since not yet done
* For now, never allow truncate to be set. To use
* truncate safely, it must be guaranteed that the
* Remove, Rename or Setattr with size of 0 will
* succeed and that would require major changes to
* the VFS/Vnode OPs.
* Set the expiry time large enough so that it won't expire
* until after the callback, then set it correctly, once
* the callback is done. (The delegation will now time
* out whether or not the Recall worked ok. The timeout
* will be extended when ops are done on the delegation
* stateid, up to the timelimit.)
*/
stp->ls_delegtime = NFSD_MONOSEC + (2 * nfsrv_lease) +
NFSRV_LEASEDELTA;
stp->ls_delegtimelimit = NFSD_MONOSEC + (6 * nfsrv_lease) +
NFSRV_LEASEDELTA;
stp->ls_flags |= NFSLCK_DELEGRECALL;
/*
* Loop NFSRV_CBRETRYCNT times while the CBRecall replies
* NFSERR_BADSTATEID or NFSERR_BADHANDLE. This is done
* in order to try and avoid a race that could happen
* when a CBRecall request passed the Open reply with
* the delegation in it while transiting the network.
* Since nfsrv_docallback will sleep, don't use stp after
* the call.
*/
NFSBCOPY((caddr_t)&stp->ls_stateid, (caddr_t)&tstateid,
sizeof (tstateid));
NFSBCOPY((caddr_t)&stp->ls_lfp->lf_fh, (caddr_t)&tfh,
sizeof (tfh));
NFSUNLOCKSTATE();
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
retrycnt = 0;
do {
error = nfsrv_docallback(clp, NFSV4OP_CBRECALL,
&tstateid, 0, &tfh, NULL, NULL, p);
retrycnt++;
} while ((error == NFSERR_BADSTATEID ||
error == NFSERR_BADHANDLE) && retrycnt < NFSV4_CBRETRYCNT);
error = NFSERR_DELAY;
goto out;
}
if (clp->lc_expiry >= NFSD_MONOSEC &&
stp->ls_delegtime >= NFSD_MONOSEC) {
NFSUNLOCKSTATE();
/*
* A recall has been done, but it has not yet expired.
* So, RETURN_DELAY.
*/
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_DELAY;
goto out;
}
/*
* If we don't yet have the lock, just get it and then return,
* since we need that before deleting expired state, such as
* this delegation.
* When getting the lock, unlock the vnode, so other nfsds that
* are in progress won't get stuck waiting for the vnode lock.
*/
if (*haslockp == 0) {
NFSUNLOCKSTATE();
if (vp != NULL) {
lktype = NFSVOPISLOCKED(vp);
NFSVOPUNLOCK(vp, 0);
}
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!gotlock);
NFSUNLOCKV4ROOTMUTEX();
*haslockp = 1;
if (vp != NULL) {
NFSVOPLOCK(vp, lktype | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_PERM;
goto out;
}
}
error = -1;
goto out;
}
NFSUNLOCKSTATE();
/*
* Ok, we can delete the expired delegation.
* First, write the Revoke record to stable storage and then
* clear out the conflict.
* Since all other nfsd threads are now blocked, we can safely
* sleep without the state changing.
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
if (clp->lc_expiry < NFSD_MONOSEC) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
zapped_clp = 1;
} else {
nfsrv_freedeleg(stp);
zapped_clp = 0;
}
if (zapped_clp)
nfsrv_zapclient(clp, p);
error = -1;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Check whether a remove is allowed, if remove is set to 1, and get rid
* of delegations.
*/
APPLESTATIC int
nfsrv_checkremove(vnode_t vp, int remove, NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslockfile *lfp;
int error, haslock = 0;
fhandle_t nfh;
/*
* First, get the lock file structure.
* (A return of -1 means no associated state, so remove ok.)
*/
error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
tryagain:
NFSLOCKSTATE();
if (!error)
error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
if (error == -1)
error = 0;
goto out;
}
/*
* Now, we must Recall any delegations.
*/
error = nfsrv_cleandeleg(vp, lfp, NULL, &haslock, p);
if (error) {
/*
* nfsrv_cleandeleg() unlocks state for non-zero
* return.
*/
if (error == -1)
goto tryagain;
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Now, look for a conflicting open share.
*/
if (remove) {
/*
* If the entry in the directory was the last reference to the
* corresponding filesystem object, the object can be destroyed.
*/
if (lfp->lf_usecount > 1)
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (stp->ls_flags & NFSLCK_WRITEDENY) {
error = NFSERR_FILEOPEN;
break;
}
}
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Clear out all delegations for the file referred to by lfp.
* May return NFSERR_DELAY, if there will be a delay waiting for
* delegations to expire.
* Returns -1 to indicate it slept while recalling a delegation.
* This function has the side effect of deleting the nfslockfile structure,
* if it no longer has associated state and didn't have to sleep.
* Unlocks State before a non-zero value is returned.
*/
static int
nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
struct nfsclient *clp, int *haslockp, NFSPROC_T *p)
{
struct nfsstate *stp, *nstp;
int ret = 0;
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if (stp->ls_clp != clp) {
ret = nfsrv_delegconflict(stp, haslockp, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
goto out;
}
}
stp = nstp;
}
out:
NFSEXITCODE(ret);
return (ret);
}
/*
* There are certain operations that, when being done outside of NFSv4,
* require that any NFSv4 delegation for the file be recalled.
* This function is to be called for those cases:
* VOP_RENAME() - When a delegation is being recalled for any reason,
* the client may have to do Opens against the server, using the file's
* final component name. If the file has been renamed on the server,
* that component name will be incorrect and the Open will fail.
* VOP_REMOVE() - Theoretically, a client could Open a file after it has
* been removed on the server, if there is a delegation issued to
* that client for the file. I say "theoretically" since clients
* normally do an Access Op before the Open and that Access Op will
* fail with ESTALE. Note that NFSv2 and 3 don't even do Opens, so
* they will detect the file's removal in the same manner. (There is
* one case where RFC3530 allows a client to do an Open without first
* doing an Access Op, which is passage of a check against the ACE
* returned with a Write delegation, but current practice is to ignore
* the ACE and always do an Access Op.)
* Since the functions can only be called with an unlocked vnode, this
* can't be done at this time.
* VOP_ADVLOCK() - When a client holds a delegation, it can issue byte range
* locks locally in the client, which are not visible to the server. To
* deal with this, issuing of delegations for a vnode must be disabled
* and all delegations for the vnode recalled. This is done via the
* second function, using the VV_DISABLEDELEG vflag on the vnode.
*/
APPLESTATIC void
nfsd_recalldelegation(vnode_t vp, NFSPROC_T *p)
{
time_t starttime;
int error;
/*
* First, check to see if the server is currently running, that this is
* a regular file and that delegations are being issued.
*/
if (newnfs_numnfsd == 0 || vp->v_type != VREG ||
nfsrv_issuedelegs == 0)
return;
KASSERT((NFSVOPISLOCKED(vp) != LK_EXCLUSIVE), ("vp %p is locked", vp));
/*
* Now, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock cannot be acquired by another thread.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
/*
* Now, call nfsrv_checkremove() in a loop while it returns
* NFSERR_DELAY. Return upon any other error or when timed out.
*/
starttime = NFSD_MONOSEC;
do {
if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(vp, 0, p);
NFSVOPUNLOCK(vp, 0);
} else
error = EPERM;
if (error == NFSERR_DELAY) {
if (NFSD_MONOSEC - starttime > NFS_REMOVETIMEO)
break;
/* Sleep for a short period of time */
(void) nfs_catnap(PZERO, 0, "nfsremove");
}
} while (error == NFSERR_DELAY);
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
APPLESTATIC void
nfsd_disabledelegation(vnode_t vp, NFSPROC_T *p)
{
#ifdef VV_DISABLEDELEG
/*
* First, flag issuance of delegations disabled.
*/
atomic_set_long(&vp->v_vflag, VV_DISABLEDELEG);
#endif
/*
* Then call nfsd_recalldelegation() to get rid of all extant
* delegations.
*/
nfsd_recalldelegation(vp, p);
}
/*
* Check for conflicting locks, etc. and then get rid of delegations.
* (At one point I thought that I should get rid of delegations for any
* Setattr, since it could potentially disallow the I/O op (read or write)
* allowed by the delegation. However, Setattr Ops that aren't changing
* the size get a stateid of all 0s, so you can't tell if it is a delegation
* for the same client or a different one, so I decided to only get rid
* of delegations for other clients when the size is being changed.)
* In general, a Setattr can disable NFS I/O Ops that are outstanding, such
* as Write backs, even if there is no delegation, so it really isn't any
* different.
*/
APPLESTATIC int
nfsrv_checksetattr(vnode_t vp, struct nfsrv_descript *nd,
nfsv4stateid_t *stateidp, struct nfsvattr *nvap, nfsattrbit_t *attrbitp,
struct nfsexstuff *exp, NFSPROC_T *p)
{
struct nfsstate st, *stp = &st;
struct nfslock lo, *lop = &lo;
int error = 0;
nfsquad_t clientid;
if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE)) {
stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS);
lop->lo_first = nvap->na_size;
} else {
stp->ls_flags = 0;
lop->lo_first = 0;
}
if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNER) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNERGROUP) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_MODE) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_ACL))
stp->ls_flags |= NFSLCK_SETATTR;
if (stp->ls_flags == 0)
goto out;
lop->lo_end = NFS64BITSSET;
lop->lo_flags = NFSLCK_WRITE;
stp->ls_ownerlen = 0;
stp->ls_op = NULL;
stp->ls_uid = nd->nd_cred->cr_uid;
stp->ls_stateid.seqid = stateidp->seqid;
clientid.lval[0] = stp->ls_stateid.other[0] = stateidp->other[0];
clientid.lval[1] = stp->ls_stateid.other[1] = stateidp->other[1];
stp->ls_stateid.other[2] = stateidp->other[2];
error = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid,
stateidp, exp, nd, p);
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check for a write delegation and do a CBGETATTR if there is one, updating
* the attributes, as required.
* Should I return an error if I can't get the attributes? (For now, I'll
* just return ok.)
*/
APPLESTATIC int
nfsrv_checkgetattr(struct nfsrv_descript *nd, vnode_t vp,
struct nfsvattr *nvap, nfsattrbit_t *attrbitp, struct ucred *cred,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslockfile *lfp;
struct nfsclient *clp;
struct nfsvattr nva;
fhandle_t nfh;
int error = 0;
nfsattrbit_t cbbits;
u_quad_t delegfilerev;
NFSCBGETATTR_ATTRBIT(attrbitp, &cbbits);
if (!NFSNONZERO_ATTRBIT(&cbbits))
goto out;
if (nfsrv_writedelegcnt == 0)
goto out;
/*
* Get the lock file structure.
* (A return of -1 means no associated state, so return ok.)
*/
error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
NFSLOCKSTATE();
if (!error)
error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
if (error) {
NFSUNLOCKSTATE();
if (error == -1)
error = 0;
goto out;
}
/*
* Now, look for a write delegation.
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (stp->ls_flags & NFSLCK_DELEGWRITE)
break;
}
if (stp == LIST_END(&lfp->lf_deleg)) {
NFSUNLOCKSTATE();
goto out;
}
clp = stp->ls_clp;
delegfilerev = stp->ls_filerev;
/*
* If the Write delegation was issued as a part of this Compound RPC
* or if we have an Implied Clientid (used in a previous Op in this
* compound) and it is the client the delegation was issued to,
* just return ok.
* I also assume that it is from the same client iff the network
* host IP address is the same as the callback address. (Not
* exactly correct by the RFC, but avoids a lot of Getattr
* callbacks.)
*/
if (nd->nd_compref == stp->ls_compref ||
((nd->nd_flag & ND_IMPLIEDCLID) &&
clp->lc_clientid.qval == nd->nd_clientid.qval) ||
nfsaddr2_match(clp->lc_req.nr_nam, nd->nd_nam)) {
NFSUNLOCKSTATE();
goto out;
}
/*
* We are now done with the delegation state structure,
* so the statelock can be released and we can now tsleep().
*/
/*
* Now, we must do the CB Getattr callback, to see if Change or Size
* has changed.
*/
if (clp->lc_expiry >= NFSD_MONOSEC) {
NFSUNLOCKSTATE();
NFSVNO_ATTRINIT(&nva);
nva.na_filerev = NFS64BITSSET;
error = nfsrv_docallback(clp, NFSV4OP_CBGETATTR, NULL,
0, &nfh, &nva, &cbbits, p);
if (!error) {
if ((nva.na_filerev != NFS64BITSSET &&
nva.na_filerev > delegfilerev) ||
(NFSVNO_ISSETSIZE(&nva) &&
nva.na_size != nvap->na_size)) {
error = nfsvno_updfilerev(vp, nvap, cred, p);
if (NFSVNO_ISSETSIZE(&nva))
nvap->na_size = nva.na_size;
}
} else
error = 0; /* Ignore callback errors for now. */
} else {
NFSUNLOCKSTATE();
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* This function looks for openowners that haven't had any opens for
* a while and throws them away. Called by an nfsd when NFSNSF_NOOPENS
* is set.
*/
APPLESTATIC void
nfsrv_throwawayopens(NFSPROC_T *p)
{
struct nfsclient *clp, *nclp;
struct nfsstate *stp, *nstp;
int i;
NFSLOCKSTATE();
nfsrv_stablefirst.nsf_flags &= ~NFSNSF_NOOPENS;
/*
* For each client...
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH_SAFE(clp, &nfsclienthash[i], lc_hash, nclp) {
LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp) {
if (LIST_EMPTY(&stp->ls_open) &&
(stp->ls_noopens > NFSNOOPEN ||
(nfsrv_openpluslock * 2) >
nfsrv_v4statelimit))
nfsrv_freeopenowner(stp, 0, p);
}
}
}
NFSUNLOCKSTATE();
}
/*
* This function checks to see if the credentials are the same.
* Returns 1 for not same, 0 otherwise.
*/
static int
nfsrv_notsamecredname(struct nfsrv_descript *nd, struct nfsclient *clp)
{
if (nd->nd_flag & ND_GSS) {
if (!(clp->lc_flags & LCL_GSS))
return (1);
if (clp->lc_flags & LCL_NAME) {
if (nd->nd_princlen != clp->lc_namelen ||
NFSBCMP(nd->nd_principal, clp->lc_name,
clp->lc_namelen))
return (1);
else
return (0);
}
if (nd->nd_cred->cr_uid == clp->lc_uid)
return (0);
else
return (1);
} else if (clp->lc_flags & LCL_GSS)
return (1);
/*
* For AUTH_SYS, allow the same uid or root. (This is underspecified
* in RFC3530, which talks about principals, but doesn't say anything
* about uids for AUTH_SYS.)
*/
if (nd->nd_cred->cr_uid == clp->lc_uid || nd->nd_cred->cr_uid == 0)
return (0);
else
return (1);
}
/*
* Calculate the lease expiry time.
*/
static time_t
nfsrv_leaseexpiry(void)
{
if (nfsrv_stablefirst.nsf_eograce > NFSD_MONOSEC)
return (NFSD_MONOSEC + 2 * (nfsrv_lease + NFSRV_LEASEDELTA));
return (NFSD_MONOSEC + nfsrv_lease + NFSRV_LEASEDELTA);
}
/*
* Delay the delegation timeout as far as ls_delegtimelimit, as required.
*/
static void
nfsrv_delaydelegtimeout(struct nfsstate *stp)
{
if ((stp->ls_flags & NFSLCK_DELEGRECALL) == 0)
return;
if ((stp->ls_delegtime + 15) > NFSD_MONOSEC &&
stp->ls_delegtime < stp->ls_delegtimelimit) {
stp->ls_delegtime += nfsrv_lease;
if (stp->ls_delegtime > stp->ls_delegtimelimit)
stp->ls_delegtime = stp->ls_delegtimelimit;
}
}
/*
* This function checks to see if there is any other state associated
* with the openowner for this Open.
* It returns 1 if there is no other state, 0 otherwise.
*/
static int
nfsrv_nootherstate(struct nfsstate *stp)
{
struct nfsstate *tstp;
LIST_FOREACH(tstp, &stp->ls_openowner->ls_open, ls_list) {
if (tstp != stp || !LIST_EMPTY(&tstp->ls_lock))
return (0);
}
return (1);
}
/*
* Create a list of lock deltas (changes to local byte range locking
* that can be rolled back using the list) and apply the changes via
* nfsvno_advlock(). Optionally, lock the list. It is expected that either
* the rollback or update function will be called after this.
* It returns an error (and rolls back, as required), if any nfsvno_advlock()
* call fails. If it returns an error, it will unlock the list.
*/
static int
nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
{
struct nfslock *lop, *nlop;
int error = 0;
/* Loop through the list of locks. */
lop = LIST_FIRST(&lfp->lf_locallock);
while (first < end && lop != NULL) {
nlop = LIST_NEXT(lop, lo_lckowner);
if (first >= lop->lo_end) {
/* not there yet */
lop = nlop;
} else if (first < lop->lo_first) {
/* new one starts before entry in list */
if (end <= lop->lo_first) {
/* no overlap between old and new */
error = nfsrv_dolocal(vp, lfp, flags,
NFSLCK_UNLOCK, first, end, cfp, p);
if (error != 0)
break;
first = end;
} else {
/* handle fragment overlapped with new one */
error = nfsrv_dolocal(vp, lfp, flags,
NFSLCK_UNLOCK, first, lop->lo_first, cfp,
p);
if (error != 0)
break;
first = lop->lo_first;
}
} else {
/* new one overlaps this entry in list */
if (end <= lop->lo_end) {
/* overlaps all of new one */
error = nfsrv_dolocal(vp, lfp, flags,
lop->lo_flags, first, end, cfp, p);
if (error != 0)
break;
first = end;
} else {
/* handle fragment overlapped with new one */
error = nfsrv_dolocal(vp, lfp, flags,
lop->lo_flags, first, lop->lo_end, cfp, p);
if (error != 0)
break;
first = lop->lo_end;
lop = nlop;
}
}
}
if (first < end && error == 0)
/* handle fragment past end of list */
error = nfsrv_dolocal(vp, lfp, flags, NFSLCK_UNLOCK, first,
end, cfp, p);
NFSEXITCODE(error);
return (error);
}
/*
* Local lock unlock. Unlock all byte ranges that are no longer locked
* by NFSv4. To do this, unlock any subranges of first-->end that
* do not overlap with the byte ranges of any lock in the lfp->lf_lock
* list. This list has all locks for the file held by other
* <clientid, lockowner> tuples. The list is ordered by increasing
* lo_first value, but may have entries that overlap each other, for
* the case of read locks.
*/
static void
nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp, uint64_t init_first,
uint64_t init_end, NFSPROC_T *p)
{
struct nfslock *lop;
uint64_t first, end, prevfirst;
first = init_first;
end = init_end;
while (first < init_end) {
/* Loop through all nfs locks, adjusting first and end */
prevfirst = 0;
LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
KASSERT(prevfirst <= lop->lo_first,
("nfsv4 locks out of order"));
KASSERT(lop->lo_first < lop->lo_end,
("nfsv4 bogus lock"));
prevfirst = lop->lo_first;
if (first >= lop->lo_first &&
first < lop->lo_end)
/*
* Overlaps with initial part, so trim
* off that initial part by moving first past
* it.
*/
first = lop->lo_end;
else if (end > lop->lo_first &&
lop->lo_first > first) {
/*
* This lock defines the end of the
* segment to unlock, so set end to the
* start of it and break out of the loop.
*/
end = lop->lo_first;
break;
}
if (first >= end)
/*
* There is no segment left to do, so
* break out of this loop and then exit
* the outer while() since first will be set
* to end, which must equal init_end here.
*/
break;
}
if (first < end) {
/* Unlock this segment */
(void) nfsrv_dolocal(vp, lfp, NFSLCK_UNLOCK,
NFSLCK_READ, first, end, NULL, p);
nfsrv_locallock_commit(lfp, NFSLCK_UNLOCK,
first, end);
}
/*
* Now move past this segment and look for any further
* segment in the range, if there is one.
*/
first = end;
end = init_end;
}
}
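/*
 * Illustrative sketch only, excluded from compilation: the gap finding
 * pass that nfsrv_localunlock() performs, i.e. walking a list of byte
 * ranges ordered by start offset (which may overlap, for read locks) and
 * unlocking only the pieces of [first, end) not covered by any of them.
 * The names below (struct lockrange, unlock_gaps()) are hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct lockrange { uint64_t first, end; };

static void
unlock_gaps(const struct lockrange *l, int n, uint64_t first, uint64_t end)
{
	uint64_t segend;
	int i;

	while (first < end) {
		segend = end;
		for (i = 0; i < n; i++) {
			if (first >= l[i].first && first < l[i].end) {
				/* Covered here; skip past this lock. */
				first = l[i].end;
			} else if (l[i].first > first && l[i].first < segend) {
				/* The next lock bounds the current gap. */
				segend = l[i].first;
				break;
			}
			if (first >= segend)
				break;
		}
		if (first < segend)
			printf("unlock [%ju,%ju)\n", (uintmax_t)first,
			    (uintmax_t)segend);
		first = segend;
	}
}

int
main(void)
{
	struct lockrange held[] = { { 10, 20 }, { 15, 30 }, { 50, 60 } };

	unlock_gaps(held, 3, 0, 100);	/* prints [0,10), [30,50), [60,100) */
	return (0);
}
#endif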
/*
* Do the local lock operation and update the rollback list, as required.
* Perform the rollback and return the error if nfsvno_advlock() fails.
*/
static int
nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags, int oldflags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
{
struct nfsrollback *rlp;
int error = 0, ltype, oldltype;
if (flags & NFSLCK_WRITE)
ltype = F_WRLCK;
else if (flags & NFSLCK_READ)
ltype = F_RDLCK;
else
ltype = F_UNLCK;
if (oldflags & NFSLCK_WRITE)
oldltype = F_WRLCK;
else if (oldflags & NFSLCK_READ)
oldltype = F_RDLCK;
else
oldltype = F_UNLCK;
if (ltype == oldltype || (oldltype == F_WRLCK && ltype == F_RDLCK))
/* nothing to do */
goto out;
error = nfsvno_advlock(vp, ltype, first, end, p);
if (error != 0) {
if (cfp != NULL) {
cfp->cl_clientid.lval[0] = 0;
cfp->cl_clientid.lval[1] = 0;
cfp->cl_first = 0;
cfp->cl_end = NFS64BITSSET;
cfp->cl_flags = NFSLCK_WRITE;
cfp->cl_ownerlen = 5;
NFSBCOPY("LOCAL", cfp->cl_owner, 5);
}
nfsrv_locallock_rollback(vp, lfp, p);
} else if (ltype != F_UNLCK) {
rlp = malloc(sizeof (struct nfsrollback), M_NFSDROLLBACK,
M_WAITOK);
rlp->rlck_first = first;
rlp->rlck_end = end;
rlp->rlck_type = oldltype;
LIST_INSERT_HEAD(&lfp->lf_rollback, rlp, rlck_list);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Roll back local lock changes and free up the rollback list.
*/
static void
nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp, NFSPROC_T *p)
{
struct nfsrollback *rlp, *nrlp;
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp) {
(void) nfsvno_advlock(vp, rlp->rlck_type, rlp->rlck_first,
rlp->rlck_end, p);
free(rlp, M_NFSDROLLBACK);
}
LIST_INIT(&lfp->lf_rollback);
}
/*
* Update the local lock list and delete the rollback list (i.e., the changes
* are now committed to the local locks).  Most of the work is done by the
* internal function nfsrv_updatelock().
*/
static void
nfsrv_locallock_commit(struct nfslockfile *lfp, int flags, uint64_t first,
uint64_t end)
{
struct nfsrollback *rlp, *nrlp;
struct nfslock *new_lop, *other_lop;
new_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK, M_WAITOK);
if (flags & (NFSLCK_READ | NFSLCK_WRITE))
other_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK,
M_WAITOK);
else
other_lop = NULL;
new_lop->lo_flags = flags;
new_lop->lo_first = first;
new_lop->lo_end = end;
nfsrv_updatelock(NULL, &new_lop, &other_lop, lfp);
if (new_lop != NULL)
free(new_lop, M_NFSDLOCK);
if (other_lop != NULL)
free(other_lop, M_NFSDLOCK);
/* and get rid of the rollback list */
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp)
free(rlp, M_NFSDROLLBACK);
LIST_INIT(&lfp->lf_rollback);
}
/*
* Lock the struct nfslockfile for local lock updating.
*/
static void
nfsrv_locklf(struct nfslockfile *lfp)
{
int gotlock;
/* lf_usecount ensures *lfp won't be free'd */
lfp->lf_usecount++;
do {
gotlock = nfsv4_lock(&lfp->lf_locallock_lck, 1, NULL,
NFSSTATEMUTEXPTR, NULL);
} while (gotlock == 0);
lfp->lf_usecount--;
}
/*
* Unlock the struct nfslockfile after local lock updating.
*/
static void
nfsrv_unlocklf(struct nfslockfile *lfp)
{
nfsv4_unlock(&lfp->lf_locallock_lck, 0);
}
/*
* Clear out all state for the NFSv4 server.
* Must be called by a thread that can sleep when no nfsds are running.
*/
void
nfsrv_throwawayallstate(NFSPROC_T *p)
{
struct nfsclient *clp, *nclp;
struct nfslockfile *lfp, *nlfp;
int i;
/*
* For each client, clean out the state and then free the structure.
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH_SAFE(clp, &nfsclienthash[i], lc_hash, nclp) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
free(clp->lc_stateid, M_NFSDCLIENT);
free(clp, M_NFSDCLIENT);
}
}
/*
* Also, free up any remaining lock file structures.
*/
for (i = 0; i < nfsrv_lockhashsize; i++) {
LIST_FOREACH_SAFE(lfp, &nfslockhash[i], lf_hash, nlfp) {
printf("nfsd unload: fnd a lock file struct\n");
nfsrv_freenfslockfile(lfp);
}
}
}
/*
* Check the sequence# for the session and slot provided as an argument.
* Also, renew the lease if the session will return NFS_OK.
*/
int
nfsrv_checksequence(struct nfsrv_descript *nd, uint32_t sequenceid,
uint32_t *highest_slotidp, uint32_t *target_highest_slotidp, int cache_this,
uint32_t *sflagsp, NFSPROC_T *p)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
int error;
SVCXPRT *savxprt;
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
return (NFSERR_BADSESSION);
}
error = nfsv4_seqsession(sequenceid, nd->nd_slotid, *highest_slotidp,
sep->sess_slots, NULL, NFSV4_SLOTS - 1);
if (error != 0) {
NFSUNLOCKSESSION(shp);
return (error);
}
if (cache_this != 0)
nd->nd_flag |= ND_SAVEREPLY;
/* Renew the lease. */
sep->sess_clp->lc_expiry = nfsrv_leaseexpiry();
nd->nd_clientid.qval = sep->sess_clp->lc_clientid.qval;
nd->nd_flag |= ND_IMPLIEDCLID;
/*
* If this session handles the backchannel, save the nd_xprt for this
* RPC, since this is the one being used.
*/
if (sep->sess_clp->lc_req.nr_client != NULL &&
(sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0) {
savxprt = sep->sess_cbsess.nfsess_xprt;
SVC_ACQUIRE(nd->nd_xprt);
nd->nd_xprt->xp_p2 =
sep->sess_clp->lc_req.nr_client->cl_private;
nd->nd_xprt->xp_idletimeout = 0; /* Disable timeout. */
sep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
if (savxprt != NULL)
SVC_RELEASE(savxprt);
}
*sflagsp = 0;
if (sep->sess_clp->lc_req.nr_client == NULL)
*sflagsp |= NFSV4SEQ_CBPATHDOWN;
NFSUNLOCKSESSION(shp);
if (error == NFSERR_EXPIRED) {
*sflagsp |= NFSV4SEQ_EXPIREDALLSTATEREVOKED;
error = 0;
} else if (error == NFSERR_ADMINREVOKED) {
*sflagsp |= NFSV4SEQ_ADMINSTATEREVOKED;
error = 0;
}
*highest_slotidp = *target_highest_slotidp = NFSV4_SLOTS - 1;
return (0);
}
/*
* Check/set reclaim complete for this session/clientid.
*/
int
nfsrv_checkreclaimcomplete(struct nfsrv_descript *nd)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
int error = 0;
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSTATE();
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (NFSERR_BADSESSION);
}
/* Check to see if reclaim complete has already happened. */
if ((sep->sess_clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0)
error = NFSERR_COMPLETEALREADY;
else
sep->sess_clp->lc_flags |= LCL_RECLAIMCOMPLETE;
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (error);
}
/*
* Cache the reply in a session slot.
*/
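/*
 * Note (general NFSv4.1 session behaviour, not specific to this call):
 * the per-slot reply cache is what provides exactly-once semantics; a
 * retry carrying the same slot and sequence number is expected to be
 * answered from the cached reply rather than by re-executing the
 * compound.
 */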
void
nfsrv_cache_session(uint8_t *sessionid, uint32_t slotid, int repstat,
struct mbuf **m)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
shp = NFSSESSIONHASH(sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
printf("nfsrv_cache_session: no session\n");
m_freem(*m);
return;
}
nfsv4_seqsess_cacherep(slotid, sep->sess_slots, repstat, m);
NFSUNLOCKSESSION(shp);
}
/*
* Search for a session that matches the sessionid.
*/
static struct nfsdsession *
nfsrv_findsession(uint8_t *sessionid)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
shp = NFSSESSIONHASH(sessionid);
LIST_FOREACH(sep, &shp->list, sess_hash) {
if (!NFSBCMP(sessionid, sep->sess_sessionid, NFSX_V4SESSIONID))
break;
}
return (sep);
}
/*
* Destroy a session.
*/
int
nfsrv_destroysession(struct nfsrv_descript *nd, uint8_t *sessionid)
{
int error, samesess;
samesess = 0;
if (!NFSBCMP(sessionid, nd->nd_sessionid, NFSX_V4SESSIONID)) {
samesess = 1;
if ((nd->nd_flag & ND_LASTOP) == 0)
return (NFSERR_BADSESSION);
}
error = nfsrv_freesession(NULL, sessionid);
if (error == 0 && samesess != 0)
nd->nd_flag &= ~ND_HASSEQUENCE;
return (error);
}
/*
* Free up a session structure.
*/
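/*
 * The session is reference counted; the structure, along with any
 * cached slot replies and the backchannel xprt reference, is only
 * released once sess_refcnt drops to zero below.
 */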
static int
nfsrv_freesession(struct nfsdsession *sep, uint8_t *sessionid)
{
struct nfssessionhash *shp;
int i;
NFSLOCKSTATE();
if (sep == NULL) {
shp = NFSSESSIONHASH(sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(sessionid);
} else {
shp = NFSSESSIONHASH(sep->sess_sessionid);
NFSLOCKSESSION(shp);
}
if (sep != NULL) {
sep->sess_refcnt--;
if (sep->sess_refcnt > 0) {
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (0);
}
LIST_REMOVE(sep, sess_hash);
LIST_REMOVE(sep, sess_list);
}
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
if (sep == NULL)
return (NFSERR_BADSESSION);
for (i = 0; i < NFSV4_SLOTS; i++)
if (sep->sess_slots[i].nfssl_reply != NULL)
m_freem(sep->sess_slots[i].nfssl_reply);
if (sep->sess_cbsess.nfsess_xprt != NULL)
SVC_RELEASE(sep->sess_cbsess.nfsess_xprt);
free(sep, M_NFSDSESSION);
return (0);
}
/*
* Free a stateid.
* RFC5661 says that it should fail when there are associated opens, locks
* or delegations. Since stateids represent opens, I don't see how you can
* free an open stateid (it will be free'd when closed), so this function
* only works for lock stateids (freeing the lock_owner) or delegations.
*/
int
nfsrv_freestateid(struct nfsrv_descript *nd, nfsv4stateid_t *stateidp,
NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsstate *stp;
int error;
NFSLOCKSTATE();
/*
* Look up the stateid
*/
error = nfsrv_getclient((nfsquad_t)((u_quad_t)0), CLOPS_RENEW, &clp,
NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error == 0) {
/* First, check for a delegation. */
LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
NFSX_STATEIDOTHER))
break;
}
if (stp != NULL) {
nfsrv_freedeleg(stp);
NFSUNLOCKSTATE();
return (error);
}
}
/* Not a delegation, try for a lock_owner. */
if (error == 0)
error = nfsrv_getstate(clp, stateidp, 0, &stp);
if (error == 0 && ((stp->ls_flags & (NFSLCK_OPEN | NFSLCK_DELEGREAD |
NFSLCK_DELEGWRITE)) != 0 || (stp->ls_flags & NFSLCK_LOCK) == 0))
/* Not a lock_owner stateid. */
error = NFSERR_LOCKSHELD;
if (error == 0 && !LIST_EMPTY(&stp->ls_lock))
error = NFSERR_LOCKSHELD;
if (error == 0)
nfsrv_freelockowner(stp, NULL, 0, p);
NFSUNLOCKSTATE();
return (error);
}
/*
* Generate the xdr for an NFSv4.1 CBSequence Operation.
*/
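/*
 * The arguments are laid out as the session id followed by five 32-bit
 * words: the slot sequence number, the slot id, the highest slot id,
 * the cachethis boolean and an (empty) referring-call-list count.
 */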
static int
nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
int dont_replycache, struct nfsdsession **sepp)
{
struct nfsdsession *sep;
uint32_t *tl, slotseq = 0;
int maxslot, slotpos;
uint8_t sessionid[NFSX_V4SESSIONID];
int error;
error = nfsv4_getcbsession(clp, sepp);
if (error != 0)
return (error);
sep = *sepp;
(void)nfsv4_sequencelookup(NULL, &sep->sess_cbsess, &slotpos, &maxslot,
&slotseq, sessionid);
KASSERT(maxslot >= 0, ("nfsv4_setcbsequence neg maxslot"));
/* Build the Sequence arguments. */
NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 5 * NFSX_UNSIGNED);
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
nd->nd_slotseq = tl;
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl++ = txdr_unsigned(maxslot);
if (dont_replycache == 0)
*tl++ = newnfs_true;
else
*tl++ = newnfs_false;
*tl = 0; /* No referring call list, for now. */
nd->nd_flag |= ND_HASSEQUENCE;
return (0);
}
/*
* Get a session for the callback.
*/
static int
nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp)
{
struct nfsdsession *sep;
NFSLOCKSTATE();
LIST_FOREACH(sep, &clp->lc_session, sess_list) {
if ((sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0)
break;
}
if (sep == NULL) {
NFSUNLOCKSTATE();
return (NFSERR_BADSESSION);
}
sep->sess_refcnt++;
*sepp = sep;
NFSUNLOCKSTATE();
return (0);
}
/*
* Free up all backchannel xprts. This needs to be done when the nfsd threads
* exit, since those transports will all be going away.
* This is only called after all the nfsd threads are done performing RPCs,
* so locking shouldn't be an issue.
*/
APPLESTATIC void
nfsrv_freeallbackchannel_xprts(void)
{
struct nfsdsession *sep;
struct nfsclient *clp;
SVCXPRT *xprt;
int i;
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
LIST_FOREACH(sep, &clp->lc_session, sess_list) {
xprt = sep->sess_cbsess.nfsess_xprt;
sep->sess_cbsess.nfsess_xprt = NULL;
if (xprt != NULL)
SVC_RELEASE(xprt);
}
}
}
}
Index: head/sys/geom/geom_subr.c
===================================================================
--- head/sys/geom/geom_subr.c (revision 327172)
+++ head/sys/geom/geom_subr.c (revision 327173)
@@ -1,1572 +1,1571 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
* and NAI Labs, the Security Research Division of Network Associates, Inc.
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The names of the authors may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/devicestat.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <machine/stdarg.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KDB
#include <sys/kdb.h>
#endif
struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes);
static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms);
char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim;
struct g_hh00 {
struct g_class *mp;
struct g_provider *pp;
off_t size;
int error;
int post;
};
/*
* This event offers a new class a chance to taste all preexisting providers.
*/
static void
g_load_class(void *arg, int flag)
{
struct g_hh00 *hh;
struct g_class *mp2, *mp;
struct g_geom *gp;
struct g_provider *pp;
g_topology_assert();
if (flag == EV_CANCEL) /* XXX: can't happen ? */
return;
if (g_shutdown)
return;
hh = arg;
mp = hh->mp;
hh->error = 0;
if (hh->post) {
g_free(hh);
hh = NULL;
}
g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name);
KASSERT(mp->name != NULL && *mp->name != '\0',
("GEOM class has no name"));
LIST_FOREACH(mp2, &g_classes, class) {
if (mp2 == mp) {
printf("The GEOM class %s is already loaded.\n",
mp2->name);
if (hh != NULL)
hh->error = EEXIST;
return;
} else if (strcmp(mp2->name, mp->name) == 0) {
printf("A GEOM class %s is already loaded.\n",
mp2->name);
if (hh != NULL)
hh->error = EEXIST;
return;
}
}
LIST_INIT(&mp->geom);
LIST_INSERT_HEAD(&g_classes, mp, class);
if (mp->init != NULL)
mp->init(mp);
if (mp->taste == NULL)
return;
LIST_FOREACH(mp2, &g_classes, class) {
if (mp == mp2)
continue;
LIST_FOREACH(gp, &mp2->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
}
}
static int
g_unload_class(struct g_class *mp)
{
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp;
int error;
g_topology_lock();
g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name);
retry:
G_VALID_CLASS(mp);
LIST_FOREACH(gp, &mp->geom, geom) {
/* We refuse to unload if anything is open */
LIST_FOREACH(pp, &gp->provider, provider)
if (pp->acr || pp->acw || pp->ace) {
g_topology_unlock();
return (EBUSY);
}
LIST_FOREACH(cp, &gp->consumer, consumer)
if (cp->acr || cp->acw || cp->ace) {
g_topology_unlock();
return (EBUSY);
}
/* If the geom is withering, wait for it to finish. */
if (gp->flags & G_GEOM_WITHER) {
g_topology_sleep(mp, 1);
goto retry;
}
}
/*
* We allow unloading if we have no geoms, or a class
* method we can use to get rid of them.
*/
if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) {
g_topology_unlock();
return (EOPNOTSUPP);
}
/* Bar new entries */
mp->taste = NULL;
mp->config = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
error = mp->destroy_geom(NULL, mp, gp);
if (error != 0) {
g_topology_unlock();
return (error);
}
}
/* Wait for withering to finish. */
for (;;) {
gp = LIST_FIRST(&mp->geom);
if (gp == NULL)
break;
KASSERT(gp->flags & G_GEOM_WITHER,
("Non-withering geom in class %s", mp->name));
g_topology_sleep(mp, 1);
}
G_VALID_CLASS(mp);
if (mp->fini != NULL)
mp->fini(mp);
LIST_REMOVE(mp, class);
g_topology_unlock();
return (0);
}
int
g_modevent(module_t mod, int type, void *data)
{
struct g_hh00 *hh;
int error;
static int g_ignition;
struct g_class *mp;
mp = data;
if (mp->version != G_VERSION) {
printf("GEOM class %s has Wrong version %x\n",
mp->name, mp->version);
return (EINVAL);
}
if (!g_ignition) {
g_ignition++;
g_init();
}
error = EOPNOTSUPP;
switch (type) {
case MOD_LOAD:
g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name);
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->mp = mp;
/*
* Once the system is no longer cold, MOD_LOAD calls will come
* from userland and the g_event thread will be able
* to acknowledge their completion.
*/
if (cold) {
hh->post = 1;
error = g_post_event(g_load_class, hh, M_WAITOK, NULL);
} else {
error = g_waitfor_event(g_load_class, hh, M_WAITOK,
NULL);
if (error == 0)
error = hh->error;
g_free(hh);
}
break;
case MOD_UNLOAD:
g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name);
error = g_unload_class(mp);
if (error == 0) {
KASSERT(LIST_EMPTY(&mp->geom),
("Unloaded class (%s) still has geom", mp->name));
}
break;
}
return (error);
}
static void
g_retaste_event(void *arg, int flag)
{
struct g_class *mp, *mp2;
struct g_geom *gp;
struct g_hh00 *hh;
struct g_provider *pp;
struct g_consumer *cp;
g_topology_assert();
if (flag == EV_CANCEL) /* XXX: can't happen ? */
return;
if (g_shutdown || g_notaste)
return;
hh = arg;
mp = hh->mp;
hh->error = 0;
if (hh->post) {
g_free(hh);
hh = NULL;
}
g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name);
LIST_FOREACH(mp2, &g_classes, class) {
LIST_FOREACH(gp, &mp2->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
if (pp->acr || pp->acw || pp->ace)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers) {
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
}
if (cp != NULL) {
cp->flags |= G_CF_ORPHAN;
g_wither_geom(cp->geom, ENXIO);
}
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
}
}
int
g_retaste(struct g_class *mp)
{
struct g_hh00 *hh;
int error;
if (mp->taste == NULL)
return (EINVAL);
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->mp = mp;
if (cold) {
hh->post = 1;
error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL);
} else {
error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL);
if (error == 0)
error = hh->error;
g_free(hh);
}
return (error);
}
struct g_geom *
g_new_geomf(struct g_class *mp, const char *fmt, ...)
{
struct g_geom *gp;
va_list ap;
struct sbuf *sb;
g_topology_assert();
G_VALID_CLASS(mp);
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO);
gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
gp->class = mp;
gp->rank = 1;
LIST_INIT(&gp->consumer);
LIST_INIT(&gp->provider);
LIST_INIT(&gp->aliases);
LIST_INSERT_HEAD(&mp->geom, gp, geom);
TAILQ_INSERT_HEAD(&geoms, gp, geoms);
strcpy(gp->name, sbuf_data(sb));
sbuf_delete(sb);
/* Fill in defaults from class */
gp->start = mp->start;
gp->spoiled = mp->spoiled;
gp->attrchanged = mp->attrchanged;
gp->providergone = mp->providergone;
gp->dumpconf = mp->dumpconf;
gp->access = mp->access;
gp->orphan = mp->orphan;
gp->ioctl = mp->ioctl;
gp->resize = mp->resize;
return (gp);
}
void
g_destroy_geom(struct g_geom *gp)
{
struct g_geom_alias *gap, *gaptmp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name);
KASSERT(LIST_EMPTY(&gp->consumer),
("g_destroy_geom(%s) with consumer(s) [%p]",
gp->name, LIST_FIRST(&gp->consumer)));
KASSERT(LIST_EMPTY(&gp->provider),
("g_destroy_geom(%s) with provider(s) [%p]",
gp->name, LIST_FIRST(&gp->provider)));
g_cancel_event(gp);
LIST_REMOVE(gp, geom);
TAILQ_REMOVE(&geoms, gp, geoms);
LIST_FOREACH_SAFE(gap, &gp->aliases, ga_next, gaptmp)
g_free(gap);
g_free(gp->name);
g_free(gp);
}
/*
* This function is called (repeatedly) until the geom has withered away.
*/
void
g_wither_geom(struct g_geom *gp, int error)
{
struct g_provider *pp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name);
if (!(gp->flags & G_GEOM_WITHER)) {
gp->flags |= G_GEOM_WITHER;
LIST_FOREACH(pp, &gp->provider, provider)
if (!(pp->flags & G_PF_ORPHAN))
g_orphan_provider(pp, error);
}
g_do_wither();
}
/*
* Convenience function to destroy a particular provider.
*/
void
g_wither_provider(struct g_provider *pp, int error)
{
pp->flags |= G_PF_WITHER;
if (!(pp->flags & G_PF_ORPHAN))
g_orphan_provider(pp, error);
}
/*
* This function is called (repeatedly) until the geom has withered away.
*/
void
g_wither_geom_close(struct g_geom *gp, int error)
{
struct g_consumer *cp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name);
LIST_FOREACH(cp, &gp->consumer, consumer)
if (cp->acr || cp->acw || cp->ace)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
g_wither_geom(gp, error);
}
/*
* This function is called (repeatedly) until we can't wash away any more
* withered bits at present.
*/
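/*
 * The sweep below destroys withered providers that have no consumers,
 * then, for withering geoms, detaches and destroys consumers with no
 * open access counts, and finally destroys the geom itself once it has
 * neither providers nor consumers left.
 */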
void
g_wither_washer()
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_provider *pp, *pp2;
struct g_consumer *cp, *cp2;
g_topology_assert();
LIST_FOREACH(mp, &g_classes, class) {
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
if (!(pp->flags & G_PF_WITHER))
continue;
if (LIST_EMPTY(&pp->consumers))
g_destroy_provider(pp);
}
if (!(gp->flags & G_GEOM_WITHER))
continue;
LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
if (LIST_EMPTY(&pp->consumers))
g_destroy_provider(pp);
}
LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) {
if (cp->acr || cp->acw || cp->ace)
continue;
if (cp->provider != NULL)
g_detach(cp);
g_destroy_consumer(cp);
}
if (LIST_EMPTY(&gp->provider) &&
LIST_EMPTY(&gp->consumer))
g_destroy_geom(gp);
}
}
}
struct g_consumer *
g_new_consumer(struct g_geom *gp)
{
struct g_consumer *cp;
g_topology_assert();
G_VALID_GEOM(gp);
KASSERT(!(gp->flags & G_GEOM_WITHER),
("g_new_consumer on WITHERing geom(%s) (class %s)",
gp->name, gp->class->name));
KASSERT(gp->orphan != NULL,
("g_new_consumer on geom(%s) (class %s) without orphan",
gp->name, gp->class->name));
cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO);
cp->geom = gp;
cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
LIST_INSERT_HEAD(&gp->consumer, cp, consumer);
return(cp);
}
void
g_destroy_consumer(struct g_consumer *cp)
{
struct g_geom *gp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp);
KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached"));
KASSERT (cp->acr == 0, ("g_destroy_consumer with acr"));
KASSERT (cp->acw == 0, ("g_destroy_consumer with acw"));
KASSERT (cp->ace == 0, ("g_destroy_consumer with ace"));
g_cancel_event(cp);
gp = cp->geom;
LIST_REMOVE(cp, consumer);
devstat_remove_entry(cp->stat);
g_free(cp);
if (gp->flags & G_GEOM_WITHER)
g_do_wither();
}
static void
g_new_provider_event(void *arg, int flag)
{
struct g_class *mp;
struct g_provider *pp;
struct g_consumer *cp, *next_cp;
g_topology_assert();
if (flag == EV_CANCEL)
return;
if (g_shutdown)
return;
pp = arg;
G_VALID_PROVIDER(pp);
KASSERT(!(pp->flags & G_PF_WITHER),
("g_new_provider_event but withered"));
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) {
if ((cp->flags & G_CF_ORPHAN) == 0 &&
cp->geom->attrchanged != NULL)
cp->geom->attrchanged(cp, "GEOM::media");
}
if (g_notaste)
return;
LIST_FOREACH(mp, &g_classes, class) {
if (mp->taste == NULL)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers)
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
if (cp != NULL)
continue;
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
struct g_provider *
g_new_providerf(struct g_geom *gp, const char *fmt, ...)
{
struct g_provider *pp;
struct sbuf *sb;
va_list ap;
g_topology_assert();
G_VALID_GEOM(gp);
KASSERT(gp->access != NULL,
("new provider on geom(%s) without ->access (class %s)",
gp->name, gp->class->name));
KASSERT(gp->start != NULL,
("new provider on geom(%s) without ->start (class %s)",
gp->name, gp->class->name));
KASSERT(!(gp->flags & G_GEOM_WITHER),
("new provider on WITHERing geom(%s) (class %s)",
gp->name, gp->class->name));
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
pp->name = (char *)(pp + 1);
strcpy(pp->name, sbuf_data(sb));
sbuf_delete(sb);
LIST_INIT(&pp->consumers);
pp->error = ENXIO;
pp->geom = gp;
pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
LIST_INSERT_HEAD(&gp->provider, pp, provider);
g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL);
return (pp);
}
void
g_error_provider(struct g_provider *pp, int error)
{
/* G_VALID_PROVIDER(pp); We may not have g_topology */
pp->error = error;
}
static void
g_resize_provider_event(void *arg, int flag)
{
struct g_hh00 *hh;
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp, *cp2;
off_t size;
g_topology_assert();
if (g_shutdown)
return;
hh = arg;
pp = hh->pp;
size = hh->size;
g_free(hh);
G_VALID_PROVIDER(pp);
KASSERT(!(pp->flags & G_PF_WITHER),
("g_resize_provider_event but withered"));
g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp);
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
gp = cp->geom;
if (gp->resize == NULL && size < pp->mediasize) {
/*
* XXX: The g_dev_orphan method does deferred destruction
* and it is possible that another event has already
* called the orphan method.  Check the consumer's flags
* so we do not schedule it twice.
*/
if (cp->flags & G_CF_ORPHAN)
continue;
cp->flags |= G_CF_ORPHAN;
cp->geom->orphan(cp);
}
}
pp->mediasize = size;
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
gp = cp->geom;
if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL)
gp->resize(cp);
}
/*
* After resizing, the previously invalid GEOM class metadata
* might become valid. This means we should retaste.
*/
LIST_FOREACH(mp, &g_classes, class) {
if (mp->taste == NULL)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers)
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
if (cp != NULL)
continue;
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
void
g_resize_provider(struct g_provider *pp, off_t size)
{
struct g_hh00 *hh;
G_VALID_PROVIDER(pp);
if (pp->flags & G_PF_WITHER)
return;
if (size == pp->mediasize)
return;
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->pp = pp;
hh->size = size;
g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL);
}
#ifndef _PATH_DEV
#define _PATH_DEV "/dev/"
#endif
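/*
 * Look a provider up by name, accepting either a bare name or one
 * prefixed with "/dev/"; a withering match is only returned when no
 * healthy provider of that name exists.
 */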
struct g_provider *
g_provider_by_name(char const *arg)
{
struct g_class *cp;
struct g_geom *gp;
struct g_provider *pp, *wpp;
if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
arg += sizeof(_PATH_DEV) - 1;
wpp = NULL;
LIST_FOREACH(cp, &g_classes, class) {
LIST_FOREACH(gp, &cp->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
if (strcmp(arg, pp->name) != 0)
continue;
if ((gp->flags & G_GEOM_WITHER) == 0 &&
(pp->flags & G_PF_WITHER) == 0)
return (pp);
else
wpp = pp;
}
}
}
return (wpp);
}
void
g_destroy_provider(struct g_provider *pp)
{
struct g_geom *gp;
g_topology_assert();
G_VALID_PROVIDER(pp);
KASSERT(LIST_EMPTY(&pp->consumers),
("g_destroy_provider but attached"));
KASSERT (pp->acr == 0, ("g_destroy_provider with acr"));
KASSERT (pp->acw == 0, ("g_destroy_provider with acw"));
KASSERT (pp->ace == 0, ("g_destroy_provider with ace"));
g_cancel_event(pp);
LIST_REMOVE(pp, provider);
gp = pp->geom;
devstat_remove_entry(pp->stat);
/*
* If a callback was provided, send notification that the provider
* is now gone.
*/
if (gp->providergone != NULL)
gp->providergone(pp);
g_free(pp);
if ((gp->flags & G_GEOM_WITHER))
g_do_wither();
}
/*
* We keep the "geoms" list sorted by topological order (== increasing
* numerical rank) at all times.
* When an attach is done, the attaching geom's rank is invalidated
* and it is moved to the tail of the list.
* All geoms later in the sequence have their ranks reevaluated in
* sequence.  If we cannot assign a rank to a geom because its
* prerequisites do not have rank, we move that element to the tail
* of the sequence with invalid rank as well.
* At some point we encounter our original geom and if we still fail
* to assign it a rank, there must be a loop and we return an error to
* g_attach(), which detaches again and calls redo_rank again
* to fix up the damage.
* It would be much simpler code-wise to do it recursively, but we
* can't risk that on the kernel stack.
*/
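/*
 * Illustrative sketch only: with a disk geom at rank 1 and a
 * partitioning geom consuming it at rank 2, a new geom attaching to one
 * of the partition providers is moved to the tail, re-ranked and ends
 * up at rank 3; a geom whose provider's geom still has rank 0 keeps
 * getting pushed to the tail until its prerequisites are ranked.
 */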
static int
redo_rank(struct g_geom *gp)
{
struct g_consumer *cp;
struct g_geom *gp1, *gp2;
int n, m;
g_topology_assert();
G_VALID_GEOM(gp);
/* Invalidate this geoms rank and move it to the tail */
gp1 = TAILQ_NEXT(gp, geoms);
if (gp1 != NULL) {
gp->rank = 0;
TAILQ_REMOVE(&geoms, gp, geoms);
TAILQ_INSERT_TAIL(&geoms, gp, geoms);
} else {
gp1 = gp;
}
/* re-rank the rest of the sequence */
for (; gp1 != NULL; gp1 = gp2) {
gp1->rank = 0;
m = 1;
LIST_FOREACH(cp, &gp1->consumer, consumer) {
if (cp->provider == NULL)
continue;
n = cp->provider->geom->rank;
if (n == 0) {
m = 0;
break;
} else if (n >= m)
m = n + 1;
}
gp1->rank = m;
gp2 = TAILQ_NEXT(gp1, geoms);
/* got a rank, moving on */
if (m != 0)
continue;
/* no rank to original geom means loop */
if (gp == gp1)
return (ELOOP);
/* no rank, put it at the end and move on */
TAILQ_REMOVE(&geoms, gp1, geoms);
TAILQ_INSERT_TAIL(&geoms, gp1, geoms);
}
return (0);
}
int
g_attach(struct g_consumer *cp, struct g_provider *pp)
{
int error;
g_topology_assert();
G_VALID_CONSUMER(cp);
G_VALID_PROVIDER(pp);
g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp);
KASSERT(cp->provider == NULL, ("attach but attached"));
cp->provider = pp;
cp->flags &= ~G_CF_ORPHAN;
LIST_INSERT_HEAD(&pp->consumers, cp, consumers);
error = redo_rank(cp->geom);
if (error) {
LIST_REMOVE(cp, consumers);
cp->provider = NULL;
redo_rank(cp->geom);
}
return (error);
}
void
g_detach(struct g_consumer *cp)
{
struct g_provider *pp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp);
KASSERT(cp->provider != NULL, ("detach but not attached"));
KASSERT(cp->acr == 0, ("detach but nonzero acr"));
KASSERT(cp->acw == 0, ("detach but nonzero acw"));
KASSERT(cp->ace == 0, ("detach but nonzero ace"));
KASSERT(cp->nstart == cp->nend,
("detach with active requests"));
pp = cp->provider;
LIST_REMOVE(cp, consumers);
cp->provider = NULL;
if ((cp->geom->flags & G_GEOM_WITHER) ||
(pp->geom->flags & G_GEOM_WITHER) ||
(pp->flags & G_PF_WITHER))
g_do_wither();
redo_rank(cp->geom);
}
/*
* g_access()
*
* Access-check with delta values. The question asked is "can consumer
* "cp" change its provider's access counters by the relative amounts dc[rwe] ?"
*/
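/*
 * Illustrative sketch only: g_access(cp, 1, 0, 0) asks to add one read
 * reference on the attached provider, and a later g_access(cp, -1, 0, 0)
 * drops it again; the checks below are made against the provider's
 * counts with this consumer's own contribution subtracted out.
 */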
int
g_access(struct g_consumer *cp, int dcr, int dcw, int dce)
{
struct g_provider *pp;
- int pr,pw,pe;
+ int pw, pe;
int error;
g_topology_assert();
G_VALID_CONSUMER(cp);
pp = cp->provider;
KASSERT(pp != NULL, ("access but not attached"));
G_VALID_PROVIDER(pp);
g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)",
cp, pp->name, dcr, dcw, dce);
KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr"));
KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw"));
KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace"));
KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request"));
KASSERT(pp->geom->access != NULL, ("NULL geom->access"));
/*
* If our class cares about being spoiled, and we have been, we
* are probably just ahead of the event telling us that. Fail
* now rather than having to unravel this later.
*/
if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) &&
(dcr > 0 || dcw > 0 || dce > 0))
return (ENXIO);
/*
* Figure out what counts the provider would have had, if this
* consumer had (r0w0e0) at this time.
*/
- pr = pp->acr - cp->acr;
pw = pp->acw - cp->acw;
pe = pp->ace - cp->ace;
g_trace(G_T_ACCESS,
"open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)",
dcr, dcw, dce,
cp->acr, cp->acw, cp->ace,
pp->acr, pp->acw, pp->ace,
pp, pp->name);
/* If foot-shooting is enabled, any open on rank#1 is OK */
if ((g_debugflags & 16) && pp->geom->rank == 1)
;
/* If we try exclusive but already write: fail */
else if (dce > 0 && pw > 0)
return (EPERM);
/* If we try write but already exclusive: fail */
else if (dcw > 0 && pe > 0)
return (EPERM);
/* If we try to open more but provider is error'ed: fail */
else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) {
printf("%s(%d): provider %s has error %d set\n",
__func__, __LINE__, pp->name, pp->error);
return (pp->error);
}
/* Ok then... */
error = pp->geom->access(pp, dcr, dcw, dce);
KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0,
("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed "
"closing ->access()", pp->geom->class->name, pp->name, dcr, dcw,
dce, error));
if (!error) {
/*
* If we open first write, spoil any partner consumers.
* If we close last write and provider is not errored,
* trigger re-taste.
*/
if (pp->acw == 0 && dcw != 0)
g_spoil(pp, cp);
else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 &&
!(pp->geom->flags & G_GEOM_WITHER))
g_post_event(g_new_provider_event, pp, M_WAITOK,
pp, NULL);
pp->acr += dcr;
pp->acw += dcw;
pp->ace += dce;
cp->acr += dcr;
cp->acw += dcw;
cp->ace += dce;
if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)
KASSERT(pp->sectorsize > 0,
("Provider %s lacks sectorsize", pp->name));
if ((cp->geom->flags & G_GEOM_WITHER) &&
cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
g_do_wither();
}
return (error);
}
int
g_handleattr_int(struct bio *bp, const char *attribute, int val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_str(struct bio *bp, const char *attribute, const char *str)
{
return (g_handleattr(bp, attribute, str, 0));
}
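/*
 * A len of 0 means "val" is a NUL-terminated string to be copied into
 * the request buffer; any other len must match bio_length exactly.
 */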
int
g_handleattr(struct bio *bp, const char *attribute, const void *val, int len)
{
int error = 0;
if (strcmp(bp->bio_attribute, attribute))
return (0);
if (len == 0) {
bzero(bp->bio_data, bp->bio_length);
if (strlcpy(bp->bio_data, val, bp->bio_length) >=
bp->bio_length) {
printf("%s: %s bio_length %jd len %zu -> EFAULT\n",
__func__, bp->bio_to->name,
(intmax_t)bp->bio_length, strlen(val));
error = EFAULT;
}
} else if (bp->bio_length == len) {
bcopy(val, bp->bio_data, len);
} else {
printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__,
bp->bio_to->name, (intmax_t)bp->bio_length, len);
error = EFAULT;
}
if (error == 0)
bp->bio_completed = bp->bio_length;
g_io_deliver(bp, error);
return (1);
}
int
g_std_access(struct g_provider *pp,
int dr __unused, int dw __unused, int de __unused)
{
g_topology_assert();
G_VALID_PROVIDER(pp);
return (0);
}
void
g_std_done(struct bio *bp)
{
struct bio *bp2;
bp2 = bp->bio_parent;
if (bp2->bio_error == 0)
bp2->bio_error = bp->bio_error;
bp2->bio_completed += bp->bio_completed;
g_destroy_bio(bp);
bp2->bio_inbed++;
if (bp2->bio_children == bp2->bio_inbed)
g_io_deliver(bp2, bp2->bio_error);
}
/* XXX: maybe this is only g_slice_spoiled */
void
g_std_spoiled(struct g_consumer *cp)
{
struct g_geom *gp;
struct g_provider *pp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp);
cp->flags |= G_CF_ORPHAN;
g_detach(cp);
gp = cp->geom;
LIST_FOREACH(pp, &gp->provider, provider)
g_orphan_provider(pp, ENXIO);
g_destroy_consumer(cp);
if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer))
g_destroy_geom(gp);
else
gp->flags |= G_GEOM_WITHER;
}
/*
* Spoiling happens when a provider is opened for writing, but consumers
* which are configured by in-band data are attached (slicers for instance).
* Since the write might potentially change the in-band data, such consumers
* need to re-evaluate their existence after the writing session closes.
* We do this by (offering to) tear them down when the open for write happens
* in return for a re-taste when it closes again.
* Together with the fact that such consumers grab an 'e' bit whenever they
* are open, regardless of mode, this ends up DTRT.
*/
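/*
 * Illustrative sketch only: a slicer geom configured from a partition
 * table sits on top of a disk provider; when that provider is opened
 * for writing, the slicer's consumer is marked spoiled and the slicer
 * offers to tear itself down, and when the write open is dropped the
 * re-taste rebuilds it from whatever (possibly changed) partition table
 * is now on the disk.
 */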
static void
g_spoil_event(void *arg, int flag)
{
struct g_provider *pp;
struct g_consumer *cp, *cp2;
g_topology_assert();
if (flag == EV_CANCEL)
return;
pp = arg;
G_VALID_PROVIDER(pp);
g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp,
pp->geom->class->name, pp->geom->name, pp->name);
for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) {
cp2 = LIST_NEXT(cp, consumers);
if ((cp->flags & G_CF_SPOILED) == 0)
continue;
cp->flags &= ~G_CF_SPOILED;
if (cp->geom->spoiled == NULL)
continue;
cp->geom->spoiled(cp);
g_topology_assert();
}
}
void
g_spoil(struct g_provider *pp, struct g_consumer *cp)
{
struct g_consumer *cp2;
g_topology_assert();
G_VALID_PROVIDER(pp);
G_VALID_CONSUMER(cp);
LIST_FOREACH(cp2, &pp->consumers, consumers) {
if (cp2 == cp)
continue;
/*
KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr));
KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw));
*/
KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace));
cp2->flags |= G_CF_SPOILED;
}
g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL);
}
static void
g_media_changed_event(void *arg, int flag)
{
struct g_provider *pp;
int retaste;
g_topology_assert();
if (flag == EV_CANCEL)
return;
pp = arg;
G_VALID_PROVIDER(pp);
/*
* If provider was not open for writing, queue retaste after spoiling.
* If it was, retaste will happen automatically on close.
*/
retaste = (pp->acw == 0 && pp->error == 0 &&
!(pp->geom->flags & G_GEOM_WITHER));
g_spoil_event(arg, flag);
if (retaste)
g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL);
}
int
g_media_changed(struct g_provider *pp, int flag)
{
struct g_consumer *cp;
LIST_FOREACH(cp, &pp->consumers, consumers)
cp->flags |= G_CF_SPOILED;
return (g_post_event(g_media_changed_event, pp, flag, pp, NULL));
}
int
g_media_gone(struct g_provider *pp, int flag)
{
struct g_consumer *cp;
LIST_FOREACH(cp, &pp->consumers, consumers)
cp->flags |= G_CF_SPOILED;
return (g_post_event(g_spoil_event, pp, flag, pp, NULL));
}
int
g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len)
{
int error, i;
i = len;
error = g_io_getattr(attr, cp, &i, var);
if (error)
return (error);
if (i != len)
return (EINVAL);
return (0);
}
static int
g_get_device_prefix_len(const char *name)
{
int len;
if (strncmp(name, "ada", 3) == 0)
len = 3;
else if (strncmp(name, "ad", 2) == 0)
len = 2;
else
return (0);
if (name[len] < '0' || name[len] > '9')
return (0);
do {
len++;
} while (name[len] >= '0' && name[len] <= '9');
return (len);
}
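/*
 * Illustrative sketch only: with the prefix handling above,
 * g_compare_names("ad0", "ada0") returns 1, so a legacy "ad" device
 * name is treated as naming the same device as its "ada" counterpart.
 */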
int
g_compare_names(const char *namea, const char *nameb)
{
int deva, devb;
if (strcmp(namea, nameb) == 0)
return (1);
deva = g_get_device_prefix_len(namea);
if (deva == 0)
return (0);
devb = g_get_device_prefix_len(nameb);
if (devb == 0)
return (0);
if (strcmp(namea + deva, nameb + devb) == 0)
return (1);
return (0);
}
void
g_geom_add_alias(struct g_geom *gp, const char *alias)
{
struct g_geom_alias *gap;
gap = (struct g_geom_alias *)g_malloc(
sizeof(struct g_geom_alias) + strlen(alias) + 1, M_WAITOK);
strcpy((char *)(gap + 1), alias);
gap->ga_alias = (const char *)(gap + 1);
LIST_INSERT_HEAD(&gp->aliases, gap, ga_next);
}
#if defined(DIAGNOSTIC) || defined(DDB)
/*
* This function walks the mesh and returns a non-zero integer if it
* finds that the argument pointer is a GEOM object. The return value
* indicates which type of object it is believed to be. If the topology
* is not locked, this function is potentially dangerous, but we don't
* assert that the topology lock is held when called from the debugger.
*/
int
g_valid_obj(void const *ptr)
{
struct g_class *mp;
struct g_geom *gp;
struct g_consumer *cp;
struct g_provider *pp;
#ifdef KDB
if (kdb_active == 0)
#endif
g_topology_assert();
LIST_FOREACH(mp, &g_classes, class) {
if (ptr == mp)
return (1);
LIST_FOREACH(gp, &mp->geom, geom) {
if (ptr == gp)
return (2);
LIST_FOREACH(cp, &gp->consumer, consumer)
if (ptr == cp)
return (3);
LIST_FOREACH(pp, &gp->provider, provider)
if (ptr == pp)
return (4);
}
}
return(0);
}
#endif
#ifdef DDB
#define gprintf(...) do { \
db_printf("%*s", indent, ""); \
db_printf(__VA_ARGS__); \
} while (0)
#define gprintln(...) do { \
gprintf(__VA_ARGS__); \
db_printf("\n"); \
} while (0)
#define ADDFLAG(obj, flag, sflag) do { \
if ((obj)->flags & (flag)) { \
if (comma) \
strlcat(str, ",", size); \
strlcat(str, (sflag), size); \
comma = 1; \
} \
} while (0)
static char *
provider_flags_to_string(struct g_provider *pp, char *str, size_t size)
{
int comma = 0;
bzero(str, size);
if (pp->flags == 0) {
strlcpy(str, "NONE", size);
return (str);
}
ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER");
ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN");
return (str);
}
static char *
geom_flags_to_string(struct g_geom *gp, char *str, size_t size)
{
int comma = 0;
bzero(str, size);
if (gp->flags == 0) {
strlcpy(str, "NONE", size);
return (str);
}
ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER");
return (str);
}
static void
db_show_geom_consumer(int indent, struct g_consumer *cp)
{
if (indent == 0) {
gprintln("consumer: %p", cp);
gprintln(" class: %s (%p)", cp->geom->class->name,
cp->geom->class);
gprintln(" geom: %s (%p)", cp->geom->name, cp->geom);
if (cp->provider == NULL)
gprintln(" provider: none");
else {
gprintln(" provider: %s (%p)", cp->provider->name,
cp->provider);
}
gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace);
gprintln(" flags: 0x%04x", cp->flags);
gprintln(" nstart: %u", cp->nstart);
gprintln(" nend: %u", cp->nend);
} else {
gprintf("consumer: %p (%s), access=r%dw%de%d", cp,
cp->provider != NULL ? cp->provider->name : "none",
cp->acr, cp->acw, cp->ace);
if (cp->flags)
db_printf(", flags=0x%04x", cp->flags);
db_printf("\n");
}
}
static void
db_show_geom_provider(int indent, struct g_provider *pp)
{
struct g_consumer *cp;
char flags[64];
if (indent == 0) {
gprintln("provider: %s (%p)", pp->name, pp);
gprintln(" class: %s (%p)", pp->geom->class->name,
pp->geom->class);
gprintln(" geom: %s (%p)", pp->geom->name, pp->geom);
gprintln(" mediasize: %jd", (intmax_t)pp->mediasize);
gprintln(" sectorsize: %u", pp->sectorsize);
gprintln(" stripesize: %u", pp->stripesize);
gprintln(" stripeoffset: %u", pp->stripeoffset);
gprintln(" access: r%dw%de%d", pp->acr, pp->acw,
pp->ace);
gprintln(" flags: %s (0x%04x)",
provider_flags_to_string(pp, flags, sizeof(flags)),
pp->flags);
gprintln(" error: %d", pp->error);
gprintln(" nstart: %u", pp->nstart);
gprintln(" nend: %u", pp->nend);
if (LIST_EMPTY(&pp->consumers))
gprintln(" consumers: none");
} else {
gprintf("provider: %s (%p), access=r%dw%de%d",
pp->name, pp, pp->acr, pp->acw, pp->ace);
if (pp->flags != 0) {
db_printf(", flags=%s (0x%04x)",
provider_flags_to_string(pp, flags, sizeof(flags)),
pp->flags);
}
db_printf("\n");
}
if (!LIST_EMPTY(&pp->consumers)) {
LIST_FOREACH(cp, &pp->consumers, consumers) {
db_show_geom_consumer(indent + 2, cp);
if (db_pager_quit)
break;
}
}
}
static void
db_show_geom_geom(int indent, struct g_geom *gp)
{
struct g_provider *pp;
struct g_consumer *cp;
char flags[64];
if (indent == 0) {
gprintln("geom: %s (%p)", gp->name, gp);
gprintln(" class: %s (%p)", gp->class->name, gp->class);
gprintln(" flags: %s (0x%04x)",
geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags);
gprintln(" rank: %d", gp->rank);
if (LIST_EMPTY(&gp->provider))
gprintln(" providers: none");
if (LIST_EMPTY(&gp->consumer))
gprintln(" consumers: none");
} else {
gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank);
if (gp->flags != 0) {
db_printf(", flags=%s (0x%04x)",
geom_flags_to_string(gp, flags, sizeof(flags)),
gp->flags);
}
db_printf("\n");
}
if (!LIST_EMPTY(&gp->provider)) {
LIST_FOREACH(pp, &gp->provider, provider) {
db_show_geom_provider(indent + 2, pp);
if (db_pager_quit)
break;
}
}
if (!LIST_EMPTY(&gp->consumer)) {
LIST_FOREACH(cp, &gp->consumer, consumer) {
db_show_geom_consumer(indent + 2, cp);
if (db_pager_quit)
break;
}
}
}
static void
db_show_geom_class(struct g_class *mp)
{
struct g_geom *gp;
db_printf("class: %s (%p)\n", mp->name, mp);
LIST_FOREACH(gp, &mp->geom, geom) {
db_show_geom_geom(2, gp);
if (db_pager_quit)
break;
}
}
/*
* Print the GEOM topology or the given object.
*/
DB_SHOW_COMMAND(geom, db_show_geom)
{
struct g_class *mp;
if (!have_addr) {
/* No address given, print the entire topology. */
LIST_FOREACH(mp, &g_classes, class) {
db_show_geom_class(mp);
db_printf("\n");
if (db_pager_quit)
break;
}
} else {
switch (g_valid_obj((void *)addr)) {
case 1:
db_show_geom_class((struct g_class *)addr);
break;
case 2:
db_show_geom_geom(0, (struct g_geom *)addr);
break;
case 3:
db_show_geom_consumer(0, (struct g_consumer *)addr);
break;
case 4:
db_show_geom_provider(0, (struct g_provider *)addr);
break;
default:
db_printf("Not a GEOM object.\n");
break;
}
}
}
static void
db_print_bio_cmd(struct bio *bp)
{
db_printf(" cmd: ");
switch (bp->bio_cmd) {
case BIO_READ: db_printf("BIO_READ"); break;
case BIO_WRITE: db_printf("BIO_WRITE"); break;
case BIO_DELETE: db_printf("BIO_DELETE"); break;
case BIO_GETATTR: db_printf("BIO_GETATTR"); break;
case BIO_FLUSH: db_printf("BIO_FLUSH"); break;
case BIO_CMD0: db_printf("BIO_CMD0"); break;
case BIO_CMD1: db_printf("BIO_CMD1"); break;
case BIO_CMD2: db_printf("BIO_CMD2"); break;
case BIO_ZONE: db_printf("BIO_ZONE"); break;
default: db_printf("UNKNOWN"); break;
}
db_printf("\n");
}
static void
db_print_bio_flags(struct bio *bp)
{
int comma;
comma = 0;
db_printf(" flags: ");
if (bp->bio_flags & BIO_ERROR) {
db_printf("BIO_ERROR");
comma = 1;
}
if (bp->bio_flags & BIO_DONE) {
db_printf("%sBIO_DONE", (comma ? ", " : ""));
comma = 1;
}
if (bp->bio_flags & BIO_ONQUEUE)
db_printf("%sBIO_ONQUEUE", (comma ? ", " : ""));
db_printf("\n");
}
/*
* Print useful information in a BIO
*/
DB_SHOW_COMMAND(bio, db_show_bio)
{
struct bio *bp;
if (have_addr) {
bp = (struct bio *)addr;
db_printf("BIO %p\n", bp);
db_print_bio_cmd(bp);
db_print_bio_flags(bp);
db_printf(" cflags: 0x%hx\n", bp->bio_cflags);
db_printf(" pflags: 0x%hx\n", bp->bio_pflags);
db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset);
db_printf(" length: %jd\n", (intmax_t)bp->bio_length);
db_printf(" bcount: %ld\n", bp->bio_bcount);
db_printf(" resid: %ld\n", bp->bio_resid);
db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed);
db_printf(" children: %u\n", bp->bio_children);
db_printf(" inbed: %u\n", bp->bio_inbed);
db_printf(" error: %d\n", bp->bio_error);
db_printf(" parent: %p\n", bp->bio_parent);
db_printf(" driver1: %p\n", bp->bio_driver1);
db_printf(" driver2: %p\n", bp->bio_driver2);
db_printf(" caller1: %p\n", bp->bio_caller1);
db_printf(" caller2: %p\n", bp->bio_caller2);
db_printf(" bio_from: %p\n", bp->bio_from);
db_printf(" bio_to: %p\n", bp->bio_to);
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
db_printf(" bio_track_bp: %p\n", bp->bio_track_bp);
#endif
}
}
#undef gprintf
#undef gprintln
#undef ADDFLAG
#endif /* DDB */
Index: head/sys/geom/raid/g_raid.c
===================================================================
--- head/sys/geom/raid/g_raid.c (revision 327172)
+++ head/sys/geom/raid/g_raid.c (revision 327173)
@@ -1,2577 +1,2575 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid/g_raid.h>
#include "g_raid_md_if.h"
#include "g_raid_tr_if.h"
static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
int g_raid_enable = 1;
SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
&g_raid_enable, 0, "Enable on-disk metadata taste");
u_int g_raid_aggressive_spare = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
&g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
u_int g_raid_debug = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
"Debug level");
int g_raid_read_err_thresh = 10;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
&g_raid_read_err_thresh, 0,
"Number of read errors equated to disk failure");
u_int g_raid_start_timeout = 30;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
&g_raid_start_timeout, 0,
"Time to wait for all array components");
static u_int g_raid_clean_time = 5;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
&g_raid_clean_time, 0, "Mark volume as clean when idling");
static u_int g_raid_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
&g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid_name_format = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
&g_raid_name_format, 0, "Providers name format.");
static u_int g_raid_idle_threshold = 1000000;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
&g_raid_idle_threshold, 1000000,
"Time in microseconds to consider a volume idle.");
#define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \
G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \
rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \
G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \
} while (0)
LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
LIST_HEAD_INITIALIZER(g_raid_md_classes);
LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
LIST_HEAD_INITIALIZER(g_raid_tr_classes);
LIST_HEAD(, g_raid_volume) g_raid_volumes =
LIST_HEAD_INITIALIZER(g_raid_volumes);
static eventhandler_tag g_raid_post_sync = NULL;
static int g_raid_started = 0;
static int g_raid_shutdown = 0;
static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);
struct g_class g_raid_class = {
.name = G_RAID_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_raid_ctl,
.taste = g_raid_taste,
.destroy_geom = g_raid_destroy_geom,
.init = g_raid_init,
.fini = g_raid_fini
};
static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);
static const char *
g_raid_node_event2str(int event)
{
switch (event) {
case G_RAID_NODE_E_WAKE:
return ("WAKE");
case G_RAID_NODE_E_START:
return ("START");
default:
return ("INVALID");
}
}
const char *
g_raid_disk_state2str(int state)
{
switch (state) {
case G_RAID_DISK_S_NONE:
return ("NONE");
case G_RAID_DISK_S_OFFLINE:
return ("OFFLINE");
case G_RAID_DISK_S_DISABLED:
return ("DISABLED");
case G_RAID_DISK_S_FAILED:
return ("FAILED");
case G_RAID_DISK_S_STALE_FAILED:
return ("STALE_FAILED");
case G_RAID_DISK_S_SPARE:
return ("SPARE");
case G_RAID_DISK_S_STALE:
return ("STALE");
case G_RAID_DISK_S_ACTIVE:
return ("ACTIVE");
default:
return ("INVALID");
}
}
static const char *
g_raid_disk_event2str(int event)
{
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
return ("DISCONNECTED");
default:
return ("INVALID");
}
}
const char *
g_raid_subdisk_state2str(int state)
{
switch (state) {
case G_RAID_SUBDISK_S_NONE:
return ("NONE");
case G_RAID_SUBDISK_S_FAILED:
return ("FAILED");
case G_RAID_SUBDISK_S_NEW:
return ("NEW");
case G_RAID_SUBDISK_S_REBUILD:
return ("REBUILD");
case G_RAID_SUBDISK_S_UNINITIALIZED:
return ("UNINITIALIZED");
case G_RAID_SUBDISK_S_STALE:
return ("STALE");
case G_RAID_SUBDISK_S_RESYNC:
return ("RESYNC");
case G_RAID_SUBDISK_S_ACTIVE:
return ("ACTIVE");
default:
return ("INVALID");
}
}
static const char *
g_raid_subdisk_event2str(int event)
{
switch (event) {
case G_RAID_SUBDISK_E_NEW:
return ("NEW");
case G_RAID_SUBDISK_E_FAILED:
return ("FAILED");
case G_RAID_SUBDISK_E_DISCONNECTED:
return ("DISCONNECTED");
default:
return ("INVALID");
}
}
const char *
g_raid_volume_state2str(int state)
{
switch (state) {
case G_RAID_VOLUME_S_STARTING:
return ("STARTING");
case G_RAID_VOLUME_S_BROKEN:
return ("BROKEN");
case G_RAID_VOLUME_S_DEGRADED:
return ("DEGRADED");
case G_RAID_VOLUME_S_SUBOPTIMAL:
return ("SUBOPTIMAL");
case G_RAID_VOLUME_S_OPTIMAL:
return ("OPTIMAL");
case G_RAID_VOLUME_S_UNSUPPORTED:
return ("UNSUPPORTED");
case G_RAID_VOLUME_S_STOPPED:
return ("STOPPED");
default:
return ("INVALID");
}
}
static const char *
g_raid_volume_event2str(int event)
{
switch (event) {
case G_RAID_VOLUME_E_UP:
return ("UP");
case G_RAID_VOLUME_E_DOWN:
return ("DOWN");
case G_RAID_VOLUME_E_START:
return ("START");
case G_RAID_VOLUME_E_STARTMD:
return ("STARTMD");
default:
return ("INVALID");
}
}
const char *
g_raid_volume_level2str(int level, int qual)
{
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
return ("RAID0");
case G_RAID_VOLUME_RL_RAID1:
return ("RAID1");
case G_RAID_VOLUME_RL_RAID3:
if (qual == G_RAID_VOLUME_RLQ_R3P0)
return ("RAID3-P0");
if (qual == G_RAID_VOLUME_RLQ_R3PN)
return ("RAID3-PN");
return ("RAID3");
case G_RAID_VOLUME_RL_RAID4:
if (qual == G_RAID_VOLUME_RLQ_R4P0)
return ("RAID4-P0");
if (qual == G_RAID_VOLUME_RLQ_R4PN)
return ("RAID4-PN");
return ("RAID4");
case G_RAID_VOLUME_RL_RAID5:
if (qual == G_RAID_VOLUME_RLQ_R5RA)
return ("RAID5-RA");
if (qual == G_RAID_VOLUME_RLQ_R5RS)
return ("RAID5-RS");
if (qual == G_RAID_VOLUME_RLQ_R5LA)
return ("RAID5-LA");
if (qual == G_RAID_VOLUME_RLQ_R5LS)
return ("RAID5-LS");
return ("RAID5");
case G_RAID_VOLUME_RL_RAID6:
if (qual == G_RAID_VOLUME_RLQ_R6RA)
return ("RAID6-RA");
if (qual == G_RAID_VOLUME_RLQ_R6RS)
return ("RAID6-RS");
if (qual == G_RAID_VOLUME_RLQ_R6LA)
return ("RAID6-LA");
if (qual == G_RAID_VOLUME_RLQ_R6LS)
return ("RAID6-LS");
return ("RAID6");
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
return ("RAIDMDF-RA");
if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
return ("RAIDMDF-RS");
if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
return ("RAIDMDF-LA");
if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
return ("RAIDMDF-LS");
return ("RAIDMDF");
case G_RAID_VOLUME_RL_RAID1E:
if (qual == G_RAID_VOLUME_RLQ_R1EA)
return ("RAID1E-A");
if (qual == G_RAID_VOLUME_RLQ_R1EO)
return ("RAID1E-O");
return ("RAID1E");
case G_RAID_VOLUME_RL_SINGLE:
return ("SINGLE");
case G_RAID_VOLUME_RL_CONCAT:
return ("CONCAT");
case G_RAID_VOLUME_RL_RAID5E:
if (qual == G_RAID_VOLUME_RLQ_R5ERA)
return ("RAID5E-RA");
if (qual == G_RAID_VOLUME_RLQ_R5ERS)
return ("RAID5E-RS");
if (qual == G_RAID_VOLUME_RLQ_R5ELA)
return ("RAID5E-LA");
if (qual == G_RAID_VOLUME_RLQ_R5ELS)
return ("RAID5E-LS");
return ("RAID5E");
case G_RAID_VOLUME_RL_RAID5EE:
if (qual == G_RAID_VOLUME_RLQ_R5EERA)
return ("RAID5EE-RA");
if (qual == G_RAID_VOLUME_RLQ_R5EERS)
return ("RAID5EE-RS");
if (qual == G_RAID_VOLUME_RLQ_R5EELA)
return ("RAID5EE-LA");
if (qual == G_RAID_VOLUME_RLQ_R5EELS)
return ("RAID5EE-LS");
return ("RAID5EE");
case G_RAID_VOLUME_RL_RAID5R:
if (qual == G_RAID_VOLUME_RLQ_R5RRA)
return ("RAID5R-RA");
if (qual == G_RAID_VOLUME_RLQ_R5RRS)
return ("RAID5R-RS");
if (qual == G_RAID_VOLUME_RLQ_R5RLA)
return ("RAID5R-LA");
if (qual == G_RAID_VOLUME_RLQ_R5RLS)
return ("RAID5R-LS");
return ("RAID5E");
default:
return ("UNKNOWN");
}
}
int
g_raid_volume_str2level(const char *str, int *level, int *qual)
{
*level = G_RAID_VOLUME_RL_UNKNOWN;
*qual = G_RAID_VOLUME_RLQ_NONE;
if (strcasecmp(str, "RAID0") == 0)
*level = G_RAID_VOLUME_RL_RAID0;
else if (strcasecmp(str, "RAID1") == 0)
*level = G_RAID_VOLUME_RL_RAID1;
else if (strcasecmp(str, "RAID3-P0") == 0) {
*level = G_RAID_VOLUME_RL_RAID3;
*qual = G_RAID_VOLUME_RLQ_R3P0;
} else if (strcasecmp(str, "RAID3-PN") == 0 ||
strcasecmp(str, "RAID3") == 0) {
*level = G_RAID_VOLUME_RL_RAID3;
*qual = G_RAID_VOLUME_RLQ_R3PN;
} else if (strcasecmp(str, "RAID4-P0") == 0) {
*level = G_RAID_VOLUME_RL_RAID4;
*qual = G_RAID_VOLUME_RLQ_R4P0;
} else if (strcasecmp(str, "RAID4-PN") == 0 ||
strcasecmp(str, "RAID4") == 0) {
*level = G_RAID_VOLUME_RL_RAID4;
*qual = G_RAID_VOLUME_RLQ_R4PN;
} else if (strcasecmp(str, "RAID5-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5RA;
} else if (strcasecmp(str, "RAID5-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5RS;
} else if (strcasecmp(str, "RAID5") == 0 ||
strcasecmp(str, "RAID5-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5LA;
} else if (strcasecmp(str, "RAID5-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5LS;
} else if (strcasecmp(str, "RAID6-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6RA;
} else if (strcasecmp(str, "RAID6-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6RS;
} else if (strcasecmp(str, "RAID6") == 0 ||
strcasecmp(str, "RAID6-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6LA;
} else if (strcasecmp(str, "RAID6-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6LS;
} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFRA;
} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFRS;
} else if (strcasecmp(str, "RAIDMDF") == 0 ||
strcasecmp(str, "RAIDMDF-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFLA;
} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFLS;
} else if (strcasecmp(str, "RAID10") == 0 ||
strcasecmp(str, "RAID1E") == 0 ||
strcasecmp(str, "RAID1E-A") == 0) {
*level = G_RAID_VOLUME_RL_RAID1E;
*qual = G_RAID_VOLUME_RLQ_R1EA;
} else if (strcasecmp(str, "RAID1E-O") == 0) {
*level = G_RAID_VOLUME_RL_RAID1E;
*qual = G_RAID_VOLUME_RLQ_R1EO;
} else if (strcasecmp(str, "SINGLE") == 0)
*level = G_RAID_VOLUME_RL_SINGLE;
else if (strcasecmp(str, "CONCAT") == 0)
*level = G_RAID_VOLUME_RL_CONCAT;
else if (strcasecmp(str, "RAID5E-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ERA;
} else if (strcasecmp(str, "RAID5E-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ERS;
} else if (strcasecmp(str, "RAID5E") == 0 ||
strcasecmp(str, "RAID5E-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ELA;
} else if (strcasecmp(str, "RAID5E-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ELS;
} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EERA;
} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EERS;
} else if (strcasecmp(str, "RAID5EE") == 0 ||
strcasecmp(str, "RAID5EE-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EELA;
} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EELS;
} else if (strcasecmp(str, "RAID5R-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RRA;
} else if (strcasecmp(str, "RAID5R-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RRS;
} else if (strcasecmp(str, "RAID5R") == 0 ||
strcasecmp(str, "RAID5R-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RLA;
} else if (strcasecmp(str, "RAID5R-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RLS;
} else
return (-1);
return (0);
}
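/*
 * Note that, as the parser above shows, a bare level name implies a default
 * qualifier: "RAID3" and "RAID4" map to the -PN variants, "RAID5", "RAID6",
 * "RAIDMDF", "RAID5E", "RAID5EE" and "RAID5R" map to their left-asymmetric
 * (-LA) variants, and "RAID10" is accepted as an alias for "RAID1E-A".
 */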
const char *
g_raid_get_diskname(struct g_raid_disk *disk)
{
if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
return ("[unknown]");
return (disk->d_consumer->provider->name);
}
void
g_raid_get_disk_info(struct g_raid_disk *disk)
{
struct g_consumer *cp = disk->d_consumer;
int error, len;
/* Read kernel dumping information. */
disk->d_kd.offset = 0;
disk->d_kd.length = OFF_MAX;
len = sizeof(disk->d_kd);
error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
if (error)
disk->d_kd.di.dumper = NULL;
if (disk->d_kd.di.dumper == NULL)
G_RAID_DEBUG1(2, disk->d_softc,
"Dumping not supported by %s: %d.",
cp->provider->name, error);
/* Read BIO_DELETE support. */
error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
if (error)
disk->d_candelete = 0;
if (!disk->d_candelete)
G_RAID_DEBUG1(2, disk->d_softc,
"BIO_DELETE not supported by %s: %d.",
cp->provider->name, error);
}
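/*
 * Both probes above are plain GEOM attribute requests: GEOM::kerneldump
 * asks the underlying provider for a kernel dump routine, and
 * GEOM::candelete asks whether it supports BIO_DELETE.  Either request may
 * fail, in which case the corresponding feature is simply treated as
 * unavailable for this disk.
 */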
void
g_raid_report_disk_state(struct g_raid_disk *disk)
{
struct g_raid_subdisk *sd;
int len, state;
uint32_t s;
if (disk->d_consumer == NULL)
return;
if (disk->d_state == G_RAID_DISK_S_DISABLED) {
s = G_STATE_ACTIVE; /* XXX */
} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
s = G_STATE_FAILED;
} else {
state = G_RAID_SUBDISK_S_ACTIVE;
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
if (sd->sd_state < state)
state = sd->sd_state;
}
if (state == G_RAID_SUBDISK_S_FAILED)
s = G_STATE_FAILED;
else if (state == G_RAID_SUBDISK_S_NEW ||
state == G_RAID_SUBDISK_S_REBUILD)
s = G_STATE_REBUILD;
else if (state == G_RAID_SUBDISK_S_STALE ||
state == G_RAID_SUBDISK_S_RESYNC)
s = G_STATE_RESYNC;
else
s = G_STATE_ACTIVE;
}
len = sizeof(s);
g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
g_raid_get_diskname(disk), s);
}
void
g_raid_change_disk_state(struct g_raid_disk *disk, int state)
{
G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
g_raid_get_diskname(disk),
g_raid_disk_state2str(disk->d_state),
g_raid_disk_state2str(state));
disk->d_state = state;
g_raid_report_disk_state(disk);
}
void
g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
{
G_RAID_DEBUG1(0, sd->sd_softc,
"Subdisk %s:%d-%s state changed from %s to %s.",
sd->sd_volume->v_name, sd->sd_pos,
sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
g_raid_subdisk_state2str(sd->sd_state),
g_raid_subdisk_state2str(state));
sd->sd_state = state;
if (sd->sd_disk)
g_raid_report_disk_state(sd->sd_disk);
}
void
g_raid_change_volume_state(struct g_raid_volume *vol, int state)
{
G_RAID_DEBUG1(0, vol->v_softc,
"Volume %s state changed from %s to %s.",
vol->v_name,
g_raid_volume_state2str(vol->v_state),
g_raid_volume_state2str(state));
vol->v_state = state;
}
/*
* --- Events handling functions ---
* Events in geom_raid are used to maintain the status of subdisks and
* volumes from a single thread, which simplifies locking.
*/
static void
g_raid_event_free(struct g_raid_event *ep)
{
free(ep, M_RAID);
}
int
g_raid_event_send(void *arg, int event, int flags)
{
struct g_raid_softc *sc;
struct g_raid_event *ep;
int error;
if ((flags & G_RAID_EVENT_VOLUME) != 0) {
sc = ((struct g_raid_volume *)arg)->v_softc;
} else if ((flags & G_RAID_EVENT_DISK) != 0) {
sc = ((struct g_raid_disk *)arg)->d_softc;
} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
sc = ((struct g_raid_subdisk *)arg)->sd_softc;
} else {
sc = arg;
}
ep = malloc(sizeof(*ep), M_RAID,
sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->e_tgt = arg;
ep->e_event = event;
ep->e_flags = flags;
ep->e_error = 0;
G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
if ((flags & G_RAID_EVENT_WAIT) == 0)
return (0);
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
sx_xunlock(&sc->sc_lock);
while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
mtx_lock(&sc->sc_queue_mtx);
MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
hz * 5);
}
error = ep->e_error;
g_raid_event_free(ep);
sx_xlock(&sc->sc_lock);
return (error);
}
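/*
 * Typical asynchronous usage, as seen elsewhere in this file, looks like:
 *
 *	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
 *	    G_RAID_EVENT_DISK);
 *
 * Adding G_RAID_EVENT_WAIT makes the call synchronous: the caller must hold
 * sc_lock exclusively, the lock is dropped while sleeping, and the handler's
 * error code (or ECANCELED if the event was cancelled) is returned.
 */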
static void
g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
{
struct g_raid_event *ep, *tmpep;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
if (ep->e_tgt != tgt)
continue;
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
g_raid_event_free(ep);
else {
ep->e_error = ECANCELED;
wakeup(ep);
}
}
mtx_unlock(&sc->sc_queue_mtx);
}
static int
g_raid_event_check(struct g_raid_softc *sc, void *tgt)
{
struct g_raid_event *ep;
int res = 0;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
if (ep->e_tgt != tgt)
continue;
res = 1;
break;
}
mtx_unlock(&sc->sc_queue_mtx);
return (res);
}
/*
* Return the number of disks in the given state.
* If state is equal to -1, count all connected disks.
*/
u_int
g_raid_ndisks(struct g_raid_softc *sc, int state)
{
struct g_raid_disk *disk;
u_int n;
sx_assert(&sc->sc_lock, SX_LOCKED);
n = 0;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == state || state == -1)
n++;
}
return (n);
}
/*
* Return the number of subdisks in the given state.
* If state is equal to -1, count all connected subdisks.
*/
u_int
g_raid_nsubdisks(struct g_raid_volume *vol, int state)
{
struct g_raid_subdisk *subdisk;
struct g_raid_softc *sc;
u_int i, n;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
n = 0;
for (i = 0; i < vol->v_disks_count; i++) {
subdisk = &vol->v_subdisks[i];
if ((state == -1 &&
subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
subdisk->sd_state == state)
n++;
}
return (n);
}
/*
* Return the first subdisk in the given state.
* If state is equal to -1, return the first connected subdisk.
*/
struct g_raid_subdisk *
g_raid_get_subdisk(struct g_raid_volume *vol, int state)
{
struct g_raid_subdisk *sd;
struct g_raid_softc *sc;
u_int i;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if ((state == -1 &&
sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
sd->sd_state == state)
return (sd);
}
return (NULL);
}
struct g_consumer *
g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
{
struct g_consumer *cp;
struct g_provider *pp;
g_topology_assert();
if (strncmp(name, "/dev/", 5) == 0)
name += 5;
pp = g_provider_by_name(name);
if (pp == NULL)
return (NULL);
cp = g_new_consumer(sc->sc_geom);
cp->flags |= G_CF_DIRECT_RECEIVE;
if (g_attach(cp, pp) != 0) {
g_destroy_consumer(cp);
return (NULL);
}
if (g_access(cp, 1, 1, 1) != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (NULL);
}
return (cp);
}
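/*
 * The g_access(cp, 1, 1, 1) above opens the provider for read, write and
 * exclusive access in one step; g_raid_kill_consumer() later drops whatever
 * access counts remain before detaching and destroying the consumer.
 */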
static u_int
g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
{
struct bio *bp;
u_int nreqs = 0;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_from == cp)
nreqs++;
}
mtx_unlock(&sc->sc_queue_mtx);
return (nreqs);
}
u_int
g_raid_nopens(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
u_int opens;
opens = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_provider_open != 0)
opens++;
}
return (opens);
}
static int
g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
{
if (cp->index > 0) {
G_RAID_DEBUG1(2, sc,
"I/O requests for %s exist, can't destroy it now.",
cp->provider->name);
return (1);
}
if (g_raid_nrequests(sc, cp) > 0) {
G_RAID_DEBUG1(2, sc,
"I/O requests for %s in queue, can't destroy it now.",
cp->provider->name);
return (1);
}
return (0);
}
static void
g_raid_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *cp;
g_topology_assert();
cp = arg;
G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
g_detach(cp);
g_destroy_consumer(cp);
}
void
g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
{
struct g_provider *pp;
int retaste_wait;
g_topology_assert_not();
g_topology_lock();
cp->private = NULL;
if (g_raid_consumer_is_busy(sc, cp))
goto out;
pp = cp->provider;
retaste_wait = 0;
if (cp->acw == 1) {
if ((pp->geom->flags & G_GEOM_WITHER) == 0)
retaste_wait = 1;
}
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
if (retaste_wait) {
/*
* After the retaste event has been sent (inside g_access()), we can
* post an event to detach and destroy the consumer.
* A class that still has a consumer attached to the given provider
* will not receive a retaste event for that provider.
* This is how retaste events are ignored when closing consumers
* opened for write: the consumer is detached and destroyed after
* the retaste event has been sent.
*/
g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
goto out;
}
G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
g_detach(cp);
g_destroy_consumer(cp);
out:
g_topology_unlock();
}
static void
g_raid_orphan(struct g_consumer *cp)
{
struct g_raid_disk *disk;
g_topology_assert();
disk = cp->private;
if (disk == NULL)
return;
g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
G_RAID_EVENT_DISK);
}
static void
g_raid_clean(struct g_raid_volume *vol, int acw)
{
struct g_raid_softc *sc;
int timeout;
sc = vol->v_softc;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
// return;
if (!vol->v_dirty)
return;
if (vol->v_writes > 0)
return;
if (acw > 0 || (acw == -1 &&
vol->v_provider != NULL && vol->v_provider->acw > 0)) {
timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
if (!g_raid_shutdown && timeout > 0)
return;
}
vol->v_dirty = 0;
G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
vol->v_name);
g_raid_write_metadata(sc, vol, NULL, NULL);
}
static void
g_raid_dirty(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
sc = vol->v_softc;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
// return;
vol->v_dirty = 1;
G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
vol->v_name);
g_raid_write_metadata(sc, vol, NULL, NULL);
}
void
g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
{
- struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
int i;
vol = tr->tro_volume;
- sc = vol->v_softc;
/*
* Allocate all bios before sending any request, so we can return
* ENOMEM in a nice and clean way.
*/
bioq_init(&queue);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_caller1 = sd;
bioq_insert_tail(&queue, cbp);
}
while ((cbp = bioq_takefirst(&queue)) != NULL) {
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
while ((cbp = bioq_takefirst(&queue)) != NULL)
g_destroy_bio(cbp);
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
static void
g_raid_tr_kerneldump_common_done(struct bio *bp)
{
bp->bio_flags |= BIO_DONE;
}
int
g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct bio bp;
vol = tr->tro_volume;
sc = vol->v_softc;
g_reset_bio(&bp);
bp.bio_cmd = BIO_WRITE;
bp.bio_done = g_raid_tr_kerneldump_common_done;
bp.bio_attribute = NULL;
bp.bio_offset = offset;
bp.bio_length = length;
bp.bio_data = virtual;
bp.bio_to = vol->v_provider;
g_raid_start(&bp);
while (!(bp.bio_flags & BIO_DONE)) {
G_RAID_DEBUG1(4, sc, "Poll...");
g_raid_poll(sc);
DELAY(10);
}
return (bp.bio_error != 0 ? EIO : 0);
}
static int
g_raid_dump(void *arg,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct g_raid_volume *vol;
int error;
vol = (struct g_raid_volume *)arg;
G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
(long long unsigned)offset, (long long unsigned)length);
error = G_RAID_TR_KERNELDUMP(vol->v_tr,
virtual, physical, offset, length);
return (error);
}
static void
g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
{
struct g_kerneldump *gkd;
struct g_provider *pp;
struct g_raid_volume *vol;
gkd = (struct g_kerneldump*)bp->bio_data;
pp = bp->bio_to;
vol = pp->private;
g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
gkd->di.dumper = g_raid_dump;
gkd->di.priv = vol;
gkd->di.blocksize = vol->v_sectorsize;
gkd->di.maxiosize = DFLTPHYS;
gkd->di.mediaoffset = gkd->offset;
if ((gkd->offset + gkd->length) > vol->v_mediasize)
gkd->length = vol->v_mediasize - gkd->offset;
gkd->di.mediasize = gkd->length;
g_io_deliver(bp, 0);
}
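/*
 * The dumper registered here (g_raid_dump) feeds dump writes back through
 * the regular g_raid_start() path.  Because interrupts cannot be relied on
 * while dumping, g_raid_tr_kerneldump_common() polls the node with
 * g_raid_poll() until the bio is marked done, and g_raid_subdisk_iostart()
 * short-circuits to g_raid_subdisk_kerneldump() when "dumping" is set.
 */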
static void
g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
{
struct g_provider *pp;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
int *val;
int i;
val = (int *)bp->bio_data;
pp = bp->bio_to;
vol = pp->private;
*val = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
continue;
if (sd->sd_disk->d_candelete) {
*val = 1;
break;
}
}
g_io_deliver(bp, 0);
}
static void
g_raid_start(struct bio *bp)
{
struct g_raid_softc *sc;
sc = bp->bio_to->geom->softc;
/*
* If sc == NULL or there are no valid disks, provider's error
* should be set and g_raid_start() should not be called at all.
*/
// KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
// ("Provider's error should be set (error=%d)(mirror=%s).",
// bp->bio_to->error, bp->bio_to->name));
G_RAID_LOGREQ(3, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
case BIO_FLUSH:
break;
case BIO_GETATTR:
if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
g_raid_candelete(sc, bp);
else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
g_raid_kerneldump(sc, bp);
else
g_io_deliver(bp, EOPNOTSUPP);
return;
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
if (!dumping) {
G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
wakeup(sc);
}
}
static int
g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
{
/*
* 5 cases:
* (1) bp entirely below NO
* (2) bp entirely above NO
* (3) bp start below, but end in range YES
* (4) bp entirely within YES
* (5) bp starts within, ends above YES
*
* lock range 10-19 (offset 10 length 10)
* (1) 1-5: first if kicks it out
* (2) 30-35: second if kicks it out
* (3) 5-15: passes both ifs
* (4) 12-14: passes both ifs
* (5) 19-20: passes both
*/
off_t lend = lstart + len - 1;
off_t bstart = bp->bio_offset;
off_t bend = bp->bio_offset + bp->bio_length - 1;
if (bend < lstart)
return (0);
if (lend < bstart)
return (0);
return (1);
}
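/*
 * Worked example for the lock range 10-19 above: a bio with bio_offset 5
 * and bio_length 10 covers 5-14, so bend = 14 >= lstart = 10 and
 * lend = 19 >= bstart = 5, and the function returns 1 (case 3).  A bio
 * covering 1-5 is rejected by the first test (bend = 5 < lstart = 10),
 * and one covering 30-35 by the second (lend = 19 < bstart = 30).
 */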
static int
g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
{
struct g_raid_lock *lp;
sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
return (1);
}
return (0);
}
static void
g_raid_start_request(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = bp->bio_to->geom->softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
vol = bp->bio_to->private;
/*
* Check to see if this item is in a locked range. If so,
* queue it to our locked queue and return. We'll requeue
* it when the range is unlocked. Internal I/O for the
* rebuild/rescan/recovery process is excluded from this
* check so we can actually do the recovery.
*/
if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
g_raid_is_in_locked_range(vol, bp)) {
G_RAID_LOGREQ(3, bp, "Defer request.");
bioq_insert_tail(&vol->v_locked, bp);
return;
}
/*
* If we're actually going to do the write/delete, then
* update the idle stats for the volume.
*/
if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
if (!vol->v_dirty)
g_raid_dirty(vol);
vol->v_writes++;
}
/*
* Put the request onto the in-flight queue, so that new synchronization
* requests can be checked for collisions against it. Then tell
* the transformation layer to start the I/O.
*/
bioq_insert_tail(&vol->v_inflight, bp);
G_RAID_LOGREQ(4, bp, "Request started");
G_RAID_TR_IOSTART(vol->v_tr, bp);
}
static void
g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
{
off_t off, len;
struct bio *nbp;
struct g_raid_lock *lp;
vol->v_pending_lock = 0;
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (lp->l_pending) {
off = lp->l_offset;
len = lp->l_length;
lp->l_pending = 0;
TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
if (g_raid_bio_overlaps(nbp, off, len))
lp->l_pending++;
}
if (lp->l_pending) {
vol->v_pending_lock = 1;
G_RAID_DEBUG1(4, vol->v_softc,
"Deferred lock(%jd, %jd) has %d pending",
(intmax_t)off, (intmax_t)(off + len),
lp->l_pending);
continue;
}
G_RAID_DEBUG1(4, vol->v_softc,
"Deferred lock of %jd to %jd completed",
(intmax_t)off, (intmax_t)(off + len));
G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
}
}
}
void
g_raid_iodone(struct bio *bp, int error)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = bp->bio_to->geom->softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
vol = bp->bio_to->private;
G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
/* Update stats if we completed a write/delete. */
if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
vol->v_writes--;
vol->v_last_write = time_uptime;
}
bioq_remove(&vol->v_inflight, bp);
if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
g_raid_finish_with_locked_ranges(vol, bp);
getmicrouptime(&vol->v_last_done);
g_io_deliver(bp, error);
}
int
g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
struct bio *ignore, void *argp)
{
struct g_raid_softc *sc;
struct g_raid_lock *lp;
struct bio *bp;
sc = vol->v_softc;
lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
lp->l_offset = off;
lp->l_length = len;
lp->l_callback_arg = argp;
lp->l_pending = 0;
TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
lp->l_pending++;
}
/*
* If there are any writes that are pending, we return EBUSY. All
* callers will have to wait until all pending writes clear.
*/
if (lp->l_pending > 0) {
vol->v_pending_lock = 1;
G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
(intmax_t)off, (intmax_t)(off+len), lp->l_pending);
return (EBUSY);
}
G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
(intmax_t)off, (intmax_t)(off+len));
G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
return (0);
}
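/*
 * Note that EBUSY from g_raid_lock_range() is not a failure: the lock stays
 * queued on v_locks, and G_RAID_TR_LOCKED() is invoked later from
 * g_raid_finish_with_locked_ranges() once the overlapping in-flight
 * requests have drained.  Only when there is no conflicting I/O is the
 * callback made immediately and 0 returned.
 */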
int
g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
{
struct g_raid_lock *lp;
struct g_raid_softc *sc;
struct bio *bp;
sc = vol->v_softc;
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (lp->l_offset == off && lp->l_length == len) {
LIST_REMOVE(lp, l_next);
/* XXX
* Right now we just put them all back on the queue
* and hope for the best, since any requests that still
* fall within a locked range will simply be deferred
* again when the worker thread runs.
* XXX
*/
G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
(intmax_t)lp->l_offset,
(intmax_t)(lp->l_offset+lp->l_length));
mtx_lock(&sc->sc_queue_mtx);
while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
free(lp, M_RAID);
return (0);
}
}
return (EINVAL);
}
void
g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
{
struct g_consumer *cp;
struct g_raid_disk *disk, *tdisk;
bp->bio_caller1 = sd;
/*
* Make sure that the disk is present.  It is generally the job of the
* transformation layers not to send requests to absent disks, but it
* is better to be safe and report the situation than sorry.
*/
if (sd->sd_disk == NULL) {
G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
nodisk:
bp->bio_from = NULL;
bp->bio_to = NULL;
bp->bio_error = ENXIO;
g_raid_disk_done(bp);
return;
}
disk = sd->sd_disk;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_FAILED) {
G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
"wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
goto nodisk;
}
cp = disk->d_consumer;
bp->bio_from = cp;
bp->bio_to = cp->provider;
cp->index++;
/* Update the average disk loads. */
TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
if (tdisk->d_consumer == NULL)
tdisk->d_load = 0;
else
tdisk->d_load = (tdisk->d_consumer->index *
G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
}
disk->d_last_offset = bp->bio_offset + bp->bio_length;
if (dumping) {
G_RAID_LOGREQ(3, bp, "Sending dumping request.");
if (bp->bio_cmd == BIO_WRITE) {
bp->bio_error = g_raid_subdisk_kerneldump(sd,
bp->bio_data, 0, bp->bio_offset, bp->bio_length);
} else
bp->bio_error = EOPNOTSUPP;
g_raid_disk_done(bp);
} else {
bp->bio_done = g_raid_disk_done;
bp->bio_offset += sd->sd_offset;
G_RAID_LOGREQ(3, bp, "Sending request.");
g_io_request(bp, cp);
}
}
int
g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
if (sd->sd_disk == NULL)
return (ENXIO);
if (sd->sd_disk->d_kd.di.dumper == NULL)
return (EOPNOTSUPP);
return (dump_write(&sd->sd_disk->d_kd.di,
virtual, physical,
sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
length));
}
static void
g_raid_disk_done(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
sd = bp->bio_caller1;
sc = sd->sd_softc;
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
if (!dumping)
wakeup(sc);
}
static void
g_raid_disk_done_request(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct g_raid_subdisk *sd;
struct g_raid_volume *vol;
g_topology_assert_not();
G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
sd = bp->bio_caller1;
sc = sd->sd_softc;
vol = sd->sd_volume;
if (bp->bio_from != NULL) {
bp->bio_from->index--;
disk = bp->bio_from->private;
if (disk == NULL)
g_raid_kill_consumer(sc, bp->bio_from);
}
bp->bio_offset -= sd->sd_offset;
G_RAID_TR_IODONE(vol->v_tr, sd, bp);
}
static void
g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
{
if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
else
ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
KASSERT(ep->e_error == 0,
("Error cannot be handled."));
g_raid_event_free(ep);
} else {
ep->e_flags |= G_RAID_EVENT_DONE;
G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
mtx_lock(&sc->sc_queue_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_queue_mtx);
}
}
/*
* Worker thread.
*/
static void
g_raid_worker(void *arg)
{
struct g_raid_softc *sc;
struct g_raid_event *ep;
struct g_raid_volume *vol;
struct bio *bp;
struct timeval now, t;
int timeout, rv;
sc = arg;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
sx_xlock(&sc->sc_lock);
for (;;) {
mtx_lock(&sc->sc_queue_mtx);
/*
* First take a look at events.
* It is important to handle events before any I/O requests.
*/
bp = NULL;
vol = NULL;
rv = 0;
ep = TAILQ_FIRST(&sc->sc_events);
if (ep != NULL)
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
;
else {
getmicrouptime(&now);
t = now;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (bioq_first(&vol->v_inflight) == NULL &&
vol->v_tr &&
timevalcmp(&vol->v_last_done, &t, < ))
t = vol->v_last_done;
}
timevalsub(&t, &now);
timeout = g_raid_idle_threshold +
t.tv_sec * 1000000 + t.tv_usec;
if (timeout > 0) {
/*
* Two steps to avoid overflows at HZ=1000
* and idle timeouts > 2.1s. Some rounding
* errors can occur, but they are < 1 tick,
* which is deemed to be close enough for
* this purpose.
*/
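/*
 * For example, at hz = 1000 micpertic is 1000, so a 3,000,000us
 * timeout becomes (3000000 + 999) / 1000 = 3000 ticks, presumably
 * avoiding the int overflow that a direct timeout * hz / 1000000
 * conversion would hit once timeout * hz exceeds INT_MAX
 * (roughly 2.1s worth of microseconds at hz = 1000).
 */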
int micpertic = 1000000 / hz;
timeout = (timeout + micpertic - 1) / micpertic;
sx_xunlock(&sc->sc_lock);
MSLEEP(rv, sc, &sc->sc_queue_mtx,
PRIBIO | PDROP, "-", timeout);
sx_xlock(&sc->sc_lock);
goto process;
} else
rv = EWOULDBLOCK;
}
mtx_unlock(&sc->sc_queue_mtx);
process:
if (ep != NULL) {
g_raid_handle_event(sc, ep);
} else if (bp != NULL) {
if (bp->bio_to != NULL &&
bp->bio_to->geom == sc->sc_geom)
g_raid_start_request(bp);
else
g_raid_disk_done_request(bp);
} else if (rv == EWOULDBLOCK) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
g_raid_clean(vol, -1);
if (bioq_first(&vol->v_inflight) == NULL &&
vol->v_tr) {
t.tv_sec = g_raid_idle_threshold / 1000000;
t.tv_usec = g_raid_idle_threshold % 1000000;
timevaladd(&t, &vol->v_last_done);
getmicrouptime(&now);
if (timevalcmp(&t, &now, <= )) {
G_RAID_TR_IDLE(vol->v_tr);
vol->v_last_done = now;
}
}
}
}
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
g_raid_destroy_node(sc, 1); /* May not return. */
}
}
static void
g_raid_poll(struct g_raid_softc *sc)
{
struct g_raid_event *ep;
struct bio *bp;
sx_xlock(&sc->sc_lock);
mtx_lock(&sc->sc_queue_mtx);
/*
* First take a look at events.
* It is important to handle events before any I/O requests.
*/
ep = TAILQ_FIRST(&sc->sc_events);
if (ep != NULL) {
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_queue_mtx);
g_raid_handle_event(sc, ep);
goto out;
}
bp = bioq_takefirst(&sc->sc_queue);
if (bp != NULL) {
mtx_unlock(&sc->sc_queue_mtx);
if (bp->bio_from == NULL ||
bp->bio_from->geom != sc->sc_geom)
g_raid_start_request(bp);
else
g_raid_disk_done_request(bp);
}
out:
sx_xunlock(&sc->sc_lock);
}
static void
g_raid_launch_provider(struct g_raid_volume *vol)
{
struct g_raid_disk *disk;
struct g_raid_subdisk *sd;
struct g_raid_softc *sc;
struct g_provider *pp;
char name[G_RAID_MAX_VOLUMENAME];
off_t off;
int i;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
g_topology_lock();
/* Try to name provider with volume name. */
snprintf(name, sizeof(name), "raid/%s", vol->v_name);
if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
g_provider_by_name(name) != NULL) {
/* Otherwise use sequential volume number. */
snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
}
pp = g_new_providerf(sc->sc_geom, "%s", name);
pp->flags |= G_PF_DIRECT_RECEIVE;
if (vol->v_tr->tro_class->trc_accept_unmapped) {
pp->flags |= G_PF_ACCEPT_UNMAPPED;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
continue;
if ((sd->sd_disk->d_consumer->provider->flags &
G_PF_ACCEPT_UNMAPPED) == 0)
pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
}
}
pp->private = vol;
pp->mediasize = vol->v_mediasize;
pp->sectorsize = vol->v_sectorsize;
pp->stripesize = 0;
pp->stripeoffset = 0;
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL) {
pp->stripesize = disk->d_consumer->provider->stripesize;
off = disk->d_consumer->provider->stripeoffset;
pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
if (off > 0)
pp->stripeoffset %= off;
}
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
pp->stripesize *= (vol->v_disks_count - 1);
pp->stripeoffset *= (vol->v_disks_count - 1);
}
} else
pp->stripesize = vol->v_strip_size;
vol->v_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
pp->name, vol->v_name);
}
static void
g_raid_destroy_provider(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_provider *pp;
struct bio *bp, *tmp;
g_topology_assert_not();
sc = vol->v_softc;
pp = vol->v_provider;
KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
g_topology_lock();
g_error_provider(pp, ENXIO);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
if (bp->bio_to != pp)
continue;
bioq_remove(&sc->sc_queue, bp);
g_io_deliver(bp, ENXIO);
}
mtx_unlock(&sc->sc_queue_mtx);
G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
pp->name, vol->v_name);
g_wither_provider(pp, ENXIO);
g_topology_unlock();
vol->v_provider = NULL;
}
/*
* Update volume state.
*/
static int
g_raid_update_volume(struct g_raid_volume *vol, u_int event)
{
struct g_raid_softc *sc;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
g_raid_volume_event2str(event),
vol->v_name);
switch (event) {
case G_RAID_VOLUME_E_DOWN:
if (vol->v_provider != NULL)
g_raid_destroy_provider(vol);
break;
case G_RAID_VOLUME_E_UP:
if (vol->v_provider == NULL)
g_raid_launch_provider(vol);
break;
case G_RAID_VOLUME_E_START:
if (vol->v_tr)
G_RAID_TR_START(vol->v_tr);
return (0);
default:
if (sc->sc_md)
G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
return (0);
}
/* Manage root mount release. */
if (vol->v_starting) {
vol->v_starting = 0;
G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
root_mount_rel(vol->v_rootmount);
vol->v_rootmount = NULL;
}
if (vol->v_stopping && vol->v_provider_open == 0)
g_raid_destroy_volume(vol);
return (0);
}
/*
* Update subdisk state.
*/
static int
g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = sd->sd_softc;
vol = sd->sd_volume;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
g_raid_subdisk_event2str(event),
vol->v_name, sd->sd_pos,
sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
if (vol->v_tr)
G_RAID_TR_EVENT(vol->v_tr, sd, event);
return (0);
}
/*
* Update disk state.
*/
static int
g_raid_update_disk(struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
g_raid_disk_event2str(event),
g_raid_get_diskname(disk));
if (sc->sc_md)
G_RAID_MD_EVENT(sc->sc_md, disk, event);
return (0);
}
/*
* Node event.
*/
static int
g_raid_update_node(struct g_raid_softc *sc, u_int event)
{
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for the array.",
g_raid_node_event2str(event));
if (event == G_RAID_NODE_E_WAKE)
return (0);
if (sc->sc_md)
G_RAID_MD_EVENT(sc->sc_md, NULL, event);
return (0);
}
static int
g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
int dcw, opens, error = 0;
g_topology_assert();
sc = pp->geom->softc;
vol = pp->private;
KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
acr, acw, ace);
dcw = pp->acw + acw;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
/* Deny new opens while dying. */
if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
error = ENXIO;
goto out;
}
/* Deny write opens for read-only volumes. */
if (vol->v_read_only && acw > 0) {
error = EROFS;
goto out;
}
if (dcw == 0)
g_raid_clean(vol, dcw);
vol->v_provider_open += acr + acw + ace;
/* Handle delayed node destruction. */
if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
vol->v_provider_open == 0) {
/* Count open volumes. */
opens = g_raid_nopens(sc);
if (opens == 0) {
sc->sc_stopping = G_RAID_DESTROY_HARD;
/* Wake up worker to make it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
}
/* Handle open volume destruction. */
if (vol->v_stopping && vol->v_provider_open == 0)
g_raid_destroy_volume(vol);
out:
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
struct g_raid_softc *
g_raid_create_node(struct g_class *mp,
const char *name, struct g_raid_md_object *md)
{
struct g_raid_softc *sc;
struct g_geom *gp;
int error;
g_topology_assert();
G_RAID_DEBUG(1, "Creating array %s.", name);
gp = g_new_geomf(mp, "%s", name);
sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
gp->start = g_raid_start;
gp->orphan = g_raid_orphan;
gp->access = g_raid_access;
gp->dumpconf = g_raid_dumpconf;
sc->sc_md = md;
sc->sc_geom = gp;
sc->sc_flags = 0;
TAILQ_INIT(&sc->sc_volumes);
TAILQ_INIT(&sc->sc_disks);
sx_init(&sc->sc_lock, "graid:lock");
mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
TAILQ_INIT(&sc->sc_events);
bioq_init(&sc->sc_queue);
gp->softc = sc;
error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
"g_raid %s", name);
if (error != 0) {
G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
mtx_destroy(&sc->sc_queue_mtx);
sx_destroy(&sc->sc_lock);
g_destroy_geom(sc->sc_geom);
free(sc, M_RAID);
return (NULL);
}
G_RAID_DEBUG1(0, sc, "Array %s created.", name);
return (sc);
}
struct g_raid_volume *
g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
{
struct g_raid_volume *vol, *vol1;
int i;
G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
vol->v_softc = sc;
strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
vol->v_state = G_RAID_VOLUME_S_STARTING;
vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
vol->v_rotate_parity = 1;
bioq_init(&vol->v_inflight);
bioq_init(&vol->v_locked);
LIST_INIT(&vol->v_locks);
for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
vol->v_subdisks[i].sd_softc = sc;
vol->v_subdisks[i].sd_volume = vol;
vol->v_subdisks[i].sd_pos = i;
vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
}
/* Find free ID for this volume. */
g_topology_lock();
vol1 = vol;
if (id >= 0) {
LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
if (vol1->v_global_id == id)
break;
}
}
if (vol1 != NULL) {
for (id = 0; ; id++) {
LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
if (vol1->v_global_id == id)
break;
}
if (vol1 == NULL)
break;
}
}
vol->v_global_id = id;
LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
g_topology_unlock();
/* Delay root mounting. */
vol->v_rootmount = root_mount_hold("GRAID");
G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
vol->v_starting = 1;
TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
return (vol);
}
struct g_raid_disk *
g_raid_create_disk(struct g_raid_softc *sc)
{
struct g_raid_disk *disk;
G_RAID_DEBUG1(1, sc, "Creating disk.");
disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
disk->d_softc = sc;
disk->d_state = G_RAID_DISK_S_NONE;
TAILQ_INIT(&disk->d_subdisks);
TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
return (disk);
}
int
g_raid_start_volume(struct g_raid_volume *vol)
{
struct g_raid_tr_class *class;
struct g_raid_tr_object *obj;
int status;
G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
if (!class->trc_enable)
continue;
G_RAID_DEBUG1(2, vol->v_softc,
"Tasting volume %s for %s transformation.",
vol->v_name, class->name);
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->tro_class = class;
obj->tro_volume = vol;
status = G_RAID_TR_TASTE(obj, vol);
if (status != G_RAID_TR_TASTE_FAIL)
break;
kobj_delete((kobj_t)obj, M_RAID);
}
if (class == NULL) {
G_RAID_DEBUG1(0, vol->v_softc,
"No transformation module found for %s.",
vol->v_name);
vol->v_tr = NULL;
g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
G_RAID_EVENT_VOLUME);
return (-1);
}
G_RAID_DEBUG1(2, vol->v_softc,
"Transformation module %s chosen for %s.",
class->name, vol->v_name);
vol->v_tr = obj;
return (0);
}
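/*
 * Transformation classes are tasted in list order; g_raid_tr_modevent()
 * below keeps g_raid_tr_classes sorted in ascending trc_priority order,
 * so classes with lower priority values get the first chance to claim
 * the volume.
 */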
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
struct g_raid_volume *vol, *tmpv;
struct g_raid_disk *disk, *tmpd;
int error = 0;
sc->sc_stopping = G_RAID_DESTROY_HARD;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
if (g_raid_destroy_volume(vol))
error = EBUSY;
}
if (error)
return (error);
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
if (g_raid_destroy_disk(disk))
error = EBUSY;
}
if (error)
return (error);
if (sc->sc_md) {
G_RAID_MD_FREE(sc->sc_md);
kobj_delete((kobj_t)sc->sc_md, M_RAID);
sc->sc_md = NULL;
}
if (sc->sc_geom != NULL) {
G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
g_topology_lock();
sc->sc_geom->softc = NULL;
g_wither_geom(sc->sc_geom, ENXIO);
g_topology_unlock();
sc->sc_geom = NULL;
} else
G_RAID_DEBUG(1, "Array destroyed.");
if (worker) {
g_raid_event_cancel(sc, sc);
mtx_destroy(&sc->sc_queue_mtx);
sx_xunlock(&sc->sc_lock);
sx_destroy(&sc->sc_lock);
wakeup(&sc->sc_stopping);
free(sc, M_RAID);
curthread->td_pflags &= ~TDP_GEOM;
G_RAID_DEBUG(1, "Thread exiting.");
kproc_exit(0);
} else {
/* Wake up worker to make it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
return (0);
}
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_disk *disk;
int i;
sc = vol->v_softc;
G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
vol->v_stopping = 1;
if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
if (vol->v_tr) {
G_RAID_TR_STOP(vol->v_tr);
return (EBUSY);
} else
vol->v_state = G_RAID_VOLUME_S_STOPPED;
}
if (g_raid_event_check(sc, vol) != 0)
return (EBUSY);
if (vol->v_provider != NULL)
return (EBUSY);
if (vol->v_provider_open != 0)
return (EBUSY);
if (vol->v_tr) {
G_RAID_TR_FREE(vol->v_tr);
kobj_delete((kobj_t)vol->v_tr, M_RAID);
vol->v_tr = NULL;
}
if (vol->v_rootmount)
root_mount_rel(vol->v_rootmount);
g_topology_lock();
LIST_REMOVE(vol, v_global_next);
g_topology_unlock();
TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
g_raid_event_cancel(sc, &vol->v_subdisks[i]);
disk = vol->v_subdisks[i].sd_disk;
if (disk == NULL)
continue;
TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
}
G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
if (sc->sc_md)
G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
g_raid_event_cancel(sc, vol);
free(vol, M_RAID);
if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
/* Wake up worker to let it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
return (0);
}
int
g_raid_destroy_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd, *tmp;
sc = disk->d_softc;
G_RAID_DEBUG1(2, sc, "Destroying disk.");
if (disk->d_consumer) {
g_raid_kill_consumer(sc, disk->d_consumer);
disk->d_consumer = NULL;
}
TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
G_RAID_EVENT_SUBDISK);
TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
sd->sd_disk = NULL;
}
TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
if (sc->sc_md)
G_RAID_MD_FREE_DISK(sc->sc_md, disk);
g_raid_event_cancel(sc, disk);
free(disk, M_RAID);
return (0);
}
int
g_raid_destroy(struct g_raid_softc *sc, int how)
{
int error, opens;
g_topology_assert_not();
if (sc == NULL)
return (ENXIO);
sx_assert(&sc->sc_lock, SX_XLOCKED);
/* Count open volumes. */
opens = g_raid_nopens(sc);
/* React on some opened volumes. */
if (opens > 0) {
switch (how) {
case G_RAID_DESTROY_SOFT:
G_RAID_DEBUG1(1, sc,
"%d volumes are still open.",
opens);
sx_xunlock(&sc->sc_lock);
return (EBUSY);
case G_RAID_DESTROY_DELAYED:
G_RAID_DEBUG1(1, sc,
"Array will be destroyed on last close.");
sc->sc_stopping = G_RAID_DESTROY_DELAYED;
sx_xunlock(&sc->sc_lock);
return (EBUSY);
case G_RAID_DESTROY_HARD:
G_RAID_DEBUG1(1, sc,
"%d volumes are still open.",
opens);
}
}
/* Mark node for destruction. */
sc->sc_stopping = G_RAID_DESTROY_HARD;
/* Wake up worker to let it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
/* Sleep until node destroyed. */
error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
PRIBIO | PDROP, "r:destroy", hz * 3);
return (error == EWOULDBLOCK ? EBUSY : 0);
}
static void
g_raid_taste_orphan(struct g_consumer *cp)
{
KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
cp->provider->name));
}
static struct g_geom *
g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_consumer *cp;
struct g_geom *gp, *geom;
struct g_raid_md_class *class;
struct g_raid_md_object *obj;
int status;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
if (!g_raid_enable)
return (NULL);
G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
geom = NULL;
status = G_RAID_MD_TASTE_FAIL;
gp = g_new_geomf(mp, "raid:taste");
/*
* This orphan function should never be called.
*/
gp->orphan = g_raid_taste_orphan;
cp = g_new_consumer(gp);
cp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
if (g_access(cp, 1, 0, 0) != 0)
goto ofail;
LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
if (!class->mdc_enable)
continue;
G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
pp->name, class->name);
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->mdo_class = class;
status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
if (status != G_RAID_MD_TASTE_NEW)
kobj_delete((kobj_t)obj, M_RAID);
if (status != G_RAID_MD_TASTE_FAIL)
break;
}
if (status == G_RAID_MD_TASTE_FAIL)
(void)g_access(cp, -1, 0, 0);
ofail:
g_detach(cp);
g_destroy_consumer(cp);
g_destroy_geom(gp);
G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
return (geom);
}
int
g_raid_create_node_format(const char *format, struct gctl_req *req,
struct g_geom **gp)
{
struct g_raid_md_class *class;
struct g_raid_md_object *obj;
int status;
G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
if (strcasecmp(class->name, format) == 0)
break;
}
if (class == NULL) {
G_RAID_DEBUG(1, "No support for %s metadata.", format);
return (G_RAID_MD_TASTE_FAIL);
}
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->mdo_class = class;
status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
if (status != G_RAID_MD_TASTE_NEW)
kobj_delete((kobj_t)obj, M_RAID);
return (status);
}
static int
g_raid_destroy_geom(struct gctl_req *req __unused,
struct g_class *mp __unused, struct g_geom *gp)
{
struct g_raid_softc *sc;
int error;
g_topology_unlock();
sc = gp->softc;
sx_xlock(&sc->sc_lock);
g_cancel_event(sc);
error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
g_topology_lock();
return (error);
}
void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return;
if (sc->sc_md)
G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
}
void
g_raid_fail_disk(struct g_raid_softc *sc,
struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{
if (disk == NULL)
disk = sd->sd_disk;
if (disk == NULL) {
G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
return;
}
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
"wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
return;
}
if (sc->sc_md)
G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
}
static void
g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
int i, s;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return;
if (pp != NULL) {
vol = pp->private;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
sc->sc_md->mdo_class->name,
g_raid_volume_level2str(vol->v_raid_level,
vol->v_raid_level_qualifier));
sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
vol->v_name);
sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
g_raid_volume_level2str(vol->v_raid_level,
vol->v_raid_level_qualifier));
sbuf_printf(sb,
"%s<Transformation>%s</Transformation>\n", indent,
vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
vol->v_disks_count);
sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
vol->v_strip_size);
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid_volume_state2str(vol->v_state));
sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
vol->v_dirty ? "Yes" : "No");
sbuf_printf(sb, "%s<Subdisks>", indent);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk != NULL &&
sd->sd_disk->d_consumer != NULL) {
sbuf_printf(sb, "%s ",
g_raid_get_diskname(sd->sd_disk));
} else {
sbuf_printf(sb, "NONE ");
}
sbuf_printf(sb, "(%s",
g_raid_subdisk_state2str(sd->sd_state));
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
sbuf_printf(sb, " %d%%",
(int)(sd->sd_rebuild_pos * 100 /
sd->sd_size));
}
sbuf_printf(sb, ")");
if (i + 1 < vol->v_disks_count)
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, "</Subdisks>\n");
sx_xunlock(&sc->sc_lock);
g_topology_lock();
} else if (cp != NULL) {
disk = cp->private;
if (disk == NULL)
return;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sbuf_printf(sb, "%s<State>%s", indent,
g_raid_disk_state2str(disk->d_state));
if (!TAILQ_EMPTY(&disk->d_subdisks)) {
sbuf_printf(sb, " (");
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
sbuf_printf(sb, "%s",
g_raid_subdisk_state2str(sd->sd_state));
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
sbuf_printf(sb, " %d%%",
(int)(sd->sd_rebuild_pos * 100 /
sd->sd_size));
}
if (TAILQ_NEXT(sd, sd_next))
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, ")");
}
sbuf_printf(sb, "</State>\n");
sbuf_printf(sb, "%s<Subdisks>", indent);
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
sbuf_printf(sb, "r%d(%s):%d@%ju",
sd->sd_volume->v_global_id,
sd->sd_volume->v_name,
sd->sd_pos, sd->sd_offset);
if (TAILQ_NEXT(sd, sd_next))
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, "</Subdisks>\n");
sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
disk->d_read_errs);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
} else {
g_topology_unlock();
sx_xlock(&sc->sc_lock);
if (sc->sc_md) {
sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
sc->sc_md->mdo_class->name);
}
if (!TAILQ_EMPTY(&sc->sc_volumes)) {
s = 0xff;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_state < s)
s = vol->v_state;
}
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid_volume_state2str(s));
}
sx_xunlock(&sc->sc_lock);
g_topology_lock();
}
}
static void
g_raid_shutdown_post_sync(void *arg, int howto)
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_raid_softc *sc;
struct g_raid_volume *vol;
mp = arg;
g_topology_lock();
g_raid_shutdown = 1;
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
if ((sc = gp->softc) == NULL)
continue;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
g_raid_clean(vol, -1);
g_cancel_event(sc);
g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
g_topology_lock();
}
g_topology_unlock();
}
static void
g_raid_init(struct g_class *mp)
{
g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
if (g_raid_post_sync == NULL)
G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
g_raid_started = 1;
}
static void
g_raid_fini(struct g_class *mp)
{
if (g_raid_post_sync != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
g_raid_started = 0;
}
int
g_raid_md_modevent(module_t mod, int type, void *arg)
{
struct g_raid_md_class *class, *c, *nc;
int error;
error = 0;
class = arg;
switch (type) {
case MOD_LOAD:
c = LIST_FIRST(&g_raid_md_classes);
if (c == NULL || c->mdc_priority > class->mdc_priority)
LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
else {
while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
nc->mdc_priority < class->mdc_priority)
c = nc;
LIST_INSERT_AFTER(c, class, mdc_list);
}
if (g_raid_started)
g_retaste(&g_raid_class);
break;
case MOD_UNLOAD:
LIST_REMOVE(class, mdc_list);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
int
g_raid_tr_modevent(module_t mod, int type, void *arg)
{
struct g_raid_tr_class *class, *c, *nc;
int error;
error = 0;
class = arg;
switch (type) {
case MOD_LOAD:
c = LIST_FIRST(&g_raid_tr_classes);
if (c == NULL || c->trc_priority > class->trc_priority)
LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
else {
while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
nc->trc_priority < class->trc_priority)
c = nc;
LIST_INSERT_AFTER(c, class, trc_list);
}
break;
case MOD_UNLOAD:
LIST_REMOVE(class, trc_list);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
* to reduce module priority, allowing submodules to register themselves first.
*/
static moduledata_t g_raid_mod = {
"g_raid",
g_modevent,
&g_raid_class
};
DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
MODULE_VERSION(geom_raid, 0);
Index: head/sys/geom/raid/md_ddf.c
===================================================================
--- head/sys/geom/raid/md_ddf.c (revision 327172)
+++ head/sys/geom/raid/md_ddf.c (revision 327173)
@@ -1,3097 +1,3087 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "geom/raid/md_ddf.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata");
#define DDF_MAX_DISKS_HARD 128
#define DDF_MAX_DISKS 16
#define DDF_MAX_VDISKS 7
#define DDF_MAX_PARTITIONS 1
#define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */
struct ddf_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_pd_record *pdr;
struct ddf_vd_record *vdr;
void *cr;
struct ddf_pdd_record *pdd;
struct ddf_bbm_log *bbm;
};
struct ddf_vol_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD];
};
struct g_raid_md_ddf_perdisk {
struct ddf_meta pd_meta;
};
struct g_raid_md_ddf_pervolume {
struct ddf_vol_meta pv_meta;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
struct g_raid_md_ddf_object {
struct g_raid_md_object mdio_base;
u_int mdio_bigendian;
struct ddf_meta mdio_meta;
int mdio_starting;
struct callout mdio_start_co; /* STARTING state timer. */
int mdio_started;
struct root_hold_token *mdio_rootmount; /* Root mount delay token. */
};
static g_raid_md_create_req_t g_raid_md_create_req_ddf;
static g_raid_md_taste_t g_raid_md_taste_ddf;
static g_raid_md_event_t g_raid_md_event_ddf;
static g_raid_md_volume_event_t g_raid_md_volume_event_ddf;
static g_raid_md_ctl_t g_raid_md_ctl_ddf;
static g_raid_md_write_t g_raid_md_write_ddf;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf;
static g_raid_md_free_disk_t g_raid_md_free_disk_ddf;
static g_raid_md_free_volume_t g_raid_md_free_volume_ddf;
static g_raid_md_free_t g_raid_md_free_ddf;
static kobj_method_t g_raid_md_ddf_methods[] = {
KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_ddf_class = {
"DDF",
g_raid_md_ddf_methods,
sizeof(struct g_raid_md_ddf_object),
.mdc_enable = 1,
.mdc_priority = 100
};
#define GET8(m, f) ((m)->f)
#define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f))
#define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f))
#define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f))
#define GET8D(m, f) (f)
#define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f))
#define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f))
#define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f))
#define GET8P(m, f) (*(f))
#define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f))
#define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f))
#define GET64P(m, f) ((m)->bigendian ? be64dec(f) : le64dec(f))
#define SET8P(m, f, v) \
(*(f) = (v))
#define SET16P(m, f, v) \
do { \
if ((m)->bigendian) \
be16enc((f), (v)); \
else \
le16enc((f), (v)); \
} while (0)
#define SET32P(m, f, v) \
do { \
if ((m)->bigendian) \
be32enc((f), (v)); \
else \
le32enc((f), (v)); \
} while (0)
#define SET64P(m, f, v) \
do { \
if ((m)->bigendian) \
be64enc((f), (v)); \
else \
le64enc((f), (v)); \
} while (0)
#define SET8(m, f, v) SET8P((m), &((m)->f), (v))
#define SET16(m, f, v) SET16P((m), &((m)->f), (v))
#define SET32(m, f, v) SET32P((m), &((m)->f), (v))
#define SET64(m, f, v) SET64P((m), &((m)->f), (v))
#define SET8D(m, f, v) SET8P((m), &(f), (v))
#define SET16D(m, f, v) SET16P((m), &(f), (v))
#define SET32D(m, f, v) SET32P((m), &(f), (v))
#define SET64D(m, f, v) SET64P((m), &(f), (v))
#define GETCRNUM(m) (GET32((m), hdr->cr_length) / \
GET16((m), hdr->Configuration_Record_Length))
#define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
#define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
static int
isff(uint8_t *buf, int size)
{
int i;
for (i = 0; i < size; i++)
if (buf[i] != 0xff)
return (0);
return (1);
}
static void
print_guid(uint8_t *buf)
{
int i, ascii;
ascii = 1;
for (i = 0; i < 24; i++) {
if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) {
ascii = 0;
break;
}
}
if (ascii) {
printf("'%.24s'", buf);
} else {
for (i = 0; i < 24; i++)
printf("%02x", buf[i]);
}
}
static void
g_raid_md_ddf_print(struct ddf_meta *meta)
{
struct ddf_vdc_record *vdc;
struct ddf_vuc_record *vuc;
struct ddf_sa_record *sa;
uint64_t *val2;
uint32_t val;
int i, j, k, num, num2;
if (g_raid_debug < 1)
return;
printf("********* DDF Metadata *********\n");
printf("**** Header ****\n");
printf("DDF_Header_GUID ");
print_guid(meta->hdr->DDF_Header_GUID);
printf("\n");
printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]);
printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number));
printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp));
printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag));
printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag));
printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping));
printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA));
printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA));
printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length));
printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA));
printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries));
printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries));
printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions));
printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length));
printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries));
printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length));
printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length));
printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length));
printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length));
printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length));
printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length));
printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length));
printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length));
printf("**** Controller Data ****\n");
printf("Controller_GUID ");
print_guid(meta->cdr->Controller_GUID);
printf("\n");
printf("Controller_Type 0x%04x%04x 0x%04x%04x\n",
GET16(meta, cdr->Controller_Type.Vendor_ID),
GET16(meta, cdr->Controller_Type.Device_ID),
GET16(meta, cdr->Controller_Type.SubVendor_ID),
GET16(meta, cdr->Controller_Type.SubDevice_ID));
printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]);
printf("**** Physical Disk Records ****\n");
printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs));
printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported));
for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) {
if (isff(meta->pdr->entry[j].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff)
continue;
printf("PD_GUID ");
print_guid(meta->pdr->entry[j].PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdr->entry[j].PD_Reference));
printf("PD_Type 0x%04x\n",
GET16(meta, pdr->entry[j].PD_Type));
printf("PD_State 0x%04x\n",
GET16(meta, pdr->entry[j].PD_State));
printf("Configured_Size %ju\n",
GET64(meta, pdr->entry[j].Configured_Size));
printf("Block_Size %u\n",
GET16(meta, pdr->entry[j].Block_Size));
}
printf("**** Virtual Disk Records ****\n");
printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs));
printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported));
for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) {
if (isff(meta->vdr->entry[j].VD_GUID, 24))
continue;
printf("VD_GUID ");
print_guid(meta->vdr->entry[j].VD_GUID);
printf("\n");
printf("VD_Number 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Number));
printf("VD_Type 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Type));
printf("VD_State 0x%02x\n",
GET8(meta, vdr->entry[j].VD_State));
printf("Init_State 0x%02x\n",
GET8(meta, vdr->entry[j].Init_State));
printf("Drive_Failures_Remaining %u\n",
GET8(meta, vdr->entry[j].Drive_Failures_Remaining));
printf("VD_Name '%.16s'\n",
(char *)&meta->vdr->entry[j].VD_Name);
}
printf("**** Configuration Records ****\n");
num = GETCRNUM(meta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(meta, j);
val = GET32D(meta, vdc->Signature);
switch (val) {
case DDF_VDCR_SIGNATURE:
printf("** Virtual Disk Configuration **\n");
printf("VD_GUID ");
print_guid(vdc->VD_GUID);
printf("\n");
printf("Timestamp 0x%08x\n",
GET32D(meta, vdc->Timestamp));
printf("Sequence_Number 0x%08x\n",
GET32D(meta, vdc->Sequence_Number));
printf("Primary_Element_Count %u\n",
GET16D(meta, vdc->Primary_Element_Count));
printf("Stripe_Size %u\n",
GET8D(meta, vdc->Stripe_Size));
printf("Primary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Primary_RAID_Level));
printf("RLQ 0x%02x\n",
GET8D(meta, vdc->RLQ));
printf("Secondary_Element_Count %u\n",
GET8D(meta, vdc->Secondary_Element_Count));
printf("Secondary_Element_Seq %u\n",
GET8D(meta, vdc->Secondary_Element_Seq));
printf("Secondary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Secondary_RAID_Level));
printf("Block_Count %ju\n",
GET64D(meta, vdc->Block_Count));
printf("VD_Size %ju\n",
GET64D(meta, vdc->VD_Size));
printf("Block_Size %u\n",
GET16D(meta, vdc->Block_Size));
printf("Rotate_Parity_count %u\n",
GET8D(meta, vdc->Rotate_Parity_count));
printf("Associated_Spare_Disks");
for (i = 0; i < 8; i++) {
if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff)
printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i]));
}
printf("\n");
printf("Cache_Flags %016jx\n",
GET64D(meta, vdc->Cache_Flags));
printf("BG_Rate %u\n",
GET8D(meta, vdc->BG_Rate));
printf("MDF_Parity_Disks %u\n",
GET8D(meta, vdc->MDF_Parity_Disks));
printf("MDF_Parity_Generator_Polynomial 0x%04x\n",
GET16D(meta, vdc->MDF_Parity_Generator_Polynomial));
printf("MDF_Constant_Generation_Method 0x%02x\n",
GET8D(meta, vdc->MDF_Constant_Generation_Method));
printf("Physical_Disks ");
num2 = GET16D(meta, vdc->Primary_Element_Count);
val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]);
for (i = 0; i < num2; i++)
printf(" 0x%08x @ %ju",
GET32D(meta, vdc->Physical_Disk_Sequence[i]),
GET64P(meta, val2 + i));
printf("\n");
break;
case DDF_VUCR_SIGNATURE:
printf("** Vendor Unique Configuration **\n");
vuc = (struct ddf_vuc_record *)vdc;
printf("VD_GUID ");
print_guid(vuc->VD_GUID);
printf("\n");
break;
case DDF_SA_SIGNATURE:
printf("** Spare Assignment Configuration **\n");
sa = (struct ddf_sa_record *)vdc;
printf("Timestamp 0x%08x\n",
GET32D(meta, sa->Timestamp));
printf("Spare_Type 0x%02x\n",
GET8D(meta, sa->Spare_Type));
printf("Populated_SAEs %u\n",
GET16D(meta, sa->Populated_SAEs));
printf("MAX_SAE_Supported %u\n",
GET16D(meta, sa->MAX_SAE_Supported));
for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) {
if (isff(sa->entry[i].VD_GUID, 24))
continue;
printf("VD_GUID ");
for (k = 0; k < 24; k++)
printf("%02x", sa->entry[i].VD_GUID[k]);
printf("\n");
printf("Secondary_Element %u\n",
GET16D(meta, sa->entry[i].Secondary_Element));
}
break;
case 0x00000000:
case 0xFFFFFFFF:
break;
default:
printf("Unknown configuration signature %08x\n", val);
break;
}
}
printf("**** Physical Disk Data ****\n");
printf("PD_GUID ");
print_guid(meta->pdd->PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdd->PD_Reference));
printf("Forced_Ref_Flag 0x%02x\n",
GET8(meta, pdd->Forced_Ref_Flag));
printf("Forced_PD_GUID_Flag 0x%02x\n",
GET8(meta, pdd->Forced_PD_GUID_Flag));
}
static int
ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference)
{
int i;
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0)
return (i);
} else if (PD_Reference != 0xffffffff) {
if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference)
return (i);
} else
if (isff(meta->pdr->entry[i].PD_GUID, 24))
return (i);
}
if (GUID == NULL && PD_Reference == 0xffffffff) {
if (i >= GET16(meta, pdr->Max_PDE_Supported))
return (-1);
SET16(meta, pdr->Populated_PDEs, i + 1);
return (i);
}
return (-1);
}
static int
ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID)
{
int i;
for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0)
return (i);
} else
if (isff(meta->vdr->entry[i].VD_GUID, 24))
return (i);
}
if (GUID == NULL) {
if (i >= GET16(meta, vdr->Max_VDE_Supported))
return (-1);
SET16(meta, vdr->Populated_VDEs, i + 1);
return (i);
}
return (-1);
}
static struct ddf_vdc_record *
ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GUID != NULL) {
if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE &&
memcmp(vdc->VD_GUID, GUID, 24) == 0)
return (vdc);
} else
if (GET32D(meta, vdc->Signature) == 0xffffffff ||
GET32D(meta, vdc->Signature) == 0)
return (vdc);
}
return (NULL);
}
static int
ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num, cnt;
cnt = 0;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0)
cnt++;
}
return (cnt);
}
static int
ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference,
int *bvdp, int *posp)
{
int i, bvd, pos;
i = 0;
for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) {
if (vmeta->bvdc[bvd] == NULL) {
i += GET16(vmeta, vdc->Primary_Element_Count); // XXX
continue;
}
for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count);
pos++, i++) {
if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) ==
PD_Reference) {
if (bvdp != NULL)
*bvdp = bvd;
if (posp != NULL)
*posp = pos;
return (i);
}
}
}
return (-1);
}
static struct ddf_sa_record *
ddf_meta_find_sa(struct ddf_meta *meta, int create)
{
struct ddf_sa_record *sa;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE)
return (sa);
}
if (create) {
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == 0xffffffff ||
GET32D(meta, sa->Signature) == 0)
return (sa);
}
}
return (NULL);
}
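/*
* Initialize fresh per-disk DDF metadata.  Section sizes are either scaled
* from the provided sample (to cope with a differing sector size) or built
* from scratch with default limits; the sections are then laid out back to
* back after the header, and the anchor is placed at the last LBA of the
* provider.
*/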
static void
ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *meta;
struct ddf_pd_entry *pde;
off_t anchorlba;
u_int ss, pos, size;
int len, error;
char serial_buffer[24];
if (sample->hdr == NULL)
sample = NULL;
mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
ss = disk->d_consumer->provider->sectorsize;
anchorlba = disk->d_consumer->provider->mediasize / ss - 1;
meta->sectorsize = ss;
meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian;
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
/* Header */
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memset(meta->hdr, 0xff, ss);
if (sample) {
memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header));
if (ss != sample->sectorsize) {
SET32(meta, hdr->WorkSpace_Length,
howmany(GET32(sample, hdr->WorkSpace_Length) *
sample->sectorsize, ss));
SET16(meta, hdr->Configuration_Record_Length,
howmany(GET16(sample,
hdr->Configuration_Record_Length) *
sample->sectorsize, ss));
SET32(meta, hdr->cd_length,
howmany(GET32(sample, hdr->cd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdr_length,
howmany(GET32(sample, hdr->pdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->vdr_length,
howmany(GET32(sample, hdr->vdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->cr_length,
howmany(GET32(sample, hdr->cr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdd_length,
howmany(GET32(sample, hdr->pdd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->bbmlog_length,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Diagnostic_Space,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Vendor_Specific_Logs,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
}
} else {
SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE);
snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x",
(u_int)(ts.tv_sec - DECADE), arc4random());
memcpy(meta->hdr->DDF_rev, "02.00.00", 8);
SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE));
SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss);
SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1);
SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS);
SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS);
SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS);
SET16(meta, hdr->Configuration_Record_Length,
howmany(sizeof(struct ddf_vdc_record) + (4 + 8) *
GET16(meta, hdr->Max_Primary_Element_Entries), ss));
SET32(meta, hdr->cd_length,
howmany(sizeof(struct ddf_cd_record), ss));
SET32(meta, hdr->pdr_length,
howmany(sizeof(struct ddf_pd_record) +
sizeof(struct ddf_pd_entry) * GET16(meta,
hdr->Max_PD_Entries), ss));
SET32(meta, hdr->vdr_length,
howmany(sizeof(struct ddf_vd_record) +
sizeof(struct ddf_vd_entry) *
GET16(meta, hdr->Max_VD_Entries), ss));
SET32(meta, hdr->cr_length,
GET16(meta, hdr->Configuration_Record_Length) *
(GET16(meta, hdr->Max_Partitions) + 1));
SET32(meta, hdr->pdd_length,
howmany(sizeof(struct ddf_pdd_record), ss));
SET32(meta, hdr->bbmlog_length, 0);
SET32(meta, hdr->Diagnostic_Space_Length, 0);
SET32(meta, hdr->Vendor_Specific_Logs_Length, 0);
}
pos = 1;
SET32(meta, hdr->cd_section, pos);
pos += GET32(meta, hdr->cd_length);
SET32(meta, hdr->pdr_section, pos);
pos += GET32(meta, hdr->pdr_length);
SET32(meta, hdr->vdr_section, pos);
pos += GET32(meta, hdr->vdr_length);
SET32(meta, hdr->cr_section, pos);
pos += GET32(meta, hdr->cr_length);
SET32(meta, hdr->pdd_section, pos);
pos += GET32(meta, hdr->pdd_length);
SET32(meta, hdr->bbmlog_section,
GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->bbmlog_length);
SET32(meta, hdr->Diagnostic_Space,
GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->Diagnostic_Space_Length);
SET32(meta, hdr->Vendor_Specific_Logs,
GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff);
pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1);
SET64(meta, hdr->Primary_Header_LBA,
anchorlba - pos);
SET64(meta, hdr->Secondary_Header_LBA,
0xffffffffffffffffULL);
SET64(meta, hdr->WorkSpace_LBA,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
/* Controller Data */
size = GET32(meta, hdr->cd_length) * ss;
meta->cdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cdr, 0xff, size);
SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE);
memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24);
memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16);
/* Physical Drive Records. */
size = GET32(meta, hdr->pdr_length) * ss;
meta->pdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdr, 0xff, size);
SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE);
SET16(meta, pdr->Populated_PDEs, 1);
SET16(meta, pdr->Max_PDE_Supported,
GET16(meta, hdr->Max_PD_Entries));
pde = &meta->pdr->entry[0];
len = sizeof(serial_buffer);
error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer);
if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20)
snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer);
else
snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xffff);
SET32D(meta, pde->PD_Reference, arc4random());
SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE);
SET16D(meta, pde->PD_State, 0);
SET64D(meta, pde->Configured_Size,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
SET16D(meta, pde->Block_Size, ss);
/* Virtual Drive Records. */
size = GET32(meta, hdr->vdr_length) * ss;
meta->vdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdr, 0xff, size);
SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE);
SET32(meta, vdr->Populated_VDEs, 0);
SET16(meta, vdr->Max_VDE_Supported,
GET16(meta, hdr->Max_VD_Entries));
/* Configuration Records. */
size = GET32(meta, hdr->cr_length) * ss;
meta->cr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cr, 0xff, size);
/* Physical Disk Data. */
size = GET32(meta, hdr->pdd_length) * ss;
meta->pdd = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdd, 0xff, size);
SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE);
memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24);
SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference));
SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF);
SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID);
/* Bad Block Management Log. */
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
meta->bbm = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->bbm, 0xff, size);
SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE);
SET32(meta, bbm->Entry_Count, 0);
SET32(meta, bbm->Spare_Block_Count, 0);
}
}
static void
ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src)
{
- struct ddf_header *hdr;
u_int ss;
- hdr = src->hdr;
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss);
dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss);
dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss);
dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss);
if (src->bbm != NULL) {
dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss);
}
}
static void
ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src)
{
struct ddf_pd_entry *pde, *spde;
int i, j;
for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) {
spde = &src->pdr->entry[i];
if (isff(spde->PD_GUID, 24))
continue;
j = ddf_meta_find_pd(meta, NULL,
GET32(src, pdr->entry[i].PD_Reference));
if (j < 0) {
j = ddf_meta_find_pd(meta, NULL, 0xffffffff);
pde = &meta->pdr->entry[j];
memcpy(pde, spde, sizeof(*pde));
} else {
pde = &meta->pdr->entry[j];
SET16D(meta, pde->PD_State,
GET16D(meta, pde->PD_State) |
GET16D(src, pde->PD_State));
}
}
}
static void
ddf_meta_free(struct ddf_meta *meta)
{
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->pdr != NULL) {
free(meta->pdr, M_MD_DDF);
meta->pdr = NULL;
}
if (meta->vdr != NULL) {
free(meta->vdr, M_MD_DDF);
meta->vdr = NULL;
}
if (meta->cr != NULL) {
free(meta->cr, M_MD_DDF);
meta->cr = NULL;
}
if (meta->pdd != NULL) {
free(meta->pdd, M_MD_DDF);
meta->pdd = NULL;
}
if (meta->bbm != NULL) {
free(meta->bbm, M_MD_DDF);
meta->bbm = NULL;
}
}
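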
static void
ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
- struct ddf_header *hdr;
u_int ss, size;
- hdr = sample->hdr;
meta->bigendian = sample->bigendian;
ss = meta->sectorsize = sample->sectorsize;
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, sample->hdr, ss);
meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss);
meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry));
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xf);
size = GET16(sample, hdr->Configuration_Record_Length) * ss;
meta->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdc, 0xff, size);
SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE);
memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24);
SET32(meta, vdc->Sequence_Number, 0);
}
static void
ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src,
uint8_t *GUID, int started)
{
- struct ddf_header *hdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
int vnew, bvnew, bvd, size;
u_int ss;
- hdr = src->hdr;
vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)];
vdc = ddf_meta_find_vdc(src, GUID);
if (GET8D(src, vdc->Secondary_Element_Count) == 1)
bvd = 0;
else
bvd = GET8D(src, vdc->Secondary_Element_Seq);
size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize;
if (dst->vdc == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, vdc->Sequence_Number))) > 0))
vnew = 1;
else
vnew = 0;
if (dst->bvdc[bvd] == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, bvdc[bvd]->Sequence_Number))) > 0))
bvnew = 1;
else
bvnew = 0;
if (vnew) {
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
if (dst->hdr != NULL)
free(dst->hdr, M_MD_DDF);
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
if (dst->cdr != NULL)
free(dst->cdr, M_MD_DDF);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
if (dst->vde != NULL)
free(dst->vde, M_MD_DDF);
dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry));
if (dst->vdc != NULL)
free(dst->vdc, M_MD_DDF);
dst->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->vdc, vdc, size);
}
if (bvnew) {
if (dst->bvdc[bvd] != NULL)
free(dst->bvdc[bvd], M_MD_DDF);
dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->bvdc[bvd], vdc, size);
}
}
static void
ddf_vol_meta_free(struct ddf_vol_meta *meta)
{
int i;
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->vde != NULL) {
free(meta->vde, M_MD_DDF);
meta->vde = NULL;
}
if (meta->vdc != NULL) {
free(meta->vdc, M_MD_DDF);
meta->vdc = NULL;
}
for (i = 0; i < DDF_MAX_DISKS_HARD; i++) {
if (meta->bvdc[i] != NULL) {
free(meta->bvdc[i], M_MD_DDF);
meta->bvdc[i] = NULL;
}
}
}
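/*
* Find the largest extent of this disk not claimed by any virtual disk
* configuration record.  Start from the whole configured size, carve out
* the range used by every VDC referencing this disk, and return the
* biggest remaining gap via *off and *size (in sectors).
*/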
static int
ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size)
{
struct ddf_vdc_record *vdc;
off_t beg[32], end[32], beg1, end1;
uint64_t *offp;
int i, j, n, num, pos;
uint32_t ref;
*off = 0;
*size = 0;
ref = GET32(meta, pdd->PD_Reference);
pos = ddf_meta_find_pd(meta, NULL, ref);
beg[0] = 0;
end[0] = GET64(meta, pdr->entry[pos].Configured_Size);
n = 1;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++)
if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref)
break;
if (pos == GET16D(meta, vdc->Primary_Element_Count))
continue;
offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[
GET16(meta, hdr->Max_Primary_Element_Entries)]);
beg1 = GET64P(meta, offp + pos);
end1 = beg1 + GET64D(meta, vdc->Block_Count);
for (j = 0; j < n; j++) {
if (beg[j] >= end1 || end[j] <= beg1 )
continue;
if (beg[j] < beg1 && end[j] > end1) {
beg[n] = end1;
end[n] = end[j];
end[j] = beg1;
n++;
} else if (beg[j] < beg1)
end[j] = beg1;
else
beg[j] = end1;
}
}
for (j = 0; j < n; j++) {
if (end[j] - beg[j] > *size) {
*off = beg[j];
*size = end[j] - beg[j];
}
}
return ((*size > 0) ? 1 : 0);
}
static void
ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf)
{
const char *b;
int i;
b = meta->vdr->entry[num].VD_Name;
for (i = 15; i >= 0; i--)
if (b[i] != 0x20)
break;
memcpy(buf, b, i + 1);
buf[i + 1] = 0;
}
static void
ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf)
{
int len;
len = min(strlen(buf), 16);
memset(meta->vde->VD_Name, 0x20, 16);
memcpy(meta->vde->VD_Name, buf, len);
}
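/*
* Read and validate DDF metadata from a provider: fetch the anchor header
* from the last sector, determine endianness from its signature, then read
* the primary header and all sections it describes, falling back to the
* secondary header if the primary copy is unreadable or fails its checks.
*/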
static int
ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_header *ahdr, *hdr;
char *abuf, *buf;
off_t plba, slba, lba;
int error, len, i;
u_int ss;
uint32_t val;
ddf_meta_free(meta);
pp = cp->provider;
ss = meta->sectorsize = pp->sectorsize;
/* Read anchor block. */
abuf = g_read_data(cp, pp->mediasize - ss, ss, &error);
if (abuf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (error);
}
ahdr = (struct ddf_header *)abuf;
/* Check if this is a DDF RAID struct. */
if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 1;
else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 0;
else {
G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if (ahdr->Header_Type != DDF_HEADER_ANCHOR) {
G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name);
error = EINVAL;
goto done;
}
meta->hdr = ahdr;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
meta->hdr = NULL;
if (crc32(ahdr, ss) != val) {
G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name);
error = EINVAL;
goto done;
}
if ((plba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
if (slba != -1 && (slba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
lba = plba;
doread:
error = 0;
ddf_meta_free(meta);
/* Read header block. */
buf = g_read_data(cp, lba * ss, ss, &error);
if (buf == NULL) {
readerror:
G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).",
(lba == plba) ? "primary" : "secondary", pp->name, error);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name);
goto done;
}
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, buf, ss);
g_free(buf);
hdr = meta->hdr;
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
if (hdr->Signature != ahdr->Signature ||
crc32(meta->hdr, ss) != val ||
memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) ||
GET64(meta, hdr->Primary_Header_LBA) != plba ||
GET64(meta, hdr->Secondary_Header_LBA) != slba) {
hdrerror:
G_RAID_DEBUG(1, "DDF %s metadata check failed on %s",
(lba == plba) ? "primary" : "secondary", pp->name);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) ||
(lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY))
goto hdrerror;
len = 1;
len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length));
len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length));
len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length));
len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length));
len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length));
if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->bbmlog_length));
if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length));
if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length));
if ((plba + len) * ss >= pp->mediasize)
goto hdrerror;
if (slba != -1 && (slba + len) * ss >= pp->mediasize)
goto hdrerror;
/* Workaround for Adaptec implementation. */
if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) {
SET16(meta, hdr->Max_Primary_Element_Entries,
min(GET16(meta, hdr->Max_PD_Entries),
(GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12));
}
if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS ||
GET32(meta, hdr->pdr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->vdr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->cr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->pdd_length) * ss >= MAXPHYS ||
GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
goto hdrerror;
}
/* Read controller data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
GET32(meta, hdr->cd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss);
g_free(buf);
if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE)
goto hdrerror;
/* Read physical disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
GET32(meta, hdr->pdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss);
g_free(buf);
if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE)
goto hdrerror;
/*
* Workaround for reading metadata corrupted due to graid bug.
* XXX: Remove this before we have disks above 128PB. :)
*/
if (meta->bigendian) {
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (isff(meta->pdr->entry[i].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[i].PD_Reference) ==
0xffffffff)
continue;
if (GET64(meta, pdr->entry[i].Configured_Size) >=
(1ULL << 48)) {
SET16(meta, pdr->entry[i].PD_State,
GET16(meta, pdr->entry[i].PD_State) &
~DDF_PDE_FAILED);
SET64(meta, pdr->entry[i].Configured_Size,
GET64(meta, pdr->entry[i].Configured_Size) &
((1ULL << 48) - 1));
}
}
}
/* Read virtual disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
GET32(meta, hdr->vdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss);
g_free(buf);
if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE)
goto hdrerror;
/* Read configuration records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
GET32(meta, hdr->cr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss);
g_free(buf);
/* Read physical disk data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
GET32(meta, hdr->pdd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss);
g_free(buf);
if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE)
goto hdrerror;
i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference));
if (i < 0)
goto hdrerror;
/* Read BBM Log. */
if (GET32(meta, hdr->bbmlog_section) != 0xffffffff &&
GET32(meta, hdr->bbmlog_length) != 0) {
buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss,
GET32(meta, hdr->bbmlog_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss);
g_free(buf);
if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE)
goto hdrerror;
}
done:
g_free(abuf);
if (error != 0)
ddf_meta_free(meta);
return (error);
}
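/*
* Write DDF metadata to a provider.  The header goes out three times: as
* the anchor in the last sector, then as the primary and (if present)
* secondary copies, each of the latter followed by the metadata sections.
* All CRCs are recomputed just before writing.
*/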
static int
ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_vdc_record *vdc;
off_t alba, plba, slba, lba;
u_int ss, size;
int error, i, num;
pp = cp->provider;
ss = pp->sectorsize;
lba = alba = pp->mediasize / ss - 1;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
next:
SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR :
(lba == plba) ? DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY);
SET32(meta, hdr->CRC, 0xffffffff);
SET32(meta, hdr->CRC, crc32(meta->hdr, ss));
error = g_write_data(cp, lba * ss, meta->hdr, ss);
if (error != 0) {
err:
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
if (lba != alba)
goto done;
}
if (lba == alba) {
lba = plba;
goto next;
}
size = GET32(meta, hdr->cd_length) * ss;
SET32(meta, cdr->CRC, 0xffffffff);
SET32(meta, cdr->CRC, crc32(meta->cdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
meta->cdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdr_length) * ss;
SET32(meta, pdr->CRC, 0xffffffff);
SET32(meta, pdr->CRC, crc32(meta->pdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
meta->pdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->vdr_length) * ss;
SET32(meta, vdr->CRC, 0xffffffff);
SET32(meta, vdr->CRC, crc32(meta->vdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
meta->vdr, size);
if (error != 0)
goto err;
size = GET16(meta, hdr->Configuration_Record_Length) * ss;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
SET32D(meta, vdc->CRC, 0xffffffff);
SET32D(meta, vdc->CRC, crc32(vdc, size));
}
error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
meta->cr, size * num);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdd_length) * ss;
SET32(meta, pdd->CRC, 0xffffffff);
SET32(meta, pdd->CRC, crc32(meta->pdd, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
meta->pdd, size);
if (error != 0)
goto err;
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
SET32(meta, bbm->CRC, 0xffffffff);
SET32(meta, bbm->CRC, crc32(meta->bbm, size));
error = g_write_data(cp,
(lba + GET32(meta, hdr->bbmlog_section)) * ss,
meta->bbm, size);
if (error != 0)
goto err;
}
done:
if (lba == plba && slba != -1) {
lba = slba;
goto next;
}
return (error);
}
static int
ddf_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error;
pp = cp->provider;
buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO);
error = g_write_data(cp, pp->mediasize - pp->sectorsize,
buf, pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_DDF);
return (error);
}
static struct g_raid_volume *
g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID)
{
struct g_raid_volume *vol;
struct g_raid_md_ddf_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0)
break;
}
return (vol);
}
static struct g_raid_disk *
g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id)
{
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct ddf_meta *meta;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
if (GUID != NULL) {
if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0)
break;
} else {
if (GET32(meta, pdd->PD_Reference) == id)
break;
}
}
return (disk);
}
static int
g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
- struct g_raid_md_ddf_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
- pv = vol->v_md_data;
if (vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_ddf_purge_disks(struct g_raid_softc *sc)
{
#if 0
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_ddf_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_ddf_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_DDF);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left - erase and delete disk. */
if (pd->pd_subdisks == 0) {
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
#endif
return (0);
}
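/*
* Check whether the given RAID level/qualifier/disk count combination can
* be expressed in DDF metadata; the "force" flag relaxes some of the
* minimum disk count requirements.
*/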
static int
g_raid_md_ddf_supported(int level, int qual, int disks, int force)
{
if (disks > DDF_MAX_DISKS_HARD)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (qual == G_RAID_VOLUME_RLQ_R1SM) {
if (!force && disks != 2)
return (0);
} else if (qual == G_RAID_VOLUME_RLQ_R1MM) {
if (!force && disks != 3)
return (0);
} else
return (0);
break;
case G_RAID_VOLUME_RL_RAID3:
if (qual != G_RAID_VOLUME_RLQ_R3P0 &&
qual != G_RAID_VOLUME_RLQ_R3PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID4:
if (qual != G_RAID_VOLUME_RLQ_R4P0 &&
qual != G_RAID_VOLUME_RLQ_R4PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (qual != G_RAID_VOLUME_RLQ_R5RA &&
qual != G_RAID_VOLUME_RLQ_R5RS &&
qual != G_RAID_VOLUME_RLQ_R5LA &&
qual != G_RAID_VOLUME_RLQ_R5LS)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID6:
if (qual != G_RAID_VOLUME_RLQ_R6RA &&
qual != G_RAID_VOLUME_RLQ_R6RS &&
qual != G_RAID_VOLUME_RLQ_R6LA &&
qual != G_RAID_VOLUME_RLQ_R6LS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual != G_RAID_VOLUME_RLQ_RMDFRA &&
qual != G_RAID_VOLUME_RLQ_RMDFRS &&
qual != G_RAID_VOLUME_RLQ_RMDFLA &&
qual != G_RAID_VOLUME_RLQ_RMDFLS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (qual != G_RAID_VOLUME_RLQ_R1EA &&
qual != G_RAID_VOLUME_RLQ_R1EO)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5E:
if (qual != G_RAID_VOLUME_RLQ_R5ERA &&
qual != G_RAID_VOLUME_RLQ_R5ERS &&
qual != G_RAID_VOLUME_RLQ_R5ELA &&
qual != G_RAID_VOLUME_RLQ_R5ELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5EE:
if (qual != G_RAID_VOLUME_RLQ_R5EERA &&
qual != G_RAID_VOLUME_RLQ_R5EERS &&
qual != G_RAID_VOLUME_RLQ_R5EELA &&
qual != G_RAID_VOLUME_RLQ_R5EELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5R:
if (qual != G_RAID_VOLUME_RLQ_R5RRA &&
qual != G_RAID_VOLUME_RLQ_R5RRS &&
qual != G_RAID_VOLUME_RLQ_R5RLA &&
qual != G_RAID_VOLUME_RLQ_R5RLS)
return (0);
if (disks < 3)
return (0);
break;
default:
return (0);
}
return (1);
}
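/*
* Place a disk into a volume.  If the disk is listed in the volume's
* configuration it takes its recorded position; otherwise, once the volume
* has started, try to reuse it as a replacement ("resurrection") provided
* it has a free partition slot and a large enough unused extent.  The
* resulting subdisk state reflects the PD/VD state flags from metadata.
*/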
static int
g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
struct ddf_meta *pdmeta, *gmeta;
struct ddf_vdc_record *vdc1;
struct ddf_sa_record *sa;
off_t size, eoff = 0, esize = 0;
uint64_t *val2;
int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos;
int i, resurrection = 0;
uint32_t reference;
sc = disk->d_softc;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
reference = GET32(&pd->pd_meta, pdd->PD_Reference);
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
gmeta = &mdi->mdio_meta;
/* Find disk position in metadata by its reference. */
disk_pos = ddf_meta_find_disk(vmeta, reference,
&md_disk_bvd, &md_disk_pos);
md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference);
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc,
"Disk %s is not a present part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* Failed stale disk is useless for us. */
if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If the disk has some metadata for this volume, erase it. */
if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL)
SET32D(pdmeta, vdc1->Signature, 0xffffffff);
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started - try to get use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
G_RAID_DEBUG1(1, sc, "No free partitions on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
eoff *= pd->pd_meta.sectorsize;
esize *= pd->pd_meta.sectorsize;
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
esize < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk), esize, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size;
md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX
md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX
} else {
nofit:
if (disk->d_state == G_RAID_DISK_S_NONE)
g_raid_change_disk_state(disk,
G_RAID_DISK_S_STALE);
return (0);
}
/*
* If spare is committable, delete spare record.
* Otherwise, mark it active and leave it there.
*/
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa != NULL) {
if ((GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_REVERTIBLE) == 0) {
SET32D(&pd->pd_meta, sa->Signature, 0xffffffff);
} else {
SET8D(&pd->pd_meta, sa->Spare_Type,
GET8D(&pd->pd_meta, sa->Spare_Type) |
DDF_SAR_TYPE_ACTIVE);
}
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = eoff;
sd->sd_size = esize;
} else if (pdmeta->cr != NULL &&
(vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) {
val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512;
sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512;
}
if (resurrection) {
/* Stale disk, almost same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) &
(DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
sd->sd_rebuild_pos = 0;
} else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 ||
(GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) !=
DDF_VDE_INIT_FULL) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
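/*
* Scan started volumes for missing or failed subdisks and try to refill
* them with any suitable disk (spare or freshly connected) that still has
* free configuration record slots, writing updated metadata after every
* successful placement.
*/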
static void
g_raid_md_ddf_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) <
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
update = g_raid_md_ddf_start_disk(disk, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_ddf(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
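/*
* Start a volume: translate its virtual disk configuration record into
* GEOM RAID volume parameters (RAID level, strip size, disk count, size,
* per-subdisk offsets), attach all disks discovered so far, and announce
* the volume.
*/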
static void
g_raid_md_ddf_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
- struct ddf_vdc_record *vdc;
uint64_t *val2;
int i, j, bvd;
sc = vol->v_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
- vdc = vmeta->vdc;
vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level);
vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ);
if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 &&
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
GET8(vmeta, vdc->Secondary_RAID_Level) == 0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
vol->v_sectorsize = GET16(vmeta, vdc->Block_Size);
if (vol->v_sectorsize == 0xffff)
vol->v_sectorsize = vmeta->sectorsize;
vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size);
vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) *
GET8(vmeta, vdc->Secondary_Element_Count);
vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks);
vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial);
vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method);
if (GET8(vmeta, vdc->Rotate_Parity_count) > 31)
vol->v_rotate_parity = 1;
else
vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count);
vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize;
for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) {
if (j == GET16(vmeta, vdc->Primary_Element_Count)) {
j = 0;
bvd++;
}
sd = &vol->v_subdisks[i];
if (vmeta->bvdc[bvd] == NULL) {
sd->sd_offset = 0;
sd->sd_size = GET64(vmeta, vdc->Block_Count) *
vol->v_sectorsize;
continue;
}
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize;
sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) *
vol->v_sectorsize;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL)
g_raid_md_ddf_start_disk(disk, vol);
}
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_ddf_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_ddf_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
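/*
* Handle a newly arrived disk: merge its metadata into the node-wide copy,
* create or update the volumes described by its configuration records
* (arming a start timer for volumes that are still incomplete), mark pure
* spares as such, and start any volume for which all member disks are now
* present.
*/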
static void
g_raid_md_ddf_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct g_raid_volume *vol;
struct ddf_meta *pdmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_vd_entry *vde;
int i, j, k, num, have, need, cnt, spare;
uint32_t val;
char buf[17];
sc = disk->d_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
spare = -1;
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, pdmeta);
else
ddf_meta_update(&mdi->mdio_meta, pdmeta);
num = GETCRNUM(pdmeta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(pdmeta, j);
val = GET32D(pdmeta, vdc->Signature);
if (val == DDF_SA_SIGNATURE && spare == -1)
spare = 1;
if (val != DDF_VDCR_SIGNATURE)
continue;
spare = 0;
k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID);
if (k < 0)
continue;
vde = &pdmeta->vdr->entry[k];
/* Look for volume with matching ID. */
vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID);
if (vol == NULL) {
ddf_meta_get_name(pdmeta, k, buf);
vol = g_raid_create_volume(sc, buf,
GET16D(pdmeta, vde->VD_Number));
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_ddf_go, vol);
mdi->mdio_starting++;
} else
pv = vol->v_md_data;
/* If we haven't started yet, check metadata freshness. */
vmeta = &pv->pv_meta;
ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started);
}
if (spare == 1) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_ddf_refill(sc);
}
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL)
continue;
if (pv->pv_started) {
if (g_raid_md_ddf_start_disk(disk, vol))
g_raid_md_write_ddf(md, vol, NULL, NULL);
continue;
}
/* If we have collected all the needed disks, start the array. */
need = 0;
have = 0;
for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) {
if (vmeta->bvdc[k] == NULL) {
need += GET16(vmeta, vdc->Primary_Element_Count);
continue;
}
cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count);
need += cnt;
for (i = 0; i < cnt; i++) {
val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]);
if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL)
have++;
}
}
G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks",
vol->v_name, have, need);
if (have == need)
g_raid_md_ddf_start(vol);
}
}
static int
g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct gctl_req *req, struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
struct g_raid_md_ddf_object *mdi, *mdi1;
char name[16];
const char *fmtopt;
int be = 1;
mdi = (struct g_raid_md_ddf_object *)md;
fmtopt = gctl_get_asciiparam(req, "fmtopt");
if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0)
be = 1;
else if (strcasecmp(fmtopt, "LE") == 0)
be = 0;
else {
gctl_error(req, "Incorrect fmtopt argument.");
return (G_RAID_MD_TASTE_FAIL);
}
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi1->mdio_bigendian != be)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
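/*
* Taste a provider: read and validate its DDF metadata, find or create a
* matching GEOM RAID node of the same endianness, attach a consumer, and
* feed the disk into the node.
*/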
static int
g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct ddf_meta meta;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct g_geom *geom;
int error, result, be;
char name[16];
G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name);
mdi = (struct g_raid_md_ddf_object *)md;
pp = cp->provider;
/* Read metadata from device. */
g_topology_unlock();
bzero(&meta, sizeof(meta));
error = ddf_meta_read(cp, &meta);
g_topology_lock();
if (error != 0)
return (G_RAID_MD_TASTE_FAIL);
be = meta.bigendian;
/* Metadata valid. Print it. */
g_raid_md_ddf_print(&meta);
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi->mdio_bigendian != be)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
pd->pd_meta = meta;
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_ddf_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_ddf_pervolume *pv;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_ddf_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_ddf(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD];
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_sa_record *sa;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize;
intmax_t *sizearg, *striparg;
int i, numdisks, len, level, qual;
int error;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_ddf_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
gctl_error(req, "No free partitions "
"on disk '%s'.",
diskname);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
ddf_meta_unused_range(&pd->pd_meta,
&offs[i], &esize);
offs[i] *= pp->sectorsize;
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
ddf_meta_create(disk, &mdi->mdio_meta);
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, GET64(&pd->pd_meta,
pdr->entry[0].Configured_Size) * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta);
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID4 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else if (level == G_RAID_VOLUME_RL_RAID5R) {
vol->v_mediasize = size * (numdisks - 1);
vol->v_rotate_parity = 1024;
} else if (level == G_RAID_VOLUME_RL_RAID6 ||
level == G_RAID_VOLUME_RL_RAID5E ||
level == G_RAID_VOLUME_RL_RAID5EE)
vol->v_mediasize = size * (numdisks - 2);
else if (level == G_RAID_VOLUME_RL_RAIDMDF) {
if (numdisks < 5)
vol->v_mdf_pdisks = 2;
else
vol->v_mdf_pdisks = 3;
vol->v_mdf_polynomial = 0x11d;
vol->v_mdf_method = 0x00;
vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks);
} else { /* RAID1E */
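/* RAID1E: two copies of every strip are spread over all disks, so the usable size is half the combined size, rounded down to a whole strip. */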
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = offs[i];
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (tmp != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_ddf_purge_disks(sc);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, "/dev/", 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_ddf(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being deleted and destroy it. */
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
ddf_meta_create(disk, &mdi->mdio_meta);
sa = ddf_meta_find_sa(&pd->pd_meta, 1);
if (sa != NULL) {
SET32D(&pd->pd_meta, sa->Signature,
DDF_SA_SIGNATURE);
SET8D(&pd->pd_meta, sa->Spare_Type, 0);
SET16D(&pd->pd_meta, sa->Populated_SAEs, 0);
SET16D(&pd->pd_meta, sa->MAX_SAE_Supported,
(GET16(&pd->pd_meta, hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize -
sizeof(struct ddf_sa_record)) /
sizeof(struct ddf_sa_entry));
}
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
g_raid_md_ddf_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *gmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_sa_record *sa;
uint64_t *val2;
int i, j, pos, bvd, size;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
gmeta = &mdi->mdio_meta;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/*
* Clear disk flags so that only the ones really needed get set again.
* Do it only if no volumes are in the starting state now, as those may
* still be updating disk statuses and we could wipe flags that are
* still in use.
*/
if (mdi->mdio_starting == 0) {
for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) &
~(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE));
if ((GET16(gmeta, pdr->entry[i].PD_State) &
DDF_PDE_PFA) == 0)
SET16(gmeta, pdr->entry[i].PD_State, 0);
}
}
/* Generate/update new per-volume metadata. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
if (vol->v_stopping || !pv->pv_started)
continue;
vmeta = &pv->pv_meta;
SET32(vmeta, vdc->Sequence_Number,
GET32(vmeta, vdc->Sequence_Number) + 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0)
SET16(vmeta, vdc->Primary_Element_Count, 2);
else
SET16(vmeta, vdc->Primary_Element_Count,
vol->v_disks_count);
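/* The DDF Stripe_Size field holds log2 of the strip size in blocks. */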
SET8(vmeta, vdc->Stripe_Size,
ffs(vol->v_strip_size / vol->v_sectorsize) - 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0) {
SET8(vmeta, vdc->Primary_RAID_Level,
DDF_VDCR_RAID1);
SET8(vmeta, vdc->RLQ, 0);
SET8(vmeta, vdc->Secondary_Element_Count,
vol->v_disks_count / 2);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
} else {
SET8(vmeta, vdc->Primary_RAID_Level,
vol->v_raid_level);
SET8(vmeta, vdc->RLQ,
vol->v_raid_level_qualifier);
SET8(vmeta, vdc->Secondary_Element_Count, 1);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
}
SET8(vmeta, vdc->Secondary_Element_Seq, 0);
SET64(vmeta, vdc->Block_Count, 0);
SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize);
SET16(vmeta, vdc->Block_Size, vol->v_sectorsize);
SET8(vmeta, vdc->Rotate_Parity_count,
fls(vol->v_rotate_parity) - 1);
SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks);
SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial,
vol->v_mdf_polynomial);
SET8(vmeta, vdc->MDF_Constant_Generation_Method,
vol->v_mdf_method);
SET16(vmeta, vde->VD_Number, vol->v_global_id);
if (vol->v_state <= G_RAID_VOLUME_S_BROKEN)
SET8(vmeta, vde->VD_State, DDF_VDE_FAILED);
else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED);
else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL)
SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL);
else
SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL);
if (vol->v_dirty ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0)
SET8(vmeta, vde->VD_State,
GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY);
SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX
ddf_meta_put_name(vmeta, vol->v_name);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
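/* Map the flat subdisk index onto (BVD number, position within that BVD). */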
bvd = i / GET16(vmeta, vdc->Primary_Element_Count);
pos = i % GET16(vmeta, vdc->Primary_Element_Count);
disk = sd->sd_disk;
if (disk != NULL) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (vmeta->bvdc[bvd] == NULL) {
size = GET16(vmeta,
hdr->Configuration_Record_Length) *
vmeta->sectorsize;
vmeta->bvdc[bvd] = malloc(size,
M_MD_DDF, M_WAITOK);
memset(vmeta->bvdc[bvd], 0xff, size);
}
memcpy(vmeta->bvdc[bvd], vmeta->vdc,
sizeof(struct ddf_vdc_record));
SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd);
SET64(vmeta, bvdc[bvd]->Block_Count,
sd->sd_size / vol->v_sectorsize);
SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos],
GET32(&pd->pd_meta, pdd->PD_Reference));
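/* Per-disk starting LBAs live in the 64-bit array that follows Physical_Disk_Sequence[] within the VDC record. */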
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
SET64P(vmeta, val2 + pos,
sd->sd_offset / vol->v_sectorsize);
}
if (vmeta->bvdc[bvd] == NULL)
continue;
j = ddf_meta_find_pd(gmeta, NULL,
GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]));
if (j < 0)
continue;
SET16(gmeta, pdr->entry[j].PD_Type,
GET16(gmeta, pdr->entry[j].PD_Type) |
DDF_PDE_PARTICIPATING);
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_MISSING));
else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_REBUILD);
else
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_ONLINE);
}
}
/* Mark spare and failed disks as such. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
i = ddf_meta_find_pd(gmeta, NULL,
GET32(&pd->pd_meta, pdd->PD_Reference));
if (i < 0)
continue;
if (disk->d_state == G_RAID_DISK_S_FAILED) {
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
}
if (disk->d_state != G_RAID_DISK_S_SPARE)
continue;
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa == NULL ||
(GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_DEDICATED) == 0) {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_GLOBAL_SPARE);
} else {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_CONFIG_SPARE);
}
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
DDF_PDE_ONLINE);
}
/* Remove disks without "participating" flag (unused). */
for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
if ((GET16(gmeta, pdr->entry[i].PD_Type) &
(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 ||
g_raid_md_ddf_get_disk(sc,
NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL)
j = i;
else
memset(&gmeta->pdr->entry[i], 0xff,
sizeof(struct ddf_pd_entry));
}
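/* j now holds the index of the last surviving PD entry (-1 if none survived). */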
SET16(gmeta, pdr->Populated_PDEs, j + 1);
/* Update per-disk metadata and write them. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_SPARE)
continue;
/* Update PDR. */
memcpy(pd->pd_meta.pdr, gmeta->pdr,
GET32(&pd->pd_meta, hdr->pdr_length) *
pd->pd_meta.sectorsize);
/* Update VDR. */
SET16(&pd->pd_meta, vdr->Populated_VDEs, 0);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
i = ddf_meta_find_vd(&pd->pd_meta,
pv->pv_meta.vde->VD_GUID);
if (i < 0)
i = ddf_meta_find_vd(&pd->pd_meta, NULL);
if (i >= 0)
memcpy(&pd->pd_meta.vdr->entry[i],
pv->pv_meta.vde,
sizeof(struct ddf_vd_entry));
}
/* Update VDC. */
if (mdi->mdio_starting == 0) {
/* Remove all VDCs; the ones still needed are restored below. */
j = GETCRNUM(&pd->pd_meta);
for (i = 0; i < j; i++) {
vdc = GETVDCPTR(&pd->pd_meta, i);
if (GET32D(&pd->pd_meta, vdc->Signature) !=
DDF_VDCR_SIGNATURE)
continue;
SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff);
}
}
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
vol = sd->sd_volume;
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
vmeta = &pv->pv_meta;
vdc = ddf_meta_find_vdc(&pd->pd_meta,
vmeta->vde->VD_GUID);
if (vdc == NULL)
vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL);
if (vdc != NULL) {
bvd = sd->sd_pos / GET16(vmeta,
vdc->Primary_Element_Count);
memcpy(vdc, vmeta->bvdc[bvd],
GET16(&pd->pd_meta,
hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize);
}
}
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(disk));
g_raid_md_ddf_print(&pd->pd_meta);
ddf_meta_write(disk->d_consumer, &pd->pd_meta);
}
return (0);
}
static int
g_raid_md_fail_disk_ddf(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_subdisk *sd;
int i;
sc = md->mdo_softc;
pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(tdisk));
i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference));
SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA);
if (tdisk->d_consumer != NULL)
ddf_meta_write(tdisk->d_consumer, &pd->pd_meta);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, tdisk);
g_raid_md_ddf_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_ddf_perdisk *pd;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
ddf_meta_free(&pd->pd_meta);
free(pd, M_MD_DDF);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_ddf_object *mdi;
struct g_raid_md_ddf_pervolume *pv;
mdi = (struct g_raid_md_ddf_object *)md;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
ddf_vol_meta_free(&pv->pv_meta);
if (!pv->pv_started) {
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_DDF);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_ddf(struct g_raid_md_object *md)
{
struct g_raid_md_ddf_object *mdi;
mdi = (struct g_raid_md_ddf_object *)md;
if (!mdi->mdio_started) {
mdi->mdio_started = 0;
callout_stop(&mdi->mdio_start_co);
G_RAID_DEBUG1(1, md->mdo_softc,
"root_mount_rel %p", mdi->mdio_rootmount);
root_mount_rel(mdi->mdio_rootmount);
mdi->mdio_rootmount = NULL;
}
ddf_meta_free(&mdi->mdio_meta);
return (0);
}
G_RAID_MD_DECLARE(ddf, "DDF");
Index: head/sys/geom/raid/md_promise.c
===================================================================
--- head/sys/geom/raid/md_promise.c (revision 327172)
+++ head/sys/geom/raid/md_promise.c (revision 327173)
@@ -1,2008 +1,2007 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
#define PROMISE_MAX_DISKS 8
#define PROMISE_MAX_SUBDISKS 2
#define PROMISE_META_OFFSET 14
struct promise_raid_disk {
uint8_t flags; /* Subdisk status. */
#define PROMISE_F_VALID 0x01
#define PROMISE_F_ONLINE 0x02
#define PROMISE_F_ASSIGNED 0x04
#define PROMISE_F_SPARE 0x08
#define PROMISE_F_DUPLICATE 0x10
#define PROMISE_F_REDIR 0x20
#define PROMISE_F_DOWN 0x40
#define PROMISE_F_READY 0x80
uint8_t number; /* Position in a volume. */
uint8_t channel; /* ATA channel number. */
uint8_t device; /* ATA device number. */
uint64_t id __packed; /* Subdisk ID. */
} __packed;
struct promise_raid_conf {
char promise_id[24];
#define PROMISE_MAGIC "Promise Technology, Inc."
#define FREEBSD_MAGIC "FreeBSD ATA driver RAID "
uint32_t dummy_0;
uint64_t magic_0;
#define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \
((uint64_t)(x.device != 0) << 56))
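/* PROMISE_MAGIC0() packs the ATA channel into bits 48-55 and a device-present flag into bit 56 of magic_0. */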
uint16_t magic_1;
uint32_t magic_2;
uint8_t filler1[470];
uint32_t integrity;
#define PROMISE_I_VALID 0x00000080
struct promise_raid_disk disk; /* This subdisk info. */
uint32_t disk_offset; /* Subdisk offset. */
uint32_t disk_sectors; /* Subdisk size. */
uint32_t disk_rebuild; /* Rebuild position. */
uint16_t generation; /* Generation number. */
uint8_t status; /* Volume status. */
#define PROMISE_S_VALID 0x01
#define PROMISE_S_ONLINE 0x02
#define PROMISE_S_INITED 0x04
#define PROMISE_S_READY 0x08
#define PROMISE_S_DEGRADED 0x10
#define PROMISE_S_MARKED 0x20
#define PROMISE_S_MIGRATING 0x40
#define PROMISE_S_FUNCTIONAL 0x80
uint8_t type; /* Volume type. */
#define PROMISE_T_RAID0 0x00
#define PROMISE_T_RAID1 0x01
#define PROMISE_T_RAID3 0x02
#define PROMISE_T_RAID5 0x04
#define PROMISE_T_SPAN 0x08
#define PROMISE_T_JBOD 0x10
uint8_t total_disks; /* Disks in this volume. */
uint8_t stripe_shift; /* Strip size. */
uint8_t array_width; /* Number of RAID0 stripes. */
uint8_t array_number; /* Global volume number. */
uint32_t total_sectors; /* Volume size. */
uint16_t cylinders; /* Volume geometry: C. */
uint8_t heads; /* Volume geometry: H. */
uint8_t sectors; /* Volume geometry: S. */
uint64_t volume_id __packed; /* Volume ID. */
struct promise_raid_disk disks[PROMISE_MAX_DISKS];
/* Subdisks in this volume. */
char name[32]; /* Volume label. */
uint32_t filler2[8];
uint32_t magic_3; /* Something related to rebuild. */
uint64_t rebuild_lba64; /* Per-volume rebuild position. */
uint32_t magic_4;
uint32_t magic_5;
uint32_t total_sectors_high;
uint8_t magic_6;
uint8_t sector_size;
uint16_t magic_7;
uint32_t magic_8[31];
uint32_t backup_time;
uint16_t magic_9;
uint32_t disk_offset_high;
uint32_t disk_sectors_high;
uint32_t disk_rebuild_high;
uint16_t magic_10;
uint32_t magic_11[3];
uint32_t filler3[284];
uint32_t checksum;
} __packed;
struct g_raid_md_promise_perdisk {
int pd_updated;
int pd_subdisks;
struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS];
};
struct g_raid_md_promise_pervolume {
struct promise_raid_conf *pv_meta;
uint64_t pv_id;
uint16_t pv_generation;
int pv_disks_present;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;
static kobj_method_t g_raid_md_promise_methods[] = {
KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_promise_class = {
"Promise",
g_raid_md_promise_methods,
sizeof(struct g_raid_md_object),
.mdc_enable = 1,
.mdc_priority = 100
};
static void
g_raid_md_promise_print(struct promise_raid_conf *meta)
{
int i;
if (g_raid_debug < 1)
return;
printf("********* ATA Promise Metadata *********\n");
printf("promise_id <%.24s>\n", meta->promise_id);
printf("disk %02x %02x %02x %02x %016jx\n",
meta->disk.flags, meta->disk.number, meta->disk.channel,
meta->disk.device, meta->disk.id);
printf("disk_offset %u\n", meta->disk_offset);
printf("disk_sectors %u\n", meta->disk_sectors);
printf("disk_rebuild %u\n", meta->disk_rebuild);
printf("generation %u\n", meta->generation);
printf("status 0x%02x\n", meta->status);
printf("type %u\n", meta->type);
printf("total_disks %u\n", meta->total_disks);
printf("stripe_shift %u\n", meta->stripe_shift);
printf("array_width %u\n", meta->array_width);
printf("array_number %u\n", meta->array_number);
printf("total_sectors %u\n", meta->total_sectors);
printf("cylinders %u\n", meta->cylinders);
printf("heads %u\n", meta->heads);
printf("sectors %u\n", meta->sectors);
printf("volume_id 0x%016jx\n", meta->volume_id);
printf("disks:\n");
for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
printf(" %02x %02x %02x %02x %016jx\n",
meta->disks[i].flags, meta->disks[i].number,
meta->disks[i].channel, meta->disks[i].device,
meta->disks[i].id);
}
printf("name <%.32s>\n", meta->name);
printf("magic_3 0x%08x\n", meta->magic_3);
printf("rebuild_lba64 %ju\n", meta->rebuild_lba64);
printf("magic_4 0x%08x\n", meta->magic_4);
printf("magic_5 0x%08x\n", meta->magic_5);
printf("total_sectors_high 0x%08x\n", meta->total_sectors_high);
printf("sector_size %u\n", meta->sector_size);
printf("backup_time %d\n", meta->backup_time);
printf("disk_offset_high 0x%08x\n", meta->disk_offset_high);
printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high);
printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high);
printf("=================================================\n");
}
static struct promise_raid_conf *
promise_meta_copy(struct promise_raid_conf *meta)
{
struct promise_raid_conf *nmeta;
nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
memcpy(nmeta, meta, sizeof(*nmeta));
return (nmeta);
}
static int
promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
{
int pos;
for (pos = 0; pos < meta->total_disks; pos++) {
if (meta->disks[pos].id == id)
return (pos);
}
return (-1);
}
static int
promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
off_t sectors, off_t *off, off_t *size)
{
off_t coff, csize, tmp;
int i, j;
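/* Exclude the trailing 131072 sectors from the usable range; Promise keeps its metadata and reserved area at the end of the disk. */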
sectors -= 131072;
*off = 0;
*size = 0;
coff = 0;
csize = sectors;
i = 0;
while (1) {
for (j = 0; j < nsd; j++) {
tmp = ((off_t)metaarr[j]->disk_offset_high << 32) +
metaarr[j]->disk_offset;
if (tmp >= coff)
csize = MIN(csize, tmp - coff);
}
if (csize > *size) {
*off = coff;
*size = csize;
}
if (i >= nsd)
break;
coff = ((off_t)metaarr[i]->disk_offset_high << 32) +
metaarr[i]->disk_offset +
((off_t)metaarr[i]->disk_sectors_high << 32) +
metaarr[i]->disk_sectors;
csize = sectors - coff;
i++;
}
return ((*size > 0) ? 1 : 0);
}
static int
promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
{
int disk_pos, width;
if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
width = vol->v_disks_count / 2;
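/* Promise metadata and GEOM order RAID1E (RAID0+1) members differently; transpose the 2 x width member grid to convert between the two layouts. */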
disk_pos = (md_disk_pos / width) +
(md_disk_pos % width) * width;
} else
disk_pos = md_disk_pos;
return (disk_pos);
}
static void
promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
{
int i;
strncpy(buf, meta->name, 32);
buf[32] = 0;
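/* Trim trailing spaces and other padding characters (anything <= 0x20). */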
for (i = 31; i >= 0; i--) {
if (buf[i] > 0x20)
break;
buf[i] = 0;
}
}
static void
promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
{
memset(meta->name, 0x20, 32);
memcpy(meta->name, buf, MIN(strlen(buf), 32));
}
static int
promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
int error, i, subdisks;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisks = 0;
if (pp->sectorsize * 4 > MAXPHYS) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
return (subdisks);
}
next:
/* Read metadata block. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisks * PROMISE_META_OFFSET),
pp->sectorsize * 4, &error);
if (buf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (subdisks);
}
meta = (struct promise_raid_conf *)buf;
/* Check if this is a Promise RAID struct. */
if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
if (subdisks == 0)
G_RAID_DEBUG(1,
"Promise signature check failed on %s", pp->name);
g_free(buf);
return (subdisks);
}
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
g_free(buf);
/* Check metadata checksum. */
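/* The checksum is the 32-bit sum of the first 511 words of the block; the stored checksum occupies the final word. */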
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
if (checksum != meta->checksum) {
G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if ((meta->integrity & PROMISE_I_VALID) == 0) {
G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if (meta->total_disks > PROMISE_MAX_DISKS) {
G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
pp->name, meta->total_disks);
free(meta, M_MD_PROMISE);
return (subdisks);
}
/* Remove filler garbage from fields used in newer metadata. */
if (meta->disk_offset_high == 0x8b8c8d8e &&
meta->disk_sectors_high == 0x8788898a &&
meta->disk_rebuild_high == 0x83848586) {
meta->disk_offset_high = 0;
meta->disk_sectors_high = 0;
if (meta->disk_rebuild == UINT32_MAX)
meta->disk_rebuild_high = UINT32_MAX;
else
meta->disk_rebuild_high = 0;
if (meta->total_sectors_high == 0x15161718) {
meta->total_sectors_high = 0;
meta->backup_time = 0;
if (meta->rebuild_lba64 == 0x2122232425262728)
meta->rebuild_lba64 = UINT64_MAX;
}
}
if (meta->sector_size < 1 || meta->sector_size > 8)
meta->sector_size = 1;
/* Save this part and look for next. */
*metaarr = meta;
metaarr++;
subdisks++;
if (subdisks < PROMISE_MAX_SUBDISKS)
goto next;
return (subdisks);
}
static int
promise_meta_write(struct g_consumer *cp,
struct promise_raid_conf **metaarr, int nsd)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
off_t off, size;
int error, i, subdisk, fake;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisk = 0;
fake = 0;
next:
buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
meta = NULL;
if (subdisk < nsd) {
meta = metaarr[subdisk];
} else if (!fake && promise_meta_unused_range(metaarr, nsd,
cp->provider->mediasize / cp->provider->sectorsize,
&off, &size)) {
/* Optionally add record for unused space. */
meta = (struct promise_raid_conf *)buf;
memcpy(&meta->promise_id[0], PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
meta->disk_offset_high = off >> 32;
meta->disk_offset = (uint32_t)off;
meta->disk_sectors_high = size >> 32;
meta->disk_sectors = (uint32_t)size;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
fake = 1;
}
if (meta != NULL) {
/* Recalculate the checksum in case the metadata was changed. */
meta->checksum = 0;
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
meta->checksum = checksum;
memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
}
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, pp->sectorsize * 4);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_PROMISE);
subdisk++;
if (subdisk < PROMISE_MAX_SUBDISKS)
goto next;
return (error);
}
static int
promise_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error, subdisk;
pp = cp->provider;
buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, 4 * pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
}
free(buf, M_MD_PROMISE);
return (error);
}
static int
promise_meta_write_spare(struct g_consumer *cp)
{
struct promise_raid_conf *meta;
off_t tmp;
int error;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072;
meta->disk_sectors_high = tmp >> 32;
meta->disk_sectors = (uint32_t)tmp;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
error = promise_meta_write(cp, &meta, 1);
free(meta, M_MD_PROMISE);
return (error);
}
static struct g_raid_volume *
g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
{
struct g_raid_volume *vol;
struct g_raid_md_promise_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (pv->pv_id == id)
break;
}
return (vol);
}
static int
g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
struct g_raid_md_promise_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
{
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_promise_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_promise_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_PROMISE);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left, erase and delete the disk. */
if (pd->pd_subdisks == 0) {
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_supported(int level, int qual, int disks, int force)
{
if (disks > PROMISE_MAX_DISKS)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (!force && (disks != 2))
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (disks < 2)
return (0);
if (disks % 2 != 0)
return (0);
if (!force && (disks != 4))
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (disks < 3)
return (0);
if (qual != G_RAID_VOLUME_RLQ_R5LA)
return (0);
break;
default:
return (0);
}
if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
return (1);
}
static int
g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t eoff, esize, size;
int disk_pos, md_disk_pos, i, resurrection = 0;
sc = disk->d_softc;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
pv = vol->v_md_data;
meta = pv->pv_meta;
if (sdn >= 0) {
/* Find disk position in metadata by its serial. */
md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
/* For RAID0+1 we need to translate order. */
disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
} else {
md_disk_pos = -1;
disk_pos = -1;
}
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* A failed, stale disk is useless to us. */
if (sdn >= 0 &&
pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If we were given specific metadata subdisk - erase it. */
if (sdn >= 0) {
free(pd->pd_meta[sdn], M_MD_PROMISE);
for (i = sdn; i < pd->pd_subdisks - 1; i++)
pd->pd_meta[i] = pd->pd_meta[i + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
}
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started, try to make use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
disk->d_consumer->provider->mediasize /
disk->d_consumer->provider->sectorsize,
&eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
(off_t)esize * 512 < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk),
(off_t)esize * 512, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size / 512;
/* For RAID0+1 we need to translate order. */
md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
} else {
nofit:
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk,
G_RAID_DISK_S_SPARE);
}
return (0);
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = (off_t)eoff * 512;
sd->sd_size = (off_t)esize * 512;
} else {
sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high
<< 32) + pd->pd_meta[sdn]->disk_offset) * 512;
sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high
<< 32) + pd->pd_meta[sdn]->disk_sectors) * 512;
}
if (resurrection) {
/* Stale disk, almost the same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
if (pd->pd_meta[sdn]->generation != meta->generation)
sd->sd_rebuild_pos = 0;
else {
sd->sd_rebuild_pos =
(((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) +
pd->pd_meta[sdn]->disk_rebuild) * 512;
}
} else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (pd->pd_meta[sdn]->generation != meta->generation ||
(meta->status & PROMISE_S_MARKED)) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
static void
g_raid_md_promise_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
update =
g_raid_md_promise_start_disk(disk, -1, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_promise(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
static void
g_raid_md_promise_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
u_int i;
sc = vol->v_softc;
md = sc->sc_md;
pv = vol->v_md_data;
meta = pv->pv_meta;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
if (meta->type == PROMISE_T_RAID0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
else if (meta->type == PROMISE_T_RAID1) {
if (meta->array_width == 1)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
else
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
} else if (meta->type == PROMISE_T_RAID3)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
else if (meta->type == PROMISE_T_RAID5) {
vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
} else if (meta->type == PROMISE_T_SPAN)
vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
else if (meta->type == PROMISE_T_JBOD)
vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
else
vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
vol->v_disks_count = meta->total_disks;
vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
if (meta->total_sectors_high < 256) /* If value looks sane. */
vol->v_mediasize +=
((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
vol->v_sectorsize = 512 * meta->sector_size;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
sd->sd_offset = (((off_t)meta->disk_offset_high << 32) +
meta->disk_offset) * 512;
sd->sd_size = (((off_t)meta->disk_sectors_high << 32) +
meta->disk_sectors) * 512;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i]->volume_id == meta->volume_id)
g_raid_md_promise_start_disk(disk, i, vol);
}
}
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_promise_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_promise_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
static void
g_raid_md_promise_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct promise_raid_conf *pdmeta;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_raid_volume *vol;
int i;
char buf[33];
sc = disk->d_softc;
md = sc->sc_md;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_promise_refill(sc);
return;
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL) {
promise_meta_get_name(pdmeta, buf);
vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
pv->pv_id = pdmeta->volume_id;
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_promise_go, vol);
} else
pv = vol->v_md_data;
/* If we haven't started yet - check metadata freshness. */
if (pv->pv_meta == NULL || !pv->pv_started) {
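/* The generation difference below is evaluated as a signed 16-bit value so that counter wrap-around compares correctly. */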
if (pv->pv_meta == NULL ||
((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
G_RAID_DEBUG1(1, sc, "Newer disk");
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = promise_meta_copy(pdmeta);
pv->pv_generation = pv->pv_meta->generation;
pv->pv_disks_present = 1;
} else if (pdmeta->generation == pv->pv_generation) {
pv->pv_disks_present++;
G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
pv->pv_disks_present,
pv->pv_meta->total_disks);
} else {
G_RAID_DEBUG1(1, sc, "Older disk");
}
}
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL)
continue;
pv = vol->v_md_data;
if (pv->pv_started) {
if (g_raid_md_promise_start_disk(disk, i, vol))
g_raid_md_write_promise(md, vol, NULL, NULL);
} else {
/* If we collected all needed disks - start array. */
if (pv->pv_disks_present == pv->pv_meta->total_disks)
g_raid_md_promise_start(vol);
}
}
}
static int
g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
sc = g_raid_create_node(mp, "Promise", md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
static int
g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
- struct promise_raid_conf *meta, *metaarr[4];
+ struct promise_raid_conf *metaarr[4];
struct g_raid_md_promise_perdisk *pd;
struct g_geom *geom;
int i, j, result, len, subdisks;
char name[16];
uint16_t vendor;
G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
pp = cp->provider;
/* Read metadata from device. */
- meta = NULL;
g_topology_unlock();
vendor = 0xffff;
len = sizeof(vendor);
if (pp->geom->rank == 1)
g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
subdisks = promise_meta_read(cp, metaarr);
g_topology_lock();
if (subdisks == 0) {
if (g_raid_aggressive_spare) {
if (vendor == 0x105a || vendor == 0x1002) {
G_RAID_DEBUG(1,
"No Promise metadata, forcing spare.");
goto search;
} else {
G_RAID_DEBUG(1,
"Promise/ATI vendor mismatch "
"0x%04x != 0x105a/0x1002",
vendor);
}
}
return (G_RAID_MD_TASTE_FAIL);
}
/* Metadata valid. Print it. */
for (i = 0; i < subdisks; i++)
g_raid_md_promise_print(metaarr[i]);
/* Purge meaningless (empty/spare) records. */
for (i = 0; i < subdisks; ) {
if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
i++;
continue;
}
free(metaarr[i], M_MD_PROMISE);
for (j = i; j < subdisks - 1; j++)
metaarr[j] = metaarr[j + 1];
metaarr[subdisks - 1] = NULL;
subdisks--;
}
search:
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
snprintf(name, sizeof(name), "Promise");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
pd->pd_subdisks = subdisks;
for (i = 0; i < subdisks; i++)
pd->pd_meta[i] = metaarr[i];
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_promise_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_promise_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_promise(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip;
intmax_t *sizearg, *striparg;
int numdisks, i, len, level, qual;
int error;
sc = md->mdo_softc;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (strcasecmp(levelname, "RAID5") == 0)
levelname = "RAID5-LA";
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_promise_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
gctl_error(req, "Disk '%s' already "
"used by %d volumes.",
diskname, pd->pd_subdisks);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
promise_meta_unused_range(pd->pd_meta,
pd->pd_subdisks,
pp->mediasize / pp->sectorsize,
&offs[i], &esize);
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
pv->pv_generation = 0;
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else { /* RAID1E */
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = (off_t)offs[i] * 512;
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (verb != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_promise_purge_disks(sc);
g_raid_md_write_promise(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, "/dev/", 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_promise(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being deleted and destroy it. */
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
promise_meta_write_spare(cp);
g_raid_md_promise_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t rebuild_lba64;
int i, j, pos, rebuild;
sc = md->mdo_softc;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/* Generate new per-volume metadata for affected volumes. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
/* Skip volumes not related to specified targets. */
if (tvol != NULL && vol != tvol)
continue;
if (tsd != NULL && vol != tsd->sd_volume)
continue;
if (tdisk != NULL) {
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_disk == tdisk)
break;
}
if (i >= vol->v_disks_count)
continue;
}
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
pv->pv_generation++;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
if (pv->pv_meta != NULL)
memcpy(meta, pv->pv_meta, sizeof(*meta));
memcpy(meta->promise_id, PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->generation = pv->pv_generation;
meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
PROMISE_S_INITED | PROMISE_S_READY;
if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
meta->status |= PROMISE_S_DEGRADED;
if (vol->v_dirty)
meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
meta->type = PROMISE_T_RAID0;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->type = PROMISE_T_RAID1;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
meta->type = PROMISE_T_RAID3;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
meta->type = PROMISE_T_RAID5;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
meta->type = PROMISE_T_SPAN;
else
meta->type = PROMISE_T_JBOD;
meta->total_disks = vol->v_disks_count;
meta->stripe_shift = ffs(vol->v_strip_size / 1024);
meta->array_width = vol->v_disks_count;
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->array_width /= 2;
meta->array_number = vol->v_global_id;
meta->total_sectors = vol->v_mediasize / 512;
meta->total_sectors_high = (vol->v_mediasize / 512) >> 32;
meta->sector_size = vol->v_sectorsize / 512;
meta->cylinders = meta->total_sectors / (255 * 63) - 1;
meta->heads = 254;
meta->sectors = 63;
meta->volume_id = pv->pv_id;
rebuild_lba64 = UINT64_MAX;
rebuild = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
meta->disks[pos].flags = PROMISE_F_VALID |
PROMISE_F_ASSIGNED;
if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
meta->disks[pos].flags |= 0;
} else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
meta->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
} else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
meta->disks[pos].flags |=
PROMISE_F_ONLINE | PROMISE_F_REDIR;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
rebuild = 1;
} else {
meta->disks[pos].flags |= PROMISE_F_ONLINE;
if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
meta->status |= PROMISE_S_MARKED;
if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
}
}
if (pv->pv_meta != NULL) {
meta->disks[pos].id = pv->pv_meta->disks[pos].id;
} else {
meta->disks[pos].number = i * 2;
arc4rand(&meta->disks[pos].id,
sizeof(meta->disks[pos].id), 0);
}
}
promise_meta_put_name(meta, vol->v_name);
/* Try to mimic AMD BIOS rebuild/resync behavior. */
if (rebuild_lba64 != UINT64_MAX) {
if (rebuild)
meta->magic_3 = 0x03040010UL; /* Rebuild? */
else
meta->magic_3 = 0x03040008UL; /* Resync? */
/* Translate from per-disk to per-volume LBA. */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
rebuild_lba64 *= meta->array_width;
} else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
rebuild_lba64 *= meta->array_width - 1;
} else
rebuild_lba64 = 0;
} else
meta->magic_3 = 0x03000000UL;
meta->rebuild_lba64 = rebuild_lba64;
meta->magic_4 = 0x04010101UL;
/* Replace per-volume metadata with new. */
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = meta;
/* Copy new metadata to the disks, adding or replacing old. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
disk = sd->sd_disk;
if (disk == NULL)
continue;
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (j = 0; j < pd->pd_subdisks; j++) {
if (pd->pd_meta[j]->volume_id == meta->volume_id)
break;
}
if (j == pd->pd_subdisks)
pd->pd_subdisks++;
if (pd->pd_meta[j] != NULL)
free(pd->pd_meta[j], M_MD_PROMISE);
pd->pd_meta[j] = promise_meta_copy(meta);
pd->pd_meta[j]->disk = meta->disks[pos];
pd->pd_meta[j]->disk.number = pos;
pd->pd_meta[j]->disk_offset_high =
(sd->sd_offset / 512) >> 32;
pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
pd->pd_meta[j]->disk_sectors_high =
(sd->sd_size / 512) >> 32;
pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high =
(sd->sd_rebuild_pos / 512) >> 32;
pd->pd_meta[j]->disk_rebuild =
sd->sd_rebuild_pos / 512;
} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high = 0;
pd->pd_meta[j]->disk_rebuild = 0;
} else {
pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX;
pd->pd_meta[j]->disk_rebuild = UINT32_MAX;
}
pd->pd_updated = 1;
}
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE)
continue;
if (!pd->pd_updated)
continue;
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(disk));
for (i = 0; i < pd->pd_subdisks; i++)
g_raid_md_promise_print(pd->pd_meta[i]);
promise_meta_write(disk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
pd->pd_updated = 0;
}
return (0);
}
static int
g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_subdisk *sd;
int i, pos;
sc = md->mdo_softc;
pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(tdisk));
for (i = 0; i < pd->pd_subdisks; i++) {
pd->pd_meta[i]->disk.flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
pos = pd->pd_meta[i]->disk.number;
if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
pd->pd_meta[i]->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
}
g_raid_md_promise_print(pd->pd_meta[i]);
}
if (tdisk->d_consumer != NULL)
promise_meta_write(tdisk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, tdisk);
g_raid_md_promise_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_promise_perdisk *pd;
int i;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i] != NULL) {
free(pd->pd_meta[i], M_MD_PROMISE);
pd->pd_meta[i] = NULL;
}
}
free(pd, M_MD_PROMISE);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
if (pv && pv->pv_meta != NULL) {
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = NULL;
}
if (pv && !pv->pv_started) {
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_PROMISE);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_promise(struct g_raid_md_object *md)
{
return (0);
}
G_RAID_MD_DECLARE(promise, "Promise");
Index: head/sys/geom/raid/tr_raid5.c
===================================================================
--- head/sys/geom/raid/tr_raid5.c (revision 327172)
+++ head/sys/geom/raid/tr_raid5.c (revision 327173)
@@ -1,423 +1,421 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
static MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data");
#define TR_RAID5_NONE 0
#define TR_RAID5_REBUILD 1
#define TR_RAID5_RESYNC 2
#define TR_RAID5_F_DOING_SOME 0x1
#define TR_RAID5_F_LOCKED 0x2
#define TR_RAID5_F_ABORT 0x4
struct g_raid_tr_raid5_object {
struct g_raid_tr_object trso_base;
int trso_starting;
int trso_stopping;
int trso_type;
int trso_recover_slabs; /* slabs before rest */
int trso_fair_io;
int trso_meta_update;
int trso_flags;
struct g_raid_subdisk *trso_failed_sd; /* like per volume */
void *trso_buffer; /* Buffer space */
struct bio trso_bio;
};
static g_raid_tr_taste_t g_raid_tr_taste_raid5;
static g_raid_tr_event_t g_raid_tr_event_raid5;
static g_raid_tr_start_t g_raid_tr_start_raid5;
static g_raid_tr_stop_t g_raid_tr_stop_raid5;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid5;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid5;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5;
static g_raid_tr_locked_t g_raid_tr_locked_raid5;
static g_raid_tr_free_t g_raid_tr_free_raid5;
static kobj_method_t g_raid_tr_raid5_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid5),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid5),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid5),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid5),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid5),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid5),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5),
KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid5),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid5),
{ 0, 0 }
};
static struct g_raid_tr_class g_raid_tr_raid5_class = {
"RAID5",
g_raid_tr_raid5_methods,
sizeof(struct g_raid_tr_raid5_object),
.trc_enable = 1,
.trc_priority = 100
};
static int
g_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
struct g_raid_tr_raid5_object *trs;
u_int qual;
trs = (struct g_raid_tr_raid5_object *)tr;
qual = tr->tro_volume->v_raid_level_qualifier;
if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 &&
(qual == G_RAID_VOLUME_RLQ_R4P0 ||
qual == G_RAID_VOLUME_RLQ_R4PN)) {
/* RAID4 */
} else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) &&
(qual == G_RAID_VOLUME_RLQ_R5RA ||
qual == G_RAID_VOLUME_RLQ_R5RS ||
qual == G_RAID_VOLUME_RLQ_R5LA ||
qual == G_RAID_VOLUME_RLQ_R5LS)) {
/* RAID5/5E/5EE/5R/6/MDF */
} else
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
static int
g_raid_tr_update_state_raid5(struct g_raid_volume *vol,
struct g_raid_subdisk *sd)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_softc *sc;
u_int s;
int na, ns, nu;
sc = vol->v_softc;
trs = (struct g_raid_tr_raid5_object *)vol->v_tr;
if (trs->trso_stopping &&
(trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0)
s = G_RAID_VOLUME_S_STOPPED;
else if (trs->trso_starting)
s = G_RAID_VOLUME_S_STARTING;
else {
na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
nu = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
if (na == vol->v_disks_count)
s = G_RAID_VOLUME_S_OPTIMAL;
else if (na + ns == vol->v_disks_count ||
na + ns + nu == vol->v_disks_count /* XXX: Temporary. */)
s = G_RAID_VOLUME_S_SUBOPTIMAL;
else if (na == vol->v_disks_count - 1 ||
na + ns + nu == vol->v_disks_count)
s = G_RAID_VOLUME_S_DEGRADED;
else
s = G_RAID_VOLUME_S_BROKEN;
}
if (s != vol->v_state) {
g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
G_RAID_EVENT_VOLUME);
g_raid_change_volume_state(vol, s);
if (!trs->trso_starting && !trs->trso_stopping)
g_raid_write_metadata(sc, vol, NULL, NULL);
}
return (0);
}
static int
g_raid_tr_event_raid5(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd, u_int event)
{
g_raid_tr_update_state_raid5(tr->tro_volume, sd);
return (0);
}
static int
g_raid_tr_start_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_volume *vol;
trs = (struct g_raid_tr_raid5_object *)tr;
trs->trso_starting = 0;
vol = tr->tro_volume;
vol->v_read_only = 1;
g_raid_tr_update_state_raid5(vol, NULL);
return (0);
}
static int
g_raid_tr_stop_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_volume *vol;
trs = (struct g_raid_tr_raid5_object *)tr;
vol = tr->tro_volume;
trs->trso_starting = 0;
trs->trso_stopping = 1;
g_raid_tr_update_state_raid5(vol, NULL);
return (0);
}
static void
g_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
char *addr;
off_t offset, start, length, nstripe, remain;
int no, pno, ddisks, pdisks, protate, pleft;
u_int strip_size, lvl, qual;
vol = tr->tro_volume;
addr = bp->bio_data;
strip_size = vol->v_strip_size;
lvl = tr->tro_volume->v_raid_level;
qual = tr->tro_volume->v_raid_level_qualifier;
protate = tr->tro_volume->v_rotate_parity;
/* Stripe number. */
nstripe = bp->bio_offset / strip_size;
/* Start position in stripe. */
start = bp->bio_offset % strip_size;
/* Number of data and parity disks. */
if (lvl == G_RAID_VOLUME_RL_RAIDMDF)
pdisks = tr->tro_volume->v_mdf_pdisks;
else if (lvl == G_RAID_VOLUME_RL_RAID5EE ||
lvl == G_RAID_VOLUME_RL_RAID6)
pdisks = 2;
else
pdisks = 1;
ddisks = vol->v_disks_count - pdisks;
/* Parity disk number. */
if (lvl == G_RAID_VOLUME_RL_RAID4) {
if (qual == 0) /* P0 */
pno = 0;
else /* PN */
pno = ddisks;
pleft = -1;
} else {
pno = (nstripe / (ddisks * protate)) % vol->v_disks_count;
pleft = protate - (nstripe / ddisks) % protate;
if (qual >= 2) { /* PN/Left */
pno = ddisks - pno;
if (pno < 0)
pno += vol->v_disks_count;
}
}
/* Data disk number. */
no = nstripe % ddisks;
if (lvl == G_RAID_VOLUME_RL_RAID4) {
if (qual == 0)
no += pdisks;
} else if (qual & 1) { /* Continuation/Symmetric */
no = (pno + pdisks + no) % vol->v_disks_count;
} else if (no >= pno) /* Restart/Asymmetric */
no += pdisks;
else
no += imax(0, pno + pdisks - vol->v_disks_count);
/* Stripe start position in disk. */
offset = (nstripe / ddisks) * strip_size;
/* Length of data to operate. */
remain = bp->bio_length;
bioq_init(&queue);
do {
length = MIN(strip_size - start, remain);
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_offset = offset + start;
cbp->bio_data = addr;
cbp->bio_length = length;
cbp->bio_caller1 = &vol->v_subdisks[no];
bioq_insert_tail(&queue, cbp);
no++;
if (lvl == G_RAID_VOLUME_RL_RAID4) {
no %= vol->v_disks_count;
if (no == pno)
no = (no + pdisks) % vol->v_disks_count;
} else if (qual & 1) { /* Continuation/Symmetric */
no %= vol->v_disks_count;
if (no == pno) {
if ((--pleft) <= 0) {
pleft += protate;
if (qual < 2) /* P0/Right */
pno++;
else /* PN/Left */
pno += vol->v_disks_count - 1;
pno %= vol->v_disks_count;
}
no = (pno + pdisks) % vol->v_disks_count;
offset += strip_size;
}
} else { /* Restart/Asymmetric */
if (no == pno)
no += pdisks;
if (no >= vol->v_disks_count) {
no -= vol->v_disks_count;
if ((--pleft) <= 0) {
pleft += protate;
if (qual < 2) /* P0/Right */
pno++;
else /* PN/Left */
pno += vol->v_disks_count - 1;
pno %= vol->v_disks_count;
}
if (no == pno)
no += pdisks;
else
no += imax(0, pno + pdisks - vol->v_disks_count);
offset += strip_size;
}
}
remain -= length;
addr += length;
start = 0;
} while (remain > 0);
while ((cbp = bioq_takefirst(&queue)) != NULL) {
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
while ((cbp = bioq_takefirst(&queue)) != NULL)
g_destroy_bio(cbp);
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
static void
g_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
- struct g_raid_tr_raid5_object *trs;
vol = tr->tro_volume;
- trs = (struct g_raid_tr_raid5_object *)tr;
if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) {
g_raid_iodone(bp, EIO);
return;
}
switch (bp->bio_cmd) {
case BIO_READ:
g_raid_tr_iostart_raid5_read(tr, bp);
break;
case BIO_WRITE:
case BIO_DELETE:
case BIO_FLUSH:
g_raid_iodone(bp, ENODEV);
break;
default:
KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
bp->bio_cmd, vol->v_name));
break;
}
}
static void
g_raid_tr_iodone_raid5(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd, struct bio *bp)
{
struct bio *pbp;
pbp = bp->bio_parent;
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
pbp->bio_inbed++;
g_destroy_bio(bp);
if (pbp->bio_children == pbp->bio_inbed) {
pbp->bio_completed = pbp->bio_length;
g_raid_iodone(pbp, pbp->bio_error);
}
}
static int
g_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
return (ENODEV);
}
static int
g_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp)
{
struct bio *bp;
struct g_raid_subdisk *sd;
bp = (struct bio *)argp;
sd = (struct g_raid_subdisk *)bp->bio_caller1;
g_raid_subdisk_iostart(sd, bp);
return (0);
}
static int
g_raid_tr_free_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
trs = (struct g_raid_tr_raid5_object *)tr;
if (trs->trso_buffer != NULL) {
free(trs->trso_buffer, M_TR_RAID5);
trs->trso_buffer = NULL;
}
return (0);
}
G_RAID_TR_DECLARE(raid5, "RAID5");
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c (revision 327172)
+++ head/sys/kern/kern_synch.c (revision 327173)
@@ -1,576 +1,572 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <machine/cpu.h>
static void synch_setup(void *dummy);
SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
NULL);
int hogticks;
static uint8_t pause_wchan[MAXCPU];
static struct callout loadav_callout;
struct loadavg averunnable =
{ {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
/*
* Constants for averages over 1, 5, and 15 minutes
* when sampling at 5 second intervals.
*/
static fixpt_t cexp[3] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, "");
static void loadav(void *arg);
SDT_PROVIDER_DECLARE(sched);
SDT_PROBE_DEFINE(sched, , , preempt);
static void
sleepinit(void *unused)
{
hogticks = (hz / 10) * 2; /* Default only. */
init_sleepqueues();
}
/*
* vmem tries to lock the sleepq mutexes when free'ing kva, so make sure
* it is available.
*/
SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0);
/*
* General sleep call. Suspends the current thread until a wakeup is
* performed on the specified identifier. The thread will then be made
* runnable with the specified priority. Sleeps at most sbt units of time
* (0 means no timeout). If pri includes the PCATCH flag, let signals
* interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal becomes pending, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal (return EINTR).
*
* The lock argument is unlocked before the caller is suspended, and
* re-locked before _sleep() returns. If priority includes the PDROP
* flag the lock is not re-locked before returning.
*/
int
_sleep(void *ident, struct lock_object *lock, int priority,
const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
struct thread *td;
- struct proc *p;
struct lock_class *class;
uintptr_t lock_state;
int catch, pri, rval, sleepq_flags;
WITNESS_SAVE_DECL(lock_witness);
td = curthread;
- p = td->td_proc;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0, wmesg);
#endif
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Sleeping on \"%s\"", wmesg);
KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
("sleeping without a lock"));
KASSERT(ident != NULL, ("_sleep: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
if (lock != NULL)
class = LOCK_CLASS(lock);
else
class = NULL;
if (SCHEDULER_STOPPED_TD(td)) {
if (lock != NULL && priority & PDROP)
class->lc_unlock(lock);
return (0);
}
catch = priority & PCATCH;
pri = priority & PRIMASK;
KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep"));
if ((uint8_t *)ident >= &pause_wchan[0] &&
(uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
sleepq_flags = SLEEPQ_PAUSE;
else
sleepq_flags = SLEEPQ_SLEEP;
if (catch)
sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
sleepq_lock(ident);
CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
- td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+ td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
if (lock == &Giant.lock_object)
mtx_assert(&Giant, MA_OWNED);
DROP_GIANT();
if (lock != NULL && lock != &Giant.lock_object &&
!(class->lc_flags & LC_SLEEPABLE)) {
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
} else
/* GCC needs to follow the Yellow Brick Road */
lock_state = -1;
/*
* We put ourselves on the sleep queue and start our timeout
* before calling thread_suspend_check, as we could stop there,
* and a wakeup or a SIGCONT (or both) could occur while we were
* stopped without resuming us. Thus, we must be ready for sleep
* when cursig() is called. If the wakeup happens while we're
* stopped, then td will no longer be on a sleep queue upon
* return from cursig().
*/
sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
if (sbt != 0)
sleepq_set_timeout_sbt(ident, sbt, pr, flags);
if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
sleepq_release(ident);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
sleepq_lock(ident);
}
if (sbt != 0 && catch)
rval = sleepq_timedwait_sig(ident, pri);
else if (sbt != 0)
rval = sleepq_timedwait(ident, pri);
else if (catch)
rval = sleepq_wait_sig(ident, pri);
else {
sleepq_wait(ident, pri);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0, wmesg);
#endif
PICKUP_GIANT();
if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
class->lc_lock(lock, lock_state);
WITNESS_RESTORE(lock, lock_witness);
}
return (rval);
}
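A minimal sketch (not part of this change) of how the sleep primitives documented above are commonly used: msleep() is the mutex-aware wrapper around _sleep(), and wakeup() broadcasts on the same wait channel. All names here (example_mtx, example_ready, example_wait, example_post) are hypothetical.
/*
 * Hypothetical consumer/producer pair: the consumer sleeps on &example_ready
 * while holding example_mtx; msleep() drops the mutex for the duration of
 * the sleep and re-locks it before returning.  PCATCH lets signals interrupt
 * the wait; the hz / 2 timeout makes msleep() return EWOULDBLOCK if nothing
 * arrives in time.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
static struct mtx example_mtx;		/* hypothetical driver lock */
static int example_ready;		/* hypothetical condition */
MTX_SYSINIT(example_mtx, &example_mtx, "example", MTX_DEF);
static int
example_wait(void)
{
	int error;
	mtx_lock(&example_mtx);
	while (!example_ready) {
		error = msleep(&example_ready, &example_mtx, PCATCH,
		    "exwait", hz / 2);
		if (error != 0) {	/* EWOULDBLOCK, EINTR or ERESTART */
			mtx_unlock(&example_mtx);
			return (error);
		}
	}
	example_ready = 0;
	mtx_unlock(&example_mtx);
	return (0);
}
static void
example_post(void)
{
	mtx_lock(&example_mtx);
	example_ready = 1;
	wakeup(&example_ready);		/* wakes every sleeper on the channel */
	mtx_unlock(&example_mtx);
}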
int
msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
sbintime_t sbt, sbintime_t pr, int flags)
{
struct thread *td;
- struct proc *p;
int rval;
WITNESS_SAVE_DECL(mtx);
td = curthread;
- p = td->td_proc;
KASSERT(mtx != NULL, ("sleeping without a mutex"));
KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running"));
if (SCHEDULER_STOPPED_TD(td))
return (0);
sleepq_lock(ident);
CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
- td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+ td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
DROP_GIANT();
mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
WITNESS_SAVE(&mtx->lock_object, mtx);
mtx_unlock_spin(mtx);
/*
* We put ourselves on the sleep queue and start our timeout.
*/
sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
if (sbt != 0)
sleepq_set_timeout_sbt(ident, sbt, pr, flags);
/*
* Can't call ktrace with any spin locks held so it can lock the
* ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
* any spin lock. Thus, we have to drop the sleepq spin lock while
* we handle those requests. This is safe since we have placed our
* thread on the sleep queue already.
*/
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW)) {
sleepq_release(ident);
ktrcsw(1, 0, wmesg);
sleepq_lock(ident);
}
#endif
#ifdef WITNESS
sleepq_release(ident);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
wmesg);
sleepq_lock(ident);
#endif
if (sbt != 0)
rval = sleepq_timedwait(ident, 0);
else {
sleepq_wait(ident, 0);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0, wmesg);
#endif
PICKUP_GIANT();
mtx_lock_spin(mtx);
WITNESS_RESTORE(&mtx->lock_object, mtx);
return (rval);
}
/*
* pause() delays the calling thread by the given number of system ticks.
* During cold bootup, pause() uses the DELAY() function instead of
* the tsleep() function to do the waiting. The "timo" argument must be
* greater than or equal to zero. A "timo" value of zero is equivalent
* to a "timo" value of one.
*/
int
pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
KASSERT(sbt >= 0, ("pause: timeout must be >= 0"));
/* silently convert invalid timeouts */
if (sbt == 0)
sbt = tick_sbt;
if ((cold && curthread == &thread0) || kdb_active ||
SCHEDULER_STOPPED()) {
/*
* We delay one second at a time to avoid overflowing the
* system specific DELAY() function(s):
*/
while (sbt >= SBT_1S) {
DELAY(1000000);
sbt -= SBT_1S;
}
/* Do the delay remainder, if any */
sbt = howmany(sbt, SBT_1US);
if (sbt > 0)
DELAY(sbt);
return (0);
}
return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags));
}
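A short sketch, assuming a hypothetical polling callback, of the usual pause() pattern for waits that have no wait channel; pause() is a thin wrapper around pause_sbt() above, so it also works during early boot by falling back to DELAY().
/*
 * Hypothetical attach-time poll loop: sleep ~100 ms between checks instead
 * of spinning.  example_wait_for_ready() and ready_fn are illustrative only.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
static int
example_wait_for_ready(int (*ready_fn)(void))
{
	int tries;
	for (tries = 0; tries < 50; tries++) {
		if (ready_fn())
			return (0);
		pause("exrdy", hz / 10);	/* ~100 ms per iteration */
	}
	return (1);			/* caller treats nonzero as timeout */
}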
/*
* Make all threads sleeping on the specified identifier runnable.
*/
void
wakeup(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper) {
KASSERT(ident != &proc0,
("wakeup and wakeup_swapper and proc0"));
kick_proc0();
}
}
/*
* Make a thread sleeping on the specified identifier runnable.
* May wake more than one thread if a target thread is currently
* swapped out.
*/
void
wakeup_one(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper)
kick_proc0();
}
static void
kdb_switch(void)
{
thread_unlock(curthread);
kdb_backtrace();
kdb_reenter();
panic("%s: did not reenter debugger", __func__);
}
/*
* The machine independent parts of context switching.
*/
void
mi_switch(int flags, struct thread *newtd)
{
uint64_t runtime, new_switchtime;
struct thread *td;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
mtx_assert(&Giant, MA_NOTOWNED);
#endif
KASSERT(td->td_critnest == 1 || panicstr,
("mi_switch: switch in a critical section"));
KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
("mi_switch: switch must be voluntary or involuntary"));
KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
/*
* Don't perform context switches from the debugger.
*/
if (kdb_active)
kdb_switch();
if (SCHEDULER_STOPPED_TD(td))
return;
if (flags & SW_VOL) {
td->td_ru.ru_nvcsw++;
td->td_swvoltick = ticks;
} else {
td->td_ru.ru_nivcsw++;
td->td_swinvoltick = ticks;
}
#ifdef SCHED_STATS
SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
/*
* Compute the amount of time during which the current
* thread was running, and add that to its total so far.
*/
new_switchtime = cpu_ticks();
runtime = new_switchtime - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, new_switchtime);
td->td_generation++; /* bump preempt-detect counter */
VM_CNT_INC(v_swtch);
PCPU_SET(switchticks, ticks);
CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
#ifdef KDTRACE_HOOKS
if ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 &&
(flags & SW_TYPE_MASK) == SWT_NEEDRESCHED))
SDT_PROBE0(sched, , , preempt);
#endif
sched_switch(td, newtd, flags);
CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
/*
* If the last thread was exiting, finish cleaning it up.
*/
if ((td = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(td);
}
}
/*
* Change thread state to be runnable, placing it on the run queue if
* it is in memory. If it is swapped out, return true so our caller
* will know to awaken the swapper.
*/
int
setrunnable(struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
switch (td->td_state) {
case TDS_RUNNING:
case TDS_RUNQ:
return (0);
case TDS_INHIBITED:
/*
* If we are only inhibited because we are swapped out
* then arrange to swap in this process. Otherwise just return.
*/
if (td->td_inhibitors != TDI_SWAPPED)
return (0);
/* FALLTHROUGH */
case TDS_CAN_RUN:
break;
default:
printf("state is 0x%x", td->td_state);
panic("setrunnable(2)");
}
if ((td->td_flags & TDF_INMEM) == 0) {
if ((td->td_flags & TDF_SWAPINREQ) == 0) {
td->td_flags |= TDF_SWAPINREQ;
return (1);
}
} else
sched_wakeup(td);
return (0);
}
/*
* Compute a tenex style load average of a quantity on
* 1, 5 and 15 minute intervals.
*/
static void
loadav(void *arg)
{
int i, nrun;
struct loadavg *avg;
nrun = sched_load();
avg = &averunnable;
for (i = 0; i < 3; i++)
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
/*
* Schedule the next update to occur after 5 seconds, but add a
* random variation to avoid synchronisation with processes that
* run at regular intervals.
*/
callout_reset_sbt(&loadav_callout,
SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US,
loadav, NULL, C_DIRECT_EXEC | C_PREL(32));
}
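For illustration only, a small userland program (hypothetical, not kernel code) that reproduces the decay constants and the fixed-point update used by loadav() above: each decay factor is exp(-interval / period) for a 5-second sampling interval and 1/5/15-minute periods, and the update is avg = avg * cexp + nrun * (1 - cexp). FSHIFT and FSCALE are assumed to match their <sys/param.h> values.
/* Compile with: cc loadav_demo.c -lm */
#include <math.h>
#include <stdio.h>
#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)
int
main(void)
{
	const double period[3] = { 60.0, 300.0, 900.0 };
	long cexp[3], ldavg[3] = { 0, 0, 0 };
	int i, step, nrun = 2;		/* pretend 2 runnable threads */
	/* Same constants as the kernel table: exp(-1/12), exp(-1/60), exp(-1/180). */
	for (i = 0; i < 3; i++)
		cexp[i] = (long)(exp(-5.0 / period[i]) * FSCALE);
	/* A few 5-second steps; ldavg converges toward nrun * FSCALE. */
	for (step = 0; step < 12; step++)
		for (i = 0; i < 3; i++)
			ldavg[i] = (cexp[i] * ldavg[i] +
			    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	for (i = 0; i < 3; i++)
		printf("%2.0f min: %.2f\n", period[i] / 60.0,
		    (double)ldavg[i] / FSCALE);
	return (0);
}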
/* ARGSUSED */
static void
synch_setup(void *dummy)
{
callout_init(&loadav_callout, 1);
/* Kick off timeout driven events by calling first time. */
loadav(NULL);
}
int
should_yield(void)
{
return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks);
}
void
maybe_yield(void)
{
if (should_yield())
kern_yield(PRI_USER);
}
void
kern_yield(int prio)
{
struct thread *td;
td = curthread;
DROP_GIANT();
thread_lock(td);
if (prio == PRI_USER)
prio = td->td_user_pri;
if (prio >= 0)
sched_prio(td, prio);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
PICKUP_GIANT();
}
/*
* General purpose yield system call.
*/
int
sys_yield(struct thread *td, struct yield_args *uap)
{
thread_lock(td);
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
td->td_retval[0] = 0;
return (0);
}
Index: head/sys/kern/link_elf.c
===================================================================
--- head/sys/kern/link_elf.c (revision 327172)
+++ head/sys/kern/link_elf.c (revision 327173)
@@ -1,1660 +1,1652 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998-2000 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_gdb.h"
#include <sys/param.h>
#include <sys/systm.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mount.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/linker.h>
#include <machine/elf.h>
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#ifdef SPARSE_MAPPING
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#endif
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/link_elf.h>
#ifdef DDB_CTF
#include <sys/zlib.h>
#endif
#include "linker_if.h"
#define MAXSEGS 4
typedef struct elf_file {
struct linker_file lf; /* Common fields */
int preloaded; /* Was file pre-loaded */
caddr_t address; /* Relocation address */
#ifdef SPARSE_MAPPING
vm_object_t object; /* VM object to hold file pages */
#endif
Elf_Dyn *dynamic; /* Symbol table etc. */
Elf_Hashelt nbuckets; /* DT_HASH info */
Elf_Hashelt nchains;
const Elf_Hashelt *buckets;
const Elf_Hashelt *chains;
caddr_t hash;
caddr_t strtab; /* DT_STRTAB */
int strsz; /* DT_STRSZ */
const Elf_Sym *symtab; /* DT_SYMTAB */
Elf_Addr *got; /* DT_PLTGOT */
const Elf_Rel *pltrel; /* DT_JMPREL */
int pltrelsize; /* DT_PLTRELSZ */
const Elf_Rela *pltrela; /* DT_JMPREL */
int pltrelasize; /* DT_PLTRELSZ */
const Elf_Rel *rel; /* DT_REL */
int relsize; /* DT_RELSZ */
const Elf_Rela *rela; /* DT_RELA */
int relasize; /* DT_RELASZ */
caddr_t modptr;
const Elf_Sym *ddbsymtab; /* The symbol table we are using */
long ddbsymcnt; /* Number of symbols */
caddr_t ddbstrtab; /* String table */
long ddbstrcnt; /* number of bytes in string table */
caddr_t symbase; /* malloc'ed symbol base */
caddr_t strbase; /* malloc'ed string base */
caddr_t ctftab; /* CTF table */
long ctfcnt; /* number of bytes in CTF table */
caddr_t ctfoff; /* CTF offset table */
caddr_t typoff; /* Type offset table */
long typlen; /* Number of type entries. */
Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */
Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */
Elf_Addr pcpu_base; /* Relocated pcpu set address. */
#ifdef VIMAGE
Elf_Addr vnet_start; /* Pre-relocation vnet set start. */
Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */
Elf_Addr vnet_base; /* Relocated vnet set address. */
#endif
#ifdef GDB
struct link_map gdb; /* hooks for gdb */
#endif
} *elf_file_t;
struct elf_set {
Elf_Addr es_start;
Elf_Addr es_stop;
Elf_Addr es_base;
TAILQ_ENTRY(elf_set) es_link;
};
TAILQ_HEAD(elf_set_head, elf_set);
#include <kern/kern_ctf.c>
static int link_elf_link_common_finish(linker_file_t);
static int link_elf_link_preload(linker_class_t cls,
const char *, linker_file_t *);
static int link_elf_link_preload_finish(linker_file_t);
static int link_elf_load_file(linker_class_t, const char *,
linker_file_t *);
static int link_elf_lookup_symbol(linker_file_t, const char *,
c_linker_sym_t *);
static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
linker_symval_t *);
static int link_elf_search_symbol(linker_file_t, caddr_t,
c_linker_sym_t *, long *);
static void link_elf_unload_file(linker_file_t);
static void link_elf_unload_preload(linker_file_t);
static int link_elf_lookup_set(linker_file_t, const char *,
void ***, void ***, int *);
static int link_elf_each_function_name(linker_file_t,
int (*)(const char *, void *), void *);
static int link_elf_each_function_nameval(linker_file_t,
linker_function_nameval_callback_t, void *);
static void link_elf_reloc_local(linker_file_t);
static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
static long link_elf_strtab_get(linker_file_t, caddr_t *);
static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *);
static kobj_method_t link_elf_methods[] = {
KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
KOBJMETHOD(linker_unload, link_elf_unload_file),
KOBJMETHOD(linker_load_file, link_elf_load_file),
KOBJMETHOD(linker_link_preload, link_elf_link_preload),
KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
{ 0, 0 }
};
static struct linker_class link_elf_class = {
#if ELF_TARG_CLASS == ELFCLASS32
"elf32",
#else
"elf64",
#endif
link_elf_methods, sizeof(struct elf_file)
};
static int parse_dynamic(elf_file_t);
static int relocate_file(elf_file_t);
static int link_elf_preload_parse_symbols(elf_file_t);
static struct elf_set_head set_pcpu_list;
#ifdef VIMAGE
static struct elf_set_head set_vnet_list;
#endif
static void
elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base)
{
struct elf_set *set, *iter;
set = malloc(sizeof(*set), M_LINKER, M_WAITOK);
set->es_start = start;
set->es_stop = stop;
set->es_base = base;
TAILQ_FOREACH(iter, list, es_link) {
KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) ||
(set->es_start > iter->es_start && set->es_stop > iter->es_stop),
("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx",
(uintmax_t)set->es_start, (uintmax_t)set->es_stop,
(uintmax_t)iter->es_start, (uintmax_t)iter->es_stop));
if (iter->es_start > set->es_start) {
TAILQ_INSERT_BEFORE(iter, set, es_link);
break;
}
}
if (iter == NULL)
TAILQ_INSERT_TAIL(list, set, es_link);
}
static int
elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base)
{
struct elf_set *set;
TAILQ_FOREACH(set, list, es_link) {
if (addr < set->es_start)
return (0);
if (addr < set->es_stop) {
*start = set->es_start;
*base = set->es_base;
return (1);
}
}
return (0);
}
static void
elf_set_delete(struct elf_set_head *list, Elf_Addr start)
{
struct elf_set *set;
TAILQ_FOREACH(set, list, es_link) {
if (start < set->es_start)
break;
if (start == set->es_start) {
TAILQ_REMOVE(list, set, es_link);
free(set, M_LINKER);
return;
}
}
KASSERT(0, ("deleting unknown linker set (start = 0x%jx)",
(uintmax_t)start));
}
#ifdef GDB
static void r_debug_state(struct r_debug *, struct link_map *);
/*
* A list of loaded modules for GDB to use for loading symbols.
*/
struct r_debug r_debug;
#define GDB_STATE(s) do { \
r_debug.r_state = s; r_debug_state(NULL, NULL); \
} while (0)
/*
* Function for the debugger to set a breakpoint on to gain control.
*/
static void
r_debug_state(struct r_debug *dummy_one __unused,
struct link_map *dummy_two __unused)
{
}
static void
link_elf_add_gdb(struct link_map *l)
{
struct link_map *prev;
l->l_next = NULL;
if (r_debug.r_map == NULL) {
/* Add first. */
l->l_prev = NULL;
r_debug.r_map = l;
} else {
/* Append to list. */
for (prev = r_debug.r_map;
prev->l_next != NULL;
prev = prev->l_next)
;
l->l_prev = prev;
prev->l_next = l;
}
}
static void
link_elf_delete_gdb(struct link_map *l)
{
if (l->l_prev == NULL) {
/* Remove first. */
if ((r_debug.r_map = l->l_next) != NULL)
l->l_next->l_prev = NULL;
} else {
/* Remove any but first. */
if ((l->l_prev->l_next = l->l_next) != NULL)
l->l_next->l_prev = l->l_prev;
}
}
#endif /* GDB */
/*
* The kernel symbol table starts here.
*/
extern struct _dynamic _DYNAMIC;
static void
link_elf_error(const char *filename, const char *s)
{
if (filename == NULL)
printf("kldload: %s\n", s);
else
printf("kldload: %s: %s\n", filename, s);
}
static void
link_elf_invoke_ctors(caddr_t addr, size_t size)
{
void (**ctor)(void);
size_t i, cnt;
if (addr == NULL || size == 0)
return;
cnt = size / sizeof(*ctor);
ctor = (void *)addr;
for (i = 0; i < cnt; i++) {
if (ctor[i] != NULL)
(*ctor[i])();
}
}
/*
* Actions performed after linking/loading both the preloaded kernel and any
* modules, whether preloaded or dynamically loaded.
*/
static int
link_elf_link_common_finish(linker_file_t lf)
{
#ifdef GDB
elf_file_t ef = (elf_file_t)lf;
char *newfilename;
#endif
int error;
/* Notify MD code that a module is being loaded. */
error = elf_cpu_load_file(lf);
if (error != 0)
return (error);
#ifdef GDB
GDB_STATE(RT_ADD);
ef->gdb.l_addr = lf->address;
newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
strcpy(newfilename, lf->filename);
ef->gdb.l_name = newfilename;
ef->gdb.l_ld = ef->dynamic;
link_elf_add_gdb(&ef->gdb);
GDB_STATE(RT_CONSISTENT);
#endif
/* Invoke .ctors */
link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size);
return (0);
}
extern vm_offset_t __startkernel;
static void
link_elf_init(void* arg)
{
Elf_Dyn *dp;
Elf_Addr *ctors_addrp;
Elf_Size *ctors_sizep;
caddr_t modptr, baseptr, sizeptr;
elf_file_t ef;
char *modname;
linker_add_class(&link_elf_class);
dp = (Elf_Dyn *)&_DYNAMIC;
modname = NULL;
modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel");
if (modptr == NULL)
modptr = preload_search_by_type("elf kernel");
modname = (char *)preload_search_info(modptr, MODINFO_NAME);
if (modname == NULL)
modname = "kernel";
linker_kernel_file = linker_make_file(modname, &link_elf_class);
if (linker_kernel_file == NULL)
panic("%s: Can't create linker structures for kernel",
__func__);
ef = (elf_file_t) linker_kernel_file;
ef->preloaded = 1;
#ifdef __powerpc__
ef->address = (caddr_t) (__startkernel - KERNBASE);
#else
ef->address = 0;
#endif
#ifdef SPARSE_MAPPING
ef->object = 0;
#endif
ef->dynamic = dp;
if (dp != NULL)
parse_dynamic(ef);
linker_kernel_file->address += KERNBASE;
linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
if (modptr != NULL) {
ef->modptr = modptr;
baseptr = preload_search_info(modptr, MODINFO_ADDR);
if (baseptr != NULL)
linker_kernel_file->address = *(caddr_t *)baseptr;
sizeptr = preload_search_info(modptr, MODINFO_SIZE);
if (sizeptr != NULL)
linker_kernel_file->size = *(size_t *)sizeptr;
ctors_addrp = (Elf_Addr *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_ADDR);
ctors_sizep = (Elf_Size *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_SIZE);
if (ctors_addrp != NULL && ctors_sizep != NULL) {
linker_kernel_file->ctors_addr = ef->address +
*ctors_addrp;
linker_kernel_file->ctors_size = *ctors_sizep;
}
}
(void)link_elf_preload_parse_symbols(ef);
#ifdef GDB
r_debug.r_map = NULL;
r_debug.r_brk = r_debug_state;
r_debug.r_state = RT_CONSISTENT;
#endif
(void)link_elf_link_common_finish(linker_kernel_file);
linker_kernel_file->flags |= LINKER_FILE_LINKED;
TAILQ_INIT(&set_pcpu_list);
#ifdef VIMAGE
TAILQ_INIT(&set_vnet_list);
#endif
}
SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
static int
link_elf_preload_parse_symbols(elf_file_t ef)
{
caddr_t pointer;
caddr_t ssym, esym, base;
caddr_t strtab;
int strcnt;
Elf_Sym *symtab;
int symcnt;
if (ef->modptr == NULL)
return (0);
pointer = preload_search_info(ef->modptr,
MODINFO_METADATA | MODINFOMD_SSYM);
if (pointer == NULL)
return (0);
ssym = *(caddr_t *)pointer;
pointer = preload_search_info(ef->modptr,
MODINFO_METADATA | MODINFOMD_ESYM);
if (pointer == NULL)
return (0);
esym = *(caddr_t *)pointer;
base = ssym;
symcnt = *(long *)base;
base += sizeof(long);
symtab = (Elf_Sym *)base;
base += roundup(symcnt, sizeof(long));
if (base > esym || base < ssym) {
printf("Symbols are corrupt!\n");
return (EINVAL);
}
strcnt = *(long *)base;
base += sizeof(long);
strtab = base;
base += roundup(strcnt, sizeof(long));
if (base > esym || base < ssym) {
printf("Symbols are corrupt!\n");
return (EINVAL);
}
ef->ddbsymtab = symtab;
ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
ef->ddbstrtab = strtab;
ef->ddbstrcnt = strcnt;
return (0);
}
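/*
 * Editor's note (illustrative, not part of the driver code): the MODINFOMD_SSYM
 * blob parsed above is laid out as two length-prefixed regions, each length
 * stored in bytes and rounded up to a long boundary:
 *
 *   [ long symsize ][ Elf_Sym symtab[] ... pad ][ long strsize ][ strtab ... ]
 *
 * The following standalone userland sketch (all names hypothetical) builds a
 * tiny blob with that shape and walks it with the same arithmetic, purely to
 * make the layout concrete.
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>
#include <string.h>

#define RNDUP(x, y)	(((x) + (y) - 1) / (y) * (y))	/* like roundup() */

struct fake_sym { unsigned long st_name, st_value; };	/* stand-in for Elf_Sym */

int
main(void)
{
	long blob_store[32];				/* long-aligned backing store */
	char *blob = (char *)blob_store, *base = blob, *end;
	struct fake_sym syms[2] = { { 1, 0x100 }, { 5, 0x200 } };
	const char strings[] = "\0one\0two";
	long symsize = sizeof(syms), strsize = sizeof(strings);

	/* Build: [symsize][syms...][strsize][strings...] */
	memcpy(base, &symsize, sizeof(long));	base += sizeof(long);
	memcpy(base, syms, symsize);		base += RNDUP(symsize, sizeof(long));
	memcpy(base, &strsize, sizeof(long));	base += sizeof(long);
	memcpy(base, strings, strsize);		base += RNDUP(strsize, sizeof(long));
	end = base;

	/* Walk it the same way link_elf_preload_parse_symbols() does. */
	base = blob;
	symsize = *(long *)base;		base += sizeof(long);
	struct fake_sym *symtab = (struct fake_sym *)base;
	base += RNDUP(symsize, sizeof(long));
	strsize = *(long *)base;		base += sizeof(long);
	char *strtab = base;			base += RNDUP(strsize, sizeof(long));

	printf("%ld symbols, first name \"%s\", consumed %td of %td bytes\n",
	    symsize / (long)sizeof(struct fake_sym),
	    strtab + symtab[0].st_name, base - blob, end - blob);
	return (0);
}
#endif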
static int
parse_dynamic(elf_file_t ef)
{
Elf_Dyn *dp;
int plttype = DT_REL;
for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
switch (dp->d_tag) {
case DT_HASH:
{
/* From src/libexec/rtld-elf/rtld.c */
const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
(ef->address + dp->d_un.d_ptr);
ef->nbuckets = hashtab[0];
ef->nchains = hashtab[1];
ef->buckets = hashtab + 2;
ef->chains = ef->buckets + ef->nbuckets;
break;
}
case DT_STRTAB:
ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
break;
case DT_STRSZ:
ef->strsz = dp->d_un.d_val;
break;
case DT_SYMTAB:
ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
break;
case DT_SYMENT:
if (dp->d_un.d_val != sizeof(Elf_Sym))
return (ENOEXEC);
break;
case DT_PLTGOT:
ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
break;
case DT_REL:
ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
break;
case DT_RELSZ:
ef->relsize = dp->d_un.d_val;
break;
case DT_RELENT:
if (dp->d_un.d_val != sizeof(Elf_Rel))
return (ENOEXEC);
break;
case DT_JMPREL:
ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
break;
case DT_PLTRELSZ:
ef->pltrelsize = dp->d_un.d_val;
break;
case DT_RELA:
ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
break;
case DT_RELASZ:
ef->relasize = dp->d_un.d_val;
break;
case DT_RELAENT:
if (dp->d_un.d_val != sizeof(Elf_Rela))
return (ENOEXEC);
break;
case DT_PLTREL:
plttype = dp->d_un.d_val;
if (plttype != DT_REL && plttype != DT_RELA)
return (ENOEXEC);
break;
#ifdef GDB
case DT_DEBUG:
dp->d_un.d_ptr = (Elf_Addr)&r_debug;
break;
#endif
}
}
if (plttype == DT_RELA) {
ef->pltrela = (const Elf_Rela *)ef->pltrel;
ef->pltrel = NULL;
ef->pltrelasize = ef->pltrelsize;
ef->pltrelsize = 0;
}
ef->ddbsymtab = ef->symtab;
ef->ddbsymcnt = ef->nchains;
ef->ddbstrtab = ef->strtab;
ef->ddbstrcnt = ef->strsz;
return (0);
}
static int
parse_dpcpu(elf_file_t ef)
{
int count;
int error;
ef->pcpu_start = 0;
ef->pcpu_stop = 0;
error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start,
(void ***)&ef->pcpu_stop, &count);
/* Error just means there is no pcpu set to relocate. */
if (error != 0)
return (0);
count *= sizeof(void *);
/*
* Allocate space in the primary pcpu area. Copy in our
* initialization from the data section and then initialize
* all per-cpu storage from that.
*/
ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count);
if (ef->pcpu_base == 0)
return (ENOSPC);
memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count);
dpcpu_copy((void *)ef->pcpu_base, count);
elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop,
ef->pcpu_base);
return (0);
}
#ifdef VIMAGE
static int
parse_vnet(elf_file_t ef)
{
int count;
int error;
ef->vnet_start = 0;
ef->vnet_stop = 0;
error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start,
(void ***)&ef->vnet_stop, &count);
/* Error just means there is no vnet data set to relocate. */
if (error != 0)
return (0);
count *= sizeof(void *);
/*
* Allocate space in the primary vnet area. Copy in our
* initialization from the data section and then initialize
* all per-vnet storage from that.
*/
ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count);
if (ef->vnet_base == 0)
return (ENOSPC);
memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count);
vnet_data_copy((void *)ef->vnet_base, count);
elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop,
ef->vnet_base);
return (0);
}
#endif
static int
link_elf_link_preload(linker_class_t cls,
const char* filename, linker_file_t *result)
{
Elf_Addr *ctors_addrp;
Elf_Size *ctors_sizep;
caddr_t modptr, baseptr, sizeptr, dynptr;
char *type;
elf_file_t ef;
linker_file_t lf;
int error;
vm_offset_t dp;
/* Look to see if we have the file preloaded */
modptr = preload_search_by_name(filename);
if (modptr == NULL)
return (ENOENT);
type = (char *)preload_search_info(modptr, MODINFO_TYPE);
baseptr = preload_search_info(modptr, MODINFO_ADDR);
sizeptr = preload_search_info(modptr, MODINFO_SIZE);
dynptr = preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_DYNAMIC);
if (type == NULL ||
(strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 &&
strcmp(type, "elf module") != 0))
return (EFTYPE);
if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
return (EINVAL);
lf = linker_make_file(filename, &link_elf_class);
if (lf == NULL)
return (ENOMEM);
ef = (elf_file_t) lf;
ef->preloaded = 1;
ef->modptr = modptr;
ef->address = *(caddr_t *)baseptr;
#ifdef SPARSE_MAPPING
ef->object = 0;
#endif
dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
ef->dynamic = (Elf_Dyn *)dp;
lf->address = ef->address;
lf->size = *(size_t *)sizeptr;
ctors_addrp = (Elf_Addr *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_ADDR);
ctors_sizep = (Elf_Size *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_SIZE);
if (ctors_addrp != NULL && ctors_sizep != NULL) {
lf->ctors_addr = ef->address + *ctors_addrp;
lf->ctors_size = *ctors_sizep;
}
error = parse_dynamic(ef);
if (error == 0)
error = parse_dpcpu(ef);
#ifdef VIMAGE
if (error == 0)
error = parse_vnet(ef);
#endif
if (error != 0) {
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
return (error);
}
link_elf_reloc_local(lf);
*result = lf;
return (0);
}
static int
link_elf_link_preload_finish(linker_file_t lf)
{
elf_file_t ef;
int error;
ef = (elf_file_t) lf;
error = relocate_file(ef);
if (error != 0)
return (error);
(void)link_elf_preload_parse_symbols(ef);
return (link_elf_link_common_finish(lf));
}
static int
link_elf_load_file(linker_class_t cls, const char* filename,
linker_file_t* result)
{
struct nameidata nd;
struct thread* td = curthread; /* XXX */
Elf_Ehdr *hdr;
caddr_t firstpage;
int nbytes, i;
Elf_Phdr *phdr;
Elf_Phdr *phlimit;
Elf_Phdr *segs[MAXSEGS];
int nsegs;
Elf_Phdr *phdyn;
- Elf_Phdr *phphdr;
caddr_t mapbase;
size_t mapsize;
- Elf_Off base_offset;
Elf_Addr base_vaddr;
Elf_Addr base_vlimit;
int error = 0;
ssize_t resid;
int flags;
elf_file_t ef;
linker_file_t lf;
Elf_Shdr *shdr;
int symtabindex;
int symstrindex;
int shstrindex;
int symcnt;
int strcnt;
char *shstrs;
shdr = NULL;
lf = NULL;
shstrs = NULL;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG) {
error = ENOEXEC;
firstpage = NULL;
goto out;
}
#ifdef MAC
error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp);
if (error != 0) {
firstpage = NULL;
goto out;
}
#endif
/*
* Read the elf header from the file.
*/
firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
hdr = (Elf_Ehdr *)firstpage;
error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
nbytes = PAGE_SIZE - resid;
if (error != 0)
goto out;
if (!IS_ELF(*hdr)) {
error = ENOEXEC;
goto out;
}
if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
link_elf_error(filename, "Unsupported file layout");
error = ENOEXEC;
goto out;
}
if (hdr->e_ident[EI_VERSION] != EV_CURRENT ||
hdr->e_version != EV_CURRENT) {
link_elf_error(filename, "Unsupported file version");
error = ENOEXEC;
goto out;
}
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
error = ENOSYS;
goto out;
}
if (hdr->e_machine != ELF_TARG_MACH) {
link_elf_error(filename, "Unsupported machine");
error = ENOEXEC;
goto out;
}
/*
* We rely on the program header being in the first page.
* This is not strictly required by the ABI specification, but
* it seems to always be true in practice. And it simplifies
* things considerably.
*/
if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
(hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
(hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
link_elf_error(filename, "Unreadable program headers");
/*
* Scan the program header entries, and save key information.
*
* We rely on there being exactly two load segments, text and data,
* in that order.
*/
phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
phlimit = phdr + hdr->e_phnum;
nsegs = 0;
phdyn = NULL;
- phphdr = NULL;
while (phdr < phlimit) {
switch (phdr->p_type) {
case PT_LOAD:
if (nsegs == MAXSEGS) {
link_elf_error(filename, "Too many sections");
error = ENOEXEC;
goto out;
}
/*
* XXX: We just trust they come in right order ??
*/
segs[nsegs] = phdr;
++nsegs;
break;
- case PT_PHDR:
- phphdr = phdr;
- break;
-
case PT_DYNAMIC:
phdyn = phdr;
break;
case PT_INTERP:
error = ENOSYS;
goto out;
}
++phdr;
}
if (phdyn == NULL) {
link_elf_error(filename, "Object is not dynamically-linked");
error = ENOEXEC;
goto out;
}
if (nsegs == 0) {
link_elf_error(filename, "No sections");
error = ENOEXEC;
goto out;
}
/*
* Allocate the entire address space of the object, to stake
* out our contiguous region, and to establish the base
* address for relocation.
*/
- base_offset = trunc_page(segs[0]->p_offset);
base_vaddr = trunc_page(segs[0]->p_vaddr);
base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
segs[nsegs - 1]->p_memsz);
mapsize = base_vlimit - base_vaddr;
lf = linker_make_file(filename, &link_elf_class);
if (lf == NULL) {
error = ENOMEM;
goto out;
}
ef = (elf_file_t) lf;
#ifdef SPARSE_MAPPING
ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
}
ef->address = (caddr_t) vm_map_min(kernel_map);
error = vm_map_find(kernel_map, ef->object, 0,
(vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != 0) {
vm_object_deallocate(ef->object);
ef->object = 0;
goto out;
}
#else
ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
#endif
mapbase = ef->address;
/*
* Read the text and data sections and zero the bss.
*/
for (i = 0; i < nsegs; i++) {
caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
error = vn_rdwr(UIO_READ, nd.ni_vp,
segbase, segs[i]->p_filesz, segs[i]->p_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
bzero(segbase + segs[i]->p_filesz,
segs[i]->p_memsz - segs[i]->p_filesz);
#ifdef SPARSE_MAPPING
/*
* Wire down the pages
*/
error = vm_map_wire(kernel_map,
(vm_offset_t) segbase,
(vm_offset_t) segbase + segs[i]->p_memsz,
VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
if (error != KERN_SUCCESS) {
error = ENOMEM;
goto out;
}
#endif
}
#ifdef GPROF
/* Update profiling information with the new text segment. */
mtx_lock(&Giant);
kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
segs[0]->p_memsz));
mtx_unlock(&Giant);
#endif
ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
lf->address = ef->address;
lf->size = mapsize;
error = parse_dynamic(ef);
if (error != 0)
goto out;
error = parse_dpcpu(ef);
if (error != 0)
goto out;
#ifdef VIMAGE
error = parse_vnet(ef);
if (error != 0)
goto out;
#endif
link_elf_reloc_local(lf);
VOP_UNLOCK(nd.ni_vp, 0);
error = linker_load_dependencies(lf);
vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
goto out;
error = relocate_file(ef);
if (error != 0)
goto out;
/*
* Try to load the symbol table if it's present. (You can
* strip it!)
*/
nbytes = hdr->e_shnum * hdr->e_shentsize;
if (nbytes == 0 || hdr->e_shoff == 0)
goto nosyms;
shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
error = vn_rdwr(UIO_READ, nd.ni_vp,
(caddr_t)shdr, nbytes, hdr->e_shoff,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
/* Read section string table */
shstrindex = hdr->e_shstrndx;
if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB &&
shdr[shstrindex].sh_size != 0) {
nbytes = shdr[shstrindex].sh_size;
shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes,
shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED,
td->td_ucred, NOCRED, &resid, td);
if (error)
goto out;
}
symtabindex = -1;
symstrindex = -1;
for (i = 0; i < hdr->e_shnum; i++) {
if (shdr[i].sh_type == SHT_SYMTAB) {
symtabindex = i;
symstrindex = shdr[i].sh_link;
} else if (shstrs != NULL && shdr[i].sh_name != 0 &&
strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) {
/* Record relocated address and size of .ctors. */
lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr;
lf->ctors_size = shdr[i].sh_size;
}
}
if (symtabindex < 0 || symstrindex < 0)
goto nosyms;
symcnt = shdr[symtabindex].sh_size;
ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
strcnt = shdr[symstrindex].sh_size;
ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
error = vn_rdwr(UIO_READ, nd.ni_vp,
ef->symbase, symcnt, shdr[symtabindex].sh_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
error = vn_rdwr(UIO_READ, nd.ni_vp,
ef->strbase, strcnt, shdr[symstrindex].sh_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
ef->ddbstrcnt = strcnt;
ef->ddbstrtab = ef->strbase;
nosyms:
error = link_elf_link_common_finish(lf);
if (error != 0)
goto out;
*result = lf;
out:
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
if (error != 0 && lf != NULL)
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
free(shdr, M_LINKER);
free(firstpage, M_LINKER);
free(shstrs, M_LINKER);
return (error);
}
Elf_Addr
elf_relocaddr(linker_file_t lf, Elf_Addr x)
{
elf_file_t ef;
ef = (elf_file_t)lf;
if (x >= ef->pcpu_start && x < ef->pcpu_stop)
return ((x - ef->pcpu_start) + ef->pcpu_base);
#ifdef VIMAGE
if (x >= ef->vnet_start && x < ef->vnet_stop)
return ((x - ef->vnet_start) + ef->vnet_base);
#endif
return (x);
}
static void
link_elf_unload_file(linker_file_t file)
{
elf_file_t ef = (elf_file_t) file;
if (ef->pcpu_base != 0) {
dpcpu_free((void *)ef->pcpu_base,
ef->pcpu_stop - ef->pcpu_start);
elf_set_delete(&set_pcpu_list, ef->pcpu_start);
}
#ifdef VIMAGE
if (ef->vnet_base != 0) {
vnet_data_free((void *)ef->vnet_base,
ef->vnet_stop - ef->vnet_start);
elf_set_delete(&set_vnet_list, ef->vnet_start);
}
#endif
#ifdef GDB
if (ef->gdb.l_ld != NULL) {
GDB_STATE(RT_DELETE);
free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
link_elf_delete_gdb(&ef->gdb);
GDB_STATE(RT_CONSISTENT);
}
#endif
/* Notify MD code that a module is being unloaded. */
elf_cpu_unload_file(file);
if (ef->preloaded) {
link_elf_unload_preload(file);
return;
}
#ifdef SPARSE_MAPPING
if (ef->object != NULL) {
vm_map_remove(kernel_map, (vm_offset_t) ef->address,
(vm_offset_t) ef->address
+ (ef->object->size << PAGE_SHIFT));
}
#else
free(ef->address, M_LINKER);
#endif
free(ef->symbase, M_LINKER);
free(ef->strbase, M_LINKER);
free(ef->ctftab, M_LINKER);
free(ef->ctfoff, M_LINKER);
free(ef->typoff, M_LINKER);
}
static void
link_elf_unload_preload(linker_file_t file)
{
if (file->filename != NULL)
preload_delete_name(file->filename);
}
static const char *
symbol_name(elf_file_t ef, Elf_Size r_info)
{
const Elf_Sym *ref;
if (ELF_R_SYM(r_info)) {
ref = ef->symtab + ELF_R_SYM(r_info);
return (ef->strtab + ref->st_name);
}
return (NULL);
}
static int
relocate_file(elf_file_t ef)
{
const Elf_Rel *rellim;
const Elf_Rel *rel;
const Elf_Rela *relalim;
const Elf_Rela *rela;
const char *symname;
/* Perform relocations without addend if there are any: */
rel = ef->rel;
if (rel != NULL) {
rellim = (const Elf_Rel *)
((const char *)ef->rel + ef->relsize);
while (rel < rellim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup)) {
symname = symbol_name(ef, rel->r_info);
printf("link_elf: symbol %s undefined\n", symname);
return (ENOENT);
}
rel++;
}
}
/* Perform relocations with addend if there are any: */
rela = ef->rela;
if (rela != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->rela + ef->relasize);
while (rela < relalim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup)) {
symname = symbol_name(ef, rela->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rela++;
}
}
/* Perform PLT relocations without addend if there are any: */
rel = ef->pltrel;
if (rel != NULL) {
rellim = (const Elf_Rel *)
((const char *)ef->pltrel + ef->pltrelsize);
while (rel < rellim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup)) {
symname = symbol_name(ef, rel->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rel++;
}
}
/* Perform PLT relocations with addend if there are any: */
rela = ef->pltrela;
if (rela != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->pltrela + ef->pltrelasize);
while (rela < relalim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup)) {
symname = symbol_name(ef, rela->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rela++;
}
}
return (0);
}
/*
* Hash function for symbol table lookup. Don't even think about changing
* this. It is specified by the System V ABI.
*/
static unsigned long
elf_hash(const char *name)
{
const unsigned char *p = (const unsigned char *) name;
unsigned long h = 0;
unsigned long g;
while (*p != '\0') {
h = (h << 4) + *p++;
if ((g = h & 0xf0000000) != 0)
h ^= g >> 24;
h &= ~g;
}
return (h);
}
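/*
 * Editor's note (illustrative): a minimal userland sketch of how the hash
 * above drives bucket selection in link_elf_lookup_symbol().  The hash body
 * mirrors elf_hash(); the bucket count below is made up.  The well-known
 * value for "main" under this algorithm is 0x737fe.
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdio.h>

static unsigned long
sysv_hash(const char *name)	/* same algorithm as elf_hash() above */
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long h = 0, g;

	while (*p != '\0') {
		h = (h << 4) + *p++;
		if ((g = h & 0xf0000000) != 0)
			h ^= g >> 24;
		h &= ~g;
	}
	return (h);
}

int
main(void)
{
	unsigned long nbuckets = 31;	/* hypothetical DT_HASH bucket count */
	unsigned long h = sysv_hash("main");

	assert(h == 0x737fe);
	/* The chain walk starts at buckets[h % nbuckets]. */
	printf("hash 0x%lx -> bucket %lu of %lu\n", h, h % nbuckets, nbuckets);
	return (0);
}
#endif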
static int
link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
{
elf_file_t ef = (elf_file_t) lf;
unsigned long symnum;
const Elf_Sym* symp;
const char *strp;
unsigned long hash;
int i;
/* If we don't have a hash, bail. */
if (ef->buckets == NULL || ef->nbuckets == 0) {
printf("link_elf_lookup_symbol: missing symbol hash table\n");
return (ENOENT);
}
/* First, search hashed global symbols */
hash = elf_hash(name);
symnum = ef->buckets[hash % ef->nbuckets];
while (symnum != STN_UNDEF) {
if (symnum >= ef->nchains) {
printf("%s: corrupt symbol table\n", __func__);
return (ENOENT);
}
symp = ef->symtab + symnum;
if (symp->st_name == 0) {
printf("%s: corrupt symbol table\n", __func__);
return (ENOENT);
}
strp = ef->strtab + symp->st_name;
if (strcmp(name, strp) == 0) {
if (symp->st_shndx != SHN_UNDEF ||
(symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
*sym = (c_linker_sym_t) symp;
return (0);
}
return (ENOENT);
}
symnum = ef->chains[symnum];
}
/* If we have not found it, look at the full table (if loaded) */
if (ef->symtab == ef->ddbsymtab)
return (ENOENT);
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
strp = ef->ddbstrtab + symp->st_name;
if (strcmp(name, strp) == 0) {
if (symp->st_shndx != SHN_UNDEF ||
(symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
*sym = (c_linker_sym_t) symp;
return (0);
}
return (ENOENT);
}
}
return (ENOENT);
}
static int
link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
linker_symval_t *symval)
{
elf_file_t ef = (elf_file_t) lf;
const Elf_Sym* es = (const Elf_Sym*) sym;
if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) {
symval->name = ef->strtab + es->st_name;
symval->value = (caddr_t) ef->address + es->st_value;
symval->size = es->st_size;
return (0);
}
if (ef->symtab == ef->ddbsymtab)
return (ENOENT);
if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
symval->name = ef->ddbstrtab + es->st_name;
symval->value = (caddr_t) ef->address + es->st_value;
symval->size = es->st_size;
return (0);
}
return (ENOENT);
}
static int
link_elf_search_symbol(linker_file_t lf, caddr_t value,
c_linker_sym_t *sym, long *diffp)
{
elf_file_t ef = (elf_file_t) lf;
u_long off = (uintptr_t) (void *) value;
u_long diff = off;
u_long st_value;
const Elf_Sym* es;
const Elf_Sym* best = NULL;
int i;
for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
if (es->st_name == 0)
continue;
st_value = es->st_value + (uintptr_t) (void *) ef->address;
if (off >= st_value) {
if (off - st_value < diff) {
diff = off - st_value;
best = es;
if (diff == 0)
break;
} else if (off - st_value == diff) {
best = es;
}
}
}
if (best == NULL)
*diffp = off;
else
*diffp = diff;
*sym = (c_linker_sym_t) best;
return (0);
}
/*
* Look up a linker set on an ELF system.
*/
static int
link_elf_lookup_set(linker_file_t lf, const char *name,
void ***startp, void ***stopp, int *countp)
{
c_linker_sym_t sym;
linker_symval_t symval;
char *setsym;
void **start, **stop;
int len, error = 0, count;
len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
setsym = malloc(len, M_LINKER, M_WAITOK);
/* get address of first entry */
snprintf(setsym, len, "%s%s", "__start_set_", name);
error = link_elf_lookup_symbol(lf, setsym, &sym);
if (error != 0)
goto out;
link_elf_symbol_values(lf, sym, &symval);
if (symval.value == 0) {
error = ESRCH;
goto out;
}
start = (void **)symval.value;
/* get address of last entry */
snprintf(setsym, len, "%s%s", "__stop_set_", name);
error = link_elf_lookup_symbol(lf, setsym, &sym);
if (error != 0)
goto out;
link_elf_symbol_values(lf, sym, &symval);
if (symval.value == 0) {
error = ESRCH;
goto out;
}
stop = (void **)symval.value;
/* and the number of entries */
count = stop - start;
/* and copy out */
if (startp != NULL)
*startp = start;
if (stopp != NULL)
*stopp = stop;
if (countp != NULL)
*countp = count;
out:
free(setsym, M_LINKER);
return (error);
}
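/*
 * Editor's note (illustrative): link_elf_lookup_set() relies on the linker
 * providing __start_set_<name>/__stop_set_<name> symbols that bracket the
 * array of pointers emitted into the "set_<name>" section.  A caller
 * typically iterates the returned range; the set name "frobnicator" and
 * the walker function below are hypothetical, not part of this file.
 */
#if 0	/* illustrative sketch only */
static void
walk_frobnicator_set(linker_file_t lf)
{
	void **start, **stop, **item;
	int count;

	if (link_elf_lookup_set(lf, "frobnicator", &start, &stop, &count) != 0)
		return;		/* file defines no such set */
	for (item = start; item < stop; item++)
		printf("set entry %p\n", *item);
	/* count == stop - start, i.e. the number of pointer-sized entries. */
}
#endif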
static int
link_elf_each_function_name(linker_file_t file,
int (*callback)(const char *, void *), void *opaque)
{
elf_file_t ef = (elf_file_t)file;
const Elf_Sym *symp;
int i, error;
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
if (symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
error = callback(ef->ddbstrtab + symp->st_name, opaque);
if (error != 0)
return (error);
}
}
return (0);
}
static int
link_elf_each_function_nameval(linker_file_t file,
linker_function_nameval_callback_t callback, void *opaque)
{
linker_symval_t symval;
elf_file_t ef = (elf_file_t)file;
const Elf_Sym* symp;
int i, error;
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
if (symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
error = link_elf_symbol_values(file,
(c_linker_sym_t) symp, &symval);
if (error != 0)
return (error);
error = callback(file, i, &symval, opaque);
if (error != 0)
return (error);
}
}
return (0);
}
const Elf_Sym *
elf_get_sym(linker_file_t lf, Elf_Size symidx)
{
elf_file_t ef = (elf_file_t)lf;
if (symidx >= ef->nchains)
return (NULL);
return (ef->symtab + symidx);
}
const char *
elf_get_symname(linker_file_t lf, Elf_Size symidx)
{
elf_file_t ef = (elf_file_t)lf;
const Elf_Sym *sym;
if (symidx >= ef->nchains)
return (NULL);
sym = ef->symtab + symidx;
return (ef->strtab + sym->st_name);
}
/*
* Symbol lookup function that can be used when the symbol index is known (i.e.,
* in relocations). It uses the symbol index instead of doing a fully fledged
* hash-table-based lookup when that is valid, for example for local symbols.
* This is not only more efficient, it's also more correct. It's not always
* the case that the symbol can be found through the hash table.
*/
static int
elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res)
{
elf_file_t ef = (elf_file_t)lf;
const Elf_Sym *sym;
const char *symbol;
Elf_Addr addr, start, base;
/* Don't even try to lookup the symbol if the index is bogus. */
if (symidx >= ef->nchains) {
*res = 0;
return (EINVAL);
}
sym = ef->symtab + symidx;
/*
* Don't do a full lookup when the symbol is local. It may even
* fail because it may not be found through the hash table.
*/
if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
/* Force lookup failure when we have an insanity. */
if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) {
*res = 0;
return (EINVAL);
}
*res = ((Elf_Addr)ef->address + sym->st_value);
return (0);
}
/*
* XXX we can avoid doing a hash table based lookup for global
* symbols as well. This however is not always valid, so we'll
* just do it the hard way for now. Performance tweaks can
* always be added.
*/
symbol = ef->strtab + sym->st_name;
/* Force a lookup failure if the symbol name is bogus. */
if (*symbol == 0) {
*res = 0;
return (EINVAL);
}
addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) {
*res = 0;
return (EINVAL);
}
if (elf_set_find(&set_pcpu_list, addr, &start, &base))
addr = addr - start + base;
#ifdef VIMAGE
else if (elf_set_find(&set_vnet_list, addr, &start, &base))
addr = addr - start + base;
#endif
*res = addr;
return (0);
}
static void
link_elf_reloc_local(linker_file_t lf)
{
const Elf_Rel *rellim;
const Elf_Rel *rel;
const Elf_Rela *relalim;
const Elf_Rela *rela;
elf_file_t ef = (elf_file_t)lf;
/* Perform relocations without addend if there are any: */
if ((rel = ef->rel) != NULL) {
rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
while (rel < rellim) {
elf_reloc_local(lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup);
rel++;
}
}
/* Perform relocations with addend if there are any: */
if ((rela = ef->rela) != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->rela + ef->relasize);
while (rela < relalim) {
elf_reloc_local(lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup);
rela++;
}
}
}
static long
link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
{
elf_file_t ef = (elf_file_t)lf;
*symtab = ef->ddbsymtab;
if (*symtab == NULL)
return (0);
return (ef->ddbsymcnt);
}
static long
link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
{
elf_file_t ef = (elf_file_t)lf;
*strtab = ef->ddbstrtab;
if (*strtab == NULL)
return (0);
return (ef->ddbstrcnt);
}
Index: head/sys/kern/subr_msgbuf.c
===================================================================
--- head/sys/kern/subr_msgbuf.c (revision 327172)
+++ head/sys/kern/subr_msgbuf.c (revision 327173)
@@ -1,419 +1,418 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2003 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Generic message buffer support routines.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/msgbuf.h>
#include <sys/sysctl.h>
/*
* Maximum number conversion buffer length: uintmax_t in base 2, plus <>
* around the priority, and a terminating NUL.
*/
#define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3)
/* Read/write sequence numbers are modulo a multiple of the buffer size. */
#define SEQMOD(size) ((size) * 16)
static u_int msgbuf_cksum(struct msgbuf *mbp);
/*
* Timestamps in msgbuf are useful when trying to diagnose when core dumps
* or other actions occurred.
*/
static int msgbuf_show_timestamp = 0;
SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RWTUN,
&msgbuf_show_timestamp, 0, "Show timestamp in msgbuf");
/*
* Initialize a message buffer of the specified size at the specified
* location. This also zeros the buffer area.
*/
void
msgbuf_init(struct msgbuf *mbp, void *ptr, int size)
{
mbp->msg_ptr = ptr;
mbp->msg_size = size;
mbp->msg_seqmod = SEQMOD(size);
msgbuf_clear(mbp);
mbp->msg_magic = MSG_MAGIC;
mbp->msg_lastpri = -1;
mbp->msg_flags = 0;
bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
}
/*
* Reinitialize a message buffer, retaining its previous contents if
* the size and checksum are correct. If the old contents cannot be
* recovered, the message buffer is cleared.
*/
void
msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size)
{
u_int cksum;
if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) {
msgbuf_init(mbp, ptr, size);
return;
}
mbp->msg_seqmod = SEQMOD(size);
mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq);
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq);
mbp->msg_ptr = ptr;
cksum = msgbuf_cksum(mbp);
if (cksum != mbp->msg_cksum) {
if (bootverbose) {
printf("msgbuf cksum mismatch (read %x, calc %x)\n",
mbp->msg_cksum, cksum);
printf("Old msgbuf not recovered\n");
}
msgbuf_clear(mbp);
}
mbp->msg_lastpri = -1;
/* Assume that the old message buffer didn't end in a newline. */
mbp->msg_flags |= MSGBUF_NEEDNL;
bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
}
/*
* Clear the message buffer.
*/
void
msgbuf_clear(struct msgbuf *mbp)
{
bzero(mbp->msg_ptr, mbp->msg_size);
mbp->msg_wseq = 0;
mbp->msg_rseq = 0;
mbp->msg_cksum = 0;
}
/*
* Get a count of the number of unread characters in the message buffer.
*/
int
msgbuf_getcount(struct msgbuf *mbp)
{
u_int len;
len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq);
if (len > mbp->msg_size)
len = mbp->msg_size;
return (len);
}
/*
* Add a character into the message buffer, and update the checksum and
* sequence number.
*
* The caller should hold the message buffer spinlock.
*/
static void
msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c)
{
u_int pos;
/* Make sure we properly wrap the sequence number. */
pos = MSGBUF_SEQ_TO_POS(mbp, *seq);
mbp->msg_cksum += (u_int)(u_char)c -
(u_int)(u_char)mbp->msg_ptr[pos];
mbp->msg_ptr[pos] = c;
*seq = MSGBUF_SEQNORM(mbp, *seq + 1);
}
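/*
 * Editor's note (illustrative): msg_cksum is kept equal to the byte sum of
 * the current buffer contents (compare msgbuf_cksum() below) by adding the
 * new character and subtracting the one it overwrites.  A standalone sketch
 * of that invariant, using a made-up 8-byte ring:
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdio.h>

#define RINGSZ	8

int
main(void)
{
	unsigned char ring[RINGSZ] = { 0 };
	unsigned int cksum = 0, seq, i, full;
	const char *msg = "hello, message buffer";

	for (seq = 0; msg[seq] != '\0'; seq++) {
		unsigned int pos = seq % RINGSZ;	/* like MSGBUF_SEQ_TO_POS() */

		/* Incremental update: add new byte, subtract overwritten byte. */
		cksum += (unsigned char)msg[seq] - ring[pos];
		ring[pos] = msg[seq];

		/* Full recomputation, as msgbuf_cksum() would do. */
		for (full = 0, i = 0; i < RINGSZ; i++)
			full += ring[i];
		assert(cksum == full);
	}
	printf("checksum stays consistent: %u\n", cksum);
	return (0);
}
#endif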
/*
* Append a character to a message buffer.
*/
void
msgbuf_addchar(struct msgbuf *mbp, int c)
{
mtx_lock_spin(&mbp->msg_lock);
msgbuf_do_addchar(mbp, &mbp->msg_wseq, c);
mtx_unlock_spin(&mbp->msg_lock);
}
/*
* Append a NUL-terminated string with a priority to a message buffer.
* Filter carriage returns if the caller requests it.
*
* XXX The carriage return filtering behavior is present in the
* msglogchar() API; however, testing has shown that we don't seem to send
* carriage returns down this path. So do we still need it?
*/
void
msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr)
{
u_int seq;
size_t len, prefix_len;
char prefix[MAXPRIBUF];
char buf[32];
- int nl, i, j, needtime;
+ int i, j, needtime;
len = strlen(str);
prefix_len = 0;
- nl = 0;
/* If we have a zero-length string, no need to do anything. */
if (len == 0)
return;
mtx_lock_spin(&mbp->msg_lock);
/*
* If this is true, we may need to insert a new priority sequence,
* so prepare the prefix.
*/
if (pri != -1)
prefix_len = sprintf(prefix, "<%d>", pri);
/*
* Starting write sequence number.
*/
seq = mbp->msg_wseq;
/*
* Whenever there is a change in priority, we have to insert a
* newline, and a priority prefix if the priority is not -1. Here
* we detect whether there was a priority change, and whether we
* did not end with a newline. If that is the case, we need to
* insert a newline before this string.
*/
if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) {
msgbuf_do_addchar(mbp, &seq, '\n');
mbp->msg_flags &= ~MSGBUF_NEEDNL;
}
needtime = 1;
for (i = 0; i < len; i++) {
/*
* If we just had a newline, and the priority is not -1
* (and therefore prefix_len != 0), then we need a priority
* prefix for this line.
*/
if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) {
int j;
for (j = 0; j < prefix_len; j++)
msgbuf_do_addchar(mbp, &seq, prefix[j]);
}
if (msgbuf_show_timestamp && needtime == 1 &&
(mbp->msg_flags & MSGBUF_NEEDNL) == 0) {
snprintf(buf, sizeof(buf), "[%jd] ",
(intmax_t)time_uptime);
for (j = 0; buf[j] != '\0'; j++)
msgbuf_do_addchar(mbp, &seq, buf[j]);
needtime = 0;
}
/*
* Don't copy carriage returns if the caller requested
* filtering.
*
* XXX This matches the behavior of msglogchar(), but is it
* necessary? Testing has shown that we don't seem to get
* carriage returns here.
*/
if ((filter_cr != 0) && (str[i] == '\r'))
continue;
/*
* Clear this flag if we see a newline. This affects whether
* we need to insert a new prefix or insert a newline later.
*/
if (str[i] == '\n')
mbp->msg_flags &= ~MSGBUF_NEEDNL;
else
mbp->msg_flags |= MSGBUF_NEEDNL;
msgbuf_do_addchar(mbp, &seq, str[i]);
}
/*
* Update the write sequence number for the actual number of
* characters we put in the message buffer. (Depends on whether
* carriage returns are filtered.)
*/
mbp->msg_wseq = seq;
/*
* Set the last priority.
*/
mbp->msg_lastpri = pri;
mtx_unlock_spin(&mbp->msg_lock);
}
/*
* Read and mark as read a character from a message buffer.
* Returns the character, or -1 if no characters are available.
*/
int
msgbuf_getchar(struct msgbuf *mbp)
{
u_int len, wseq;
int c;
mtx_lock_spin(&mbp->msg_lock);
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (-1);
}
if (len > mbp->msg_size)
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)];
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1);
mtx_unlock_spin(&mbp->msg_lock);
return (c);
}
/*
* Read and mark as read a number of characters from a message buffer.
* Returns the number of characters that were placed in `buf'.
*/
int
msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen)
{
u_int len, pos, wseq;
mtx_lock_spin(&mbp->msg_lock);
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
if (len > mbp->msg_size) {
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
len = mbp->msg_size;
}
pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq);
len = min(len, mbp->msg_size - pos);
len = min(len, (u_int)buflen);
bcopy(&mbp->msg_ptr[pos], buf, len);
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len);
mtx_unlock_spin(&mbp->msg_lock);
return (len);
}
/*
* Peek at the full contents of a message buffer without marking any
* data as read. `seqp' should point to an unsigned integer that
* msgbuf_peekbytes() can use to retain state between calls so that
* the whole message buffer can be read in multiple short reads.
* To initialise this variable to the start of the message buffer,
* call msgbuf_peekbytes() with a NULL `buf' parameter.
*
* Returns the number of characters that were placed in `buf'.
*/
int
msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp)
{
u_int len, pos, wseq;
mtx_lock_spin(&mbp->msg_lock);
if (buf == NULL) {
/* Just initialise *seqp. */
*seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size);
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, *seqp);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
if (len > mbp->msg_size) {
*seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
len = mbp->msg_size;
}
pos = MSGBUF_SEQ_TO_POS(mbp, *seqp);
len = min(len, mbp->msg_size - pos);
len = min(len, (u_int)buflen);
bcopy(&mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, *seqp)], buf, len);
*seqp = MSGBUF_SEQNORM(mbp, *seqp + len);
mtx_unlock_spin(&mbp->msg_lock);
return (len);
}
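/*
 * Editor's note (illustrative): the non-destructive read protocol described
 * above, in practice.  A first call with buf == NULL primes *seqp; subsequent
 * calls drain the buffer in short reads without advancing msg_rseq.  The
 * consumer function and its 128-byte chunk size are hypothetical.
 */
#if 0	/* illustrative sketch only */
static void
dump_msgbuf_nondestructive(struct msgbuf *mbp)
{
	char chunk[128];
	u_int seq;
	int n;

	(void)msgbuf_peekbytes(mbp, NULL, 0, &seq);	/* initialise seq */
	while ((n = msgbuf_peekbytes(mbp, chunk, sizeof(chunk), &seq)) > 0)
		printf("%.*s", n, chunk);
}
#endif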
/*
* Compute the checksum for the complete message buffer contents.
*/
static u_int
msgbuf_cksum(struct msgbuf *mbp)
{
u_int i, sum;
sum = 0;
for (i = 0; i < mbp->msg_size; i++)
sum += (u_char)mbp->msg_ptr[i];
return (sum);
}
/*
* Copy from one message buffer to another.
*/
void
msgbuf_copy(struct msgbuf *src, struct msgbuf *dst)
{
int c;
while ((c = msgbuf_getchar(src)) >= 0)
msgbuf_addchar(dst, c);
}
Index: head/sys/kern/subr_sleepqueue.c
===================================================================
--- head/sys/kern/subr_sleepqueue.c (revision 327172)
+++ head/sys/kern/subr_sleepqueue.c (revision 327173)
@@ -1,1455 +1,1454 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Implementation of sleep queues used to hold queue of threads blocked on
* a wait channel. Sleep queues are different from turnstiles in that wait
* channels are not owned by anyone, so there is no priority propagation.
* Sleep queues can also provide a timeout and can also be interrupted by
* signals. That said, there are several similarities between the turnstile
* and sleep queue implementations. (Note: turnstiles were implemented
* first.) For example, both use a hash table of the same size where each
* bucket is referred to as a "chain" that contains both a spin lock and
* a linked list of queues. An individual queue is located by using a hash
* to pick a chain, locking the chain, and then walking the chain searching
* for the queue. This means that a wait channel object does not need to
* embed its queue head just as locks do not embed their turnstile queue
* head. Threads also carry around a sleep queue that they lend to the
* wait channel when blocking. Just as in turnstiles, the queue includes
* a free list of the sleep queues of other threads blocked on the same
* wait channel in the case of multiple waiters.
*
* Some additional functionality provided by sleep queues include the
* ability to set a timeout. The timeout is managed using a per-thread
* callout that resumes a thread if it is asleep. A thread may also
* catch signals while it is asleep (aka an interruptible sleep). The
* signal code uses sleepq_abort() to interrupt a sleeping thread. Finally,
* sleep queues also provide some extra assertions. One is not allowed to
* mix the sleep/wakeup and cv APIs for a given wait channel. Also, one
* must consistently use the same lock to synchronize with a wait channel,
* though this check is currently only a warning for sleep/wakeup due to
* pre-existing abuse of that API. The same lock must also be held when
* awakening threads, though that is currently only enforced for condition
* variables.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_sleepqueue_profiling.h"
#include "opt_ddb.h"
#include "opt_sched.h"
#include "opt_stack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <machine/atomic.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
/*
* Constants for the hash table of sleep queue chains.
* SC_TABLESIZE must be a power of two for SC_MASK to work properly.
*/
#ifndef SC_TABLESIZE
#define SC_TABLESIZE 256
#endif
CTASSERT(powerof2(SC_TABLESIZE));
#define SC_MASK (SC_TABLESIZE - 1)
#define SC_SHIFT 8
#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
SC_MASK)
#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
#define NR_SLEEPQS 2
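/*
 * Editor's note (illustrative): SC_HASH() folds higher bits of the wait
 * channel pointer onto its low byte so that nearby kernel addresses spread
 * across the 256 chains.  A standalone sketch with a made-up address:
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define TABLESIZE	256			/* mirrors SC_TABLESIZE */
#define MASK		(TABLESIZE - 1)
#define SHIFT		8
#define HASH(wc)	((((uintptr_t)(wc) >> SHIFT) ^ (uintptr_t)(wc)) & MASK)

int
main(void)
{
	uintptr_t wchan = 0x80a1b2c4;		/* hypothetical wait channel */

	/* Low byte 0xc4 XOR next byte 0xb2 -> chain 0x76. */
	printf("wchan %#jx -> chain %ju\n", (uintmax_t)wchan,
	    (uintmax_t)HASH(wchan));
	return (0);
}
#endif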
/*
* There are two different lists of sleep queues. Both lists are connected
* via the sq_hash entries. The first list is the sleep queue chain list
* that a sleep queue is on when it is attached to a wait channel. The
* second list is the free list hung off of a sleep queue that is attached
* to a wait channel.
*
* Each sleep queue also contains the wait channel it is attached to, the
* list of threads blocked on that wait channel, flags specific to the
* wait channel, and the lock used to synchronize with a wait channel.
* The flags are used to catch mismatches between the various consumers
* of the sleep queue API (e.g. sleep/wakeup and condition variables).
* The lock pointer is only used when invariants are enabled for various
* debugging checks.
*
* Locking key:
* c - sleep queue chain lock
*/
struct sleepqueue {
TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */
LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
void *sq_wchan; /* (c) Wait channel. */
int sq_type; /* (c) Queue type. */
#ifdef INVARIANTS
struct lock_object *sq_lock; /* (c) Associated lock. */
#endif
};
struct sleepqueue_chain {
LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
struct mtx sc_lock; /* Spin lock for this chain. */
#ifdef SLEEPQUEUE_PROFILING
u_int sc_depth; /* Length of sc_queues. */
u_int sc_max_depth; /* Max length of sc_queues. */
#endif
} __aligned(CACHE_LINE_SIZE);
#ifdef SLEEPQUEUE_PROFILING
u_int sleepq_max_depth;
static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
"sleepq chain stats");
SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
0, "maxmimum depth achieved of a single chain");
static void sleepq_profile(const char *wmesg);
static int prof_enabled;
#endif
static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
static uma_zone_t sleepq_zone;
/*
* Prototypes for non-exported routines.
*/
static int sleepq_catch_signals(void *wchan, int pri);
static int sleepq_check_signals(void);
static int sleepq_check_timeout(void);
#ifdef INVARIANTS
static void sleepq_dtor(void *mem, int size, void *arg);
#endif
static int sleepq_init(void *mem, int size, int flags);
static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
int pri);
static void sleepq_switch(void *wchan, int pri);
static void sleepq_timeout(void *arg);
SDT_PROBE_DECLARE(sched, , , sleep);
SDT_PROBE_DECLARE(sched, , , wakeup);
/*
* Initialize SLEEPQUEUE_PROFILING specific sysctl nodes.
* Note that it must happen after sleepinit() has been fully executed, so
* it must happen after SI_SUB_KMEM SYSINIT() subsystem setup.
*/
#ifdef SLEEPQUEUE_PROFILING
static void
init_sleepqueue_profiling(void)
{
char chain_name[10];
struct sysctl_oid *chain_oid;
u_int i;
for (i = 0; i < SC_TABLESIZE; i++) {
snprintf(chain_name, sizeof(chain_name), "%u", i);
chain_oid = SYSCTL_ADD_NODE(NULL,
SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
"depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
"max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
NULL);
}
}
SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
init_sleepqueue_profiling, NULL);
#endif
/*
* Early initialization of sleep queues that is called from the sleepinit()
* SYSINIT.
*/
void
init_sleepqueues(void)
{
int i;
for (i = 0; i < SC_TABLESIZE; i++) {
LIST_INIT(&sleepq_chains[i].sc_queues);
mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
MTX_SPIN | MTX_RECURSE);
}
sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
#ifdef INVARIANTS
NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#else
NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#endif
thread0.td_sleepqueue = sleepq_alloc();
}
/*
* Get a sleep queue for a new thread.
*/
struct sleepqueue *
sleepq_alloc(void)
{
return (uma_zalloc(sleepq_zone, M_WAITOK));
}
/*
* Free a sleep queue when a thread is destroyed.
*/
void
sleepq_free(struct sleepqueue *sq)
{
uma_zfree(sleepq_zone, sq);
}
/*
* Lock the sleep queue chain associated with the specified wait channel.
*/
void
sleepq_lock(void *wchan)
{
struct sleepqueue_chain *sc;
sc = SC_LOOKUP(wchan);
mtx_lock_spin(&sc->sc_lock);
}
/*
* Look up the sleep queue associated with a given wait channel in the hash
* table; the associated sleep queue chain must already be locked. If no queue
* is found in the table, NULL is returned.
*/
struct sleepqueue *
sleepq_lookup(void *wchan)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
if (sq->sq_wchan == wchan)
return (sq);
return (NULL);
}
/*
* Unlock the sleep queue chain associated with a given wait channel.
*/
void
sleepq_release(void *wchan)
{
struct sleepqueue_chain *sc;
sc = SC_LOOKUP(wchan);
mtx_unlock_spin(&sc->sc_lock);
}
/*
* Places the current thread on the sleep queue for the specified wait
* channel. If INVARIANTS is enabled, then it associates the passed in
* lock with the sleepq to make sure it is held when that sleep queue is
* woken up.
*/
void
sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
int queue)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(td->td_sleepqueue != NULL);
MPASS(wchan != NULL);
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
/* If this thread is not allowed to sleep, die a horrible death. */
KASSERT(td->td_no_sleeping == 0,
("%s: td %p to sleep on wchan %p with sleeping prohibited",
__func__, td, wchan));
/* Look up the sleep queue associated with the wait channel 'wchan'. */
sq = sleepq_lookup(wchan);
/*
* If the wait channel does not already have a sleep queue, use
* this thread's sleep queue. Otherwise, insert the current thread
* into the sleep queue already in use by this wait channel.
*/
if (sq == NULL) {
#ifdef INVARIANTS
int i;
sq = td->td_sleepqueue;
for (i = 0; i < NR_SLEEPQS; i++) {
KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
("thread's sleep queue %d is not empty", i));
KASSERT(sq->sq_blockedcnt[i] == 0,
("thread's sleep queue %d count mismatches", i));
}
KASSERT(LIST_EMPTY(&sq->sq_free),
("thread's sleep queue has a non-empty free list"));
KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
sq->sq_lock = lock;
#endif
#ifdef SLEEPQUEUE_PROFILING
sc->sc_depth++;
if (sc->sc_depth > sc->sc_max_depth) {
sc->sc_max_depth = sc->sc_depth;
if (sc->sc_max_depth > sleepq_max_depth)
sleepq_max_depth = sc->sc_max_depth;
}
#endif
sq = td->td_sleepqueue;
LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
sq->sq_wchan = wchan;
sq->sq_type = flags & SLEEPQ_TYPE;
} else {
MPASS(wchan == sq->sq_wchan);
MPASS(lock == sq->sq_lock);
MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
}
thread_lock(td);
TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
sq->sq_blockedcnt[queue]++;
td->td_sleepqueue = NULL;
td->td_sqqueue = queue;
td->td_wchan = wchan;
td->td_wmesg = wmesg;
if (flags & SLEEPQ_INTERRUPTIBLE) {
td->td_flags |= TDF_SINTR;
td->td_flags &= ~TDF_SLEEPABORT;
}
thread_unlock(td);
}
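/*
 * Editor's note (illustrative): a condensed sketch of the consumer protocol
 * around sleepq_add(), loosely following sleepqueue(9).  The wait channel
 * object, the wmesg string and the absence of an interlock are hypothetical
 * simplifications, not a drop-in recipe.
 */
#if 0	/* illustrative sketch only */
static int hypothetical_event;		/* any kernel address can serve as a wchan */

static void
wait_for_hypothetical_event(void)
{
	void *wchan = &hypothetical_event;

	sleepq_lock(wchan);		/* lock the chain for this wchan */
	sleepq_add(wchan, NULL, "hypev", SLEEPQ_SLEEP, 0);
	sleepq_wait(wchan, 0);		/* blocks; returns once awakened */
	/*
	 * A waker would hold sleepq_lock(wchan), wake one or all waiters on
	 * the same wchan (sleepq_signal()/sleepq_broadcast()), and then call
	 * sleepq_release(wchan).
	 */
}
#endif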
/*
* Sets a timeout that will remove the current thread from the specified
* sleep queue after timo ticks if the thread has not already been awakened.
*/
void
sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
int flags)
{
struct sleepqueue_chain *sc;
struct thread *td;
sbintime_t pr1;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_sleepqueue == NULL);
MPASS(wchan != NULL);
if (cold && td == &thread0)
panic("timed sleep before timers are working");
KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx",
td->td_tid, td, (uintmax_t)td->td_sleeptimo));
thread_lock(td);
callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1);
thread_unlock(td);
callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1,
sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC |
C_DIRECT_EXEC);
}
/*
* Return the number of actual sleepers for the specified queue.
*/
u_int
sleepq_sleepcnt(void *wchan, int queue)
{
struct sleepqueue *sq;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
return (sq->sq_blockedcnt[queue]);
}
/*
* Marks the pending sleep of the current thread as interruptible and
* makes an initial check for pending signals before putting a thread
* to sleep. Enters and exits with the thread lock held. Thread lock
* may have transitioned from the sleepq lock to a run lock.
*/
static int
sleepq_catch_signals(void *wchan, int pri)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
struct proc *p;
struct sigacts *ps;
int sig, ret;
ret = 0;
td = curthread;
p = curproc;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(wchan != NULL);
if ((td->td_pflags & TDP_WAKEUP) != 0) {
td->td_pflags &= ~TDP_WAKEUP;
ret = EINTR;
thread_lock(td);
goto out;
}
/*
* See if there are any pending signals or suspension requests for this
* thread. If not, we can switch immediately.
*/
thread_lock(td);
if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) != 0) {
thread_unlock(td);
mtx_unlock_spin(&sc->sc_lock);
CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
(void *)td, (long)p->p_pid, td->td_name);
PROC_LOCK(p);
/*
* Check for suspension first. Checking for signals and then
* suspending could result in a missed signal, since a signal
* can be delivered while this thread is suspended.
*/
if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
ret = thread_suspend_check(1);
MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
if (ret != 0) {
PROC_UNLOCK(p);
mtx_lock_spin(&sc->sc_lock);
thread_lock(td);
goto out;
}
}
if ((td->td_flags & TDF_NEEDSIGCHK) != 0) {
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
sig = cursig(td);
if (sig == -1) {
mtx_unlock(&ps->ps_mtx);
KASSERT((td->td_flags & TDF_SBDRY) != 0,
("lost TDF_SBDRY"));
KASSERT(TD_SBDRY_INTR(td),
("lost TDF_SERESTART of TDF_SEINTR"));
KASSERT((td->td_flags &
(TDF_SEINTR | TDF_SERESTART)) !=
(TDF_SEINTR | TDF_SERESTART),
("both TDF_SEINTR and TDF_SERESTART"));
ret = TD_SBDRY_ERRNO(td);
} else if (sig != 0) {
ret = SIGISMEMBER(ps->ps_sigintr, sig) ?
EINTR : ERESTART;
mtx_unlock(&ps->ps_mtx);
} else {
mtx_unlock(&ps->ps_mtx);
}
}
/*
* Lock the per-process spinlock prior to dropping the PROC_LOCK
* to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and
* thread_lock() are currently held in tdsendsignal().
*/
PROC_SLOCK(p);
mtx_lock_spin(&sc->sc_lock);
PROC_UNLOCK(p);
thread_lock(td);
PROC_SUNLOCK(p);
}
if (ret == 0) {
sleepq_switch(wchan, pri);
return (0);
}
out:
/*
* There were pending signals and this thread is still
* on the sleep queue, remove it from the sleep queue.
*/
if (TD_ON_SLEEPQ(td)) {
sq = sleepq_lookup(wchan);
if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
/*
* This thread hasn't gone to sleep yet, so it
* should not be swapped out.
*/
panic("not waking up swapper");
#endif
}
}
mtx_unlock_spin(&sc->sc_lock);
MPASS(td->td_lock != &sc->sc_lock);
return (ret);
}
/*
* Switches to another thread if we are still asleep on a sleep queue.
* Returns with thread lock.
*/
static void
sleepq_switch(void *wchan, int pri)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
bool rtc_changed;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If we have a sleep queue, then we've already been woken up, so
* just return.
*/
if (td->td_sleepqueue != NULL) {
mtx_unlock_spin(&sc->sc_lock);
return;
}
/*
* If TDF_TIMEOUT is set, then our sleep has been timed out
* already but we are still on the sleep queue, so dequeue the
* thread and return.
*
* Do the same if the real-time clock has been adjusted since this
* thread calculated its timeout based on that clock. This handles
* the following race:
* - The Ts thread needs to sleep until an absolute real-clock time.
* It copies the global rtc_generation into curthread->td_rtcgen,
* reads the RTC, and calculates a sleep duration based on that time.
* See umtxq_sleep() for an example.
* - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes
* threads that are sleeping until an absolute real-clock time.
* See tc_setclock() and the POSIX specification of clock_settime().
* - Ts reaches the code below. It holds the sleepqueue chain lock,
* so Tc has finished its wakeup pass; this thread must therefore test
* td_rtcgen itself.
* (The declaration of td_rtcgen refers to this comment.)
*/
rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation;
if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) {
if (rtc_changed) {
td->td_rtcgen = 0;
}
MPASS(TD_ON_SLEEPQ(td));
sq = sleepq_lookup(wchan);
if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
/*
* This thread hasn't gone to sleep yet, so it
* should not be swapped out.
*/
panic("not waking up swapper");
#endif
}
mtx_unlock_spin(&sc->sc_lock);
return;
}
#ifdef SLEEPQUEUE_PROFILING
if (prof_enabled)
sleepq_profile(td->td_wmesg);
#endif
MPASS(td->td_sleepqueue == NULL);
sched_sleep(td, pri);
thread_lock_set(td, &sc->sc_lock);
SDT_PROBE0(sched, , , sleep);
TD_SET_SLEEPING(td);
mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
}
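/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * sleeper-side half of the rtc_generation handshake described above,
 * roughly what an absolute-real-time sleeper such as umtxq_sleep() does.
 * The exact calls there may differ; "wchan" is a placeholder.
 */
#if 0
/* Latch the generation before reading the RTC (the real code uses an
 * acquire load). */
td->td_rtcgen = rtc_generation;
/* ...read the RTC, compute a relative timeout, sleepq_add()... */
sleepq_timedwait(wchan, 0);
td->td_rtcgen = 0;	/* done sleeping against this generation */
#endif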
/*
* Check to see if we timed out.
*/
static int
sleepq_check_timeout(void)
{
struct thread *td;
int res;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If TDF_TIMEOUT is set, we timed out. But recheck
* td_sleeptimo anyway.
*/
res = 0;
if (td->td_sleeptimo != 0) {
if (td->td_sleeptimo <= sbinuptime())
res = EWOULDBLOCK;
td->td_sleeptimo = 0;
}
if (td->td_flags & TDF_TIMEOUT)
td->td_flags &= ~TDF_TIMEOUT;
else
/*
* We ignore the situation where the timeout subsystem was
* unable to stop our callout. The struct thread is
* type-stable, so the callout will use the correct
* memory when it runs. The checks of the
* td_sleeptimo value in this function and in
* sleepq_timeout() ensure that the thread does not
* get spurious wakeups, even if the callout was reset
* or the thread reused.
*/
callout_stop(&td->td_slpcallout);
return (res);
}
/*
* Check to see if we were awoken by a signal.
*/
static int
sleepq_check_signals(void)
{
struct thread *td;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/* We are no longer in an interruptible sleep. */
if (td->td_flags & TDF_SINTR)
td->td_flags &= ~TDF_SINTR;
if (td->td_flags & TDF_SLEEPABORT) {
td->td_flags &= ~TDF_SLEEPABORT;
return (td->td_intrval);
}
return (0);
}
/*
* Block the current thread until it is awakened from its sleep queue.
*/
void
sleepq_wait(void *wchan, int pri)
{
struct thread *td;
td = curthread;
MPASS(!(td->td_flags & TDF_SINTR));
thread_lock(td);
sleepq_switch(wchan, pri);
thread_unlock(td);
}
/*
* Block the current thread until it is awakened from its sleep queue
* or it is interrupted by a signal.
*/
int
sleepq_wait_sig(void *wchan, int pri)
{
int rcatch;
int rval;
rcatch = sleepq_catch_signals(wchan, pri);
rval = sleepq_check_signals();
thread_unlock(curthread);
if (rcatch)
return (rcatch);
return (rval);
}
/*
* Block the current thread until it is awakened from its sleep queue
* or it times out while waiting.
*/
int
sleepq_timedwait(void *wchan, int pri)
{
struct thread *td;
int rval;
td = curthread;
MPASS(!(td->td_flags & TDF_SINTR));
thread_lock(td);
sleepq_switch(wchan, pri);
rval = sleepq_check_timeout();
thread_unlock(td);
return (rval);
}
/*
* Block the current thread until it is awakened from its sleep queue,
* it is interrupted by a signal, or it times out waiting to be awakened.
*/
int
sleepq_timedwait_sig(void *wchan, int pri)
{
int rcatch, rvalt, rvals;
rcatch = sleepq_catch_signals(wchan, pri);
rvalt = sleepq_check_timeout();
rvals = sleepq_check_signals();
thread_unlock(curthread);
if (rcatch)
return (rcatch);
if (rvals)
return (rvals);
return (rvalt);
}
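/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * usual consumer pattern for the blocking primitives above, approximately
 * what the higher-level sleep(9)/condvar(9) code does.  "wchan", "timo",
 * the wmesg and the NULL interlock are placeholders.
 */
#if 0
int error;

sleepq_lock(wchan);
sleepq_add(wchan, NULL, "examp", SLEEPQ_SLEEP | SLEEPQ_INTERRUPTIBLE, 0);
sleepq_set_timeout(wchan, timo);	/* only for the timed variants */
error = sleepq_timedwait_sig(wchan, 0);	/* 0, EINTR, ERESTART or EWOULDBLOCK */
#endif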
/*
* Returns the type of sleepqueue given a wait channel.
*/
int
sleepq_type(void *wchan)
{
struct sleepqueue *sq;
int type;
MPASS(wchan != NULL);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
if (sq == NULL) {
sleepq_release(wchan);
return (-1);
}
type = sq->sq_type;
sleepq_release(wchan);
return (type);
}
/*
* Removes a thread from a sleep queue and makes it
* runnable.
*/
static int
sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
{
struct sleepqueue_chain *sc;
MPASS(td != NULL);
MPASS(sq->sq_wchan != NULL);
MPASS(td->td_wchan == sq->sq_wchan);
MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
THREAD_LOCK_ASSERT(td, MA_OWNED);
sc = SC_LOOKUP(sq->sq_wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
/* Remove the thread from the queue. */
sq->sq_blockedcnt[td->td_sqqueue]--;
TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
/*
* Get a sleep queue for this thread. If this is the last waiter,
* use the queue itself and take it out of the chain; otherwise,
* remove a queue from the free list.
*/
if (LIST_EMPTY(&sq->sq_free)) {
td->td_sleepqueue = sq;
#ifdef INVARIANTS
sq->sq_wchan = NULL;
#endif
#ifdef SLEEPQUEUE_PROFILING
sc->sc_depth--;
#endif
} else
td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
LIST_REMOVE(td->td_sleepqueue, sq_hash);
td->td_wmesg = NULL;
td->td_wchan = NULL;
td->td_flags &= ~TDF_SINTR;
CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, td->td_name);
/* Adjust priority if requested. */
MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
if (pri != 0 && td->td_priority > pri &&
PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, pri);
/*
* Note that thread td might not be sleeping if it is running
* sleepq_catch_signals() on another CPU or is blocked on its
* proc lock to check signals. There's no need to mark the
* thread runnable in that case.
*/
if (TD_IS_SLEEPING(td)) {
TD_CLR_SLEEPING(td);
return (setrunnable(td));
}
return (0);
}
#ifdef INVARIANTS
/*
* UMA zone item deallocator.
*/
static void
sleepq_dtor(void *mem, int size, void *arg)
{
struct sleepqueue *sq;
int i;
sq = mem;
for (i = 0; i < NR_SLEEPQS; i++) {
MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
MPASS(sq->sq_blockedcnt[i] == 0);
}
}
#endif
/*
* UMA zone item initializer.
*/
static int
sleepq_init(void *mem, int size, int flags)
{
struct sleepqueue *sq;
int i;
bzero(mem, size);
sq = mem;
for (i = 0; i < NR_SLEEPQS; i++) {
TAILQ_INIT(&sq->sq_blocked[i]);
sq->sq_blockedcnt[i] = 0;
}
LIST_INIT(&sq->sq_free);
return (0);
}
/*
* Find the highest priority thread sleeping on a wait channel and resume it.
*/
int
sleepq_signal(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
struct thread *td, *besttd;
int wakeup_swapper;
CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
("%s: mismatch between sleep/wakeup and cv_*", __func__));
/*
* Find the highest priority thread on the queue. If there is a
* tie, use the thread that first appears in the queue as it has
* been sleeping the longest since threads are always added to
* the tail of sleep queues.
*/
besttd = TAILQ_FIRST(&sq->sq_blocked[queue]);
TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
if (td->td_priority < besttd->td_priority)
besttd = td;
}
MPASS(besttd != NULL);
thread_lock(besttd);
wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
thread_unlock(besttd);
return (wakeup_swapper);
}
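/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * wakeup-side pattern used by callers such as wakeup_one(); "wchan" is a
 * placeholder.
 */
#if 0
int wakeup_swapper;

sleepq_lock(wchan);
wakeup_swapper = sleepq_signal(wchan, SLEEPQ_SLEEP, 0, 0);
sleepq_release(wchan);
if (wakeup_swapper)
	kick_proc0();	/* a resumed thread was swapped out */
#endif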
static bool
match_any(struct thread *td __unused)
{
return (true);
}
/*
* Resume all threads sleeping on a specified wait channel.
*/
int
sleepq_broadcast(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
("%s: mismatch between sleep/wakeup and cv_*", __func__));
return (sleepq_remove_matching(sq, queue, match_any, pri));
}
/*
* Resume threads on the sleep queue that match the given predicate.
*/
int
sleepq_remove_matching(struct sleepqueue *sq, int queue,
bool (*matches)(struct thread *), int pri)
{
struct thread *td, *tdn;
int wakeup_swapper;
/*
* The last thread will be given ownership of sq and may
* re-enqueue itself before sleepq_resume_thread() returns,
* so we must cache the "next" queue item at the beginning
* of the final iteration.
*/
wakeup_swapper = 0;
TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
thread_lock(td);
if (matches(td))
wakeup_swapper |= sleepq_resume_thread(sq, td, pri);
thread_unlock(td);
}
return (wakeup_swapper);
}
/*
* Time sleeping threads out. When the timeout expires, the thread is
* removed from the sleep queue and made runnable if it is still asleep.
*/
static void
sleepq_timeout(void *arg)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
void *wchan;
int wakeup_swapper;
td = arg;
wakeup_swapper = 0;
CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
thread_lock(td);
if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) {
/*
* The thread does not want a timeout (yet).
*/
} else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
/*
* See if the thread is asleep and get the wait
* channel if it is.
*/
wchan = td->td_wchan;
sc = SC_LOOKUP(wchan);
THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
sq = sleepq_lookup(wchan);
MPASS(sq != NULL);
td->td_flags |= TDF_TIMEOUT;
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
} else if (TD_ON_SLEEPQ(td)) {
/*
* If the thread is on the SLEEPQ but isn't sleeping
* yet, it can either be on another CPU in between
* sleepq_add() and one of the sleepq_*wait*()
* routines or it can be in sleepq_catch_signals().
*/
td->td_flags |= TDF_TIMEOUT;
}
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
}
/*
* Resumes a specific thread from the sleep queue associated with a specific
* wait channel if it is on that queue.
*/
void
sleepq_remove(struct thread *td, void *wchan)
{
struct sleepqueue *sq;
int wakeup_swapper;
/*
* Look up the sleep queue for this wait channel, then re-check
* that the thread is asleep on that channel; if it is not, then
* bail.
*/
MPASS(wchan != NULL);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
/*
* We cannot lock the thread here as it may be sleeping on a
* different sleepq. However, holding the sleepq lock for this
* wchan guarantees that we do not miss a wakeup for this
* channel. The asserts below will catch any false positives.
*/
if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
sleepq_release(wchan);
return;
}
/* Thread is asleep on sleep queue sq, so wake it up. */
thread_lock(td);
MPASS(sq != NULL);
MPASS(td->td_wchan == wchan);
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
thread_unlock(td);
sleepq_release(wchan);
if (wakeup_swapper)
kick_proc0();
}
/*
* Abort a thread as if an interrupt had occurred. Only abort
* interruptible waits (unfortunately it isn't safe to abort others).
*/
int
sleepq_abort(struct thread *td, int intrval)
{
struct sleepqueue *sq;
void *wchan;
THREAD_LOCK_ASSERT(td, MA_OWNED);
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_flags & TDF_SINTR);
MPASS(intrval == EINTR || intrval == ERESTART);
/*
* If the TDF_TIMEOUT flag is set, just leave. A
* timeout is scheduled anyhow.
*/
if (td->td_flags & TDF_TIMEOUT)
return (0);
CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
td->td_intrval = intrval;
td->td_flags |= TDF_SLEEPABORT;
/*
* If the thread has not slept yet, it will find the signal in
* sleepq_catch_signals() and call sleepq_resume_thread(). Otherwise
* we have to do it here.
*/
if (!TD_IS_SLEEPING(td))
return (0);
wchan = td->td_wchan;
MPASS(wchan != NULL);
sq = sleepq_lookup(wchan);
MPASS(sq != NULL);
/* Thread is asleep on sleep queue sq, so wake it up. */
return (sleepq_resume_thread(sq, td, 0));
}
void
sleepq_chains_remove_matching(bool (*matches)(struct thread *))
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
int i, wakeup_swapper;
wakeup_swapper = 0;
for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) {
if (LIST_EMPTY(&sc->sc_queues)) {
continue;
}
mtx_lock_spin(&sc->sc_lock);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash) {
for (i = 0; i < NR_SLEEPQS; ++i) {
wakeup_swapper |= sleepq_remove_matching(sq, i,
matches, 0);
}
}
mtx_unlock_spin(&sc->sc_lock);
}
if (wakeup_swapper) {
kick_proc0();
}
}
/*
* Prints the stacks of all threads presently sleeping on wchan/queue to
* the sbuf sb. Sets count_stacks_printed to the number of stacks actually
* printed. Typically, this will equal the number of threads sleeping on the
* queue, but may be less if sb overflowed before all stacks were printed.
*/
#ifdef STACK
int
sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
int *count_stacks_printed)
{
struct thread *td, *td_next;
struct sleepqueue *sq;
struct stack **st;
struct sbuf **td_infos;
int i, stack_idx, error, stacks_to_allocate;
- bool finished, partial_print;
+ bool finished;
error = 0;
finished = false;
- partial_print = false;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
stacks_to_allocate = 10;
for (i = 0; i < 3 && !finished ; i++) {
/* We cannot malloc while holding the queue's spinlock, so
* we do our mallocs now, and hope it is enough. If it
* isn't, we will free these, drop the lock, malloc more,
* and try again, up to a point. After that point we will
* give up and report ENOMEM. We also cannot write to sb
* during this time since the client may have set the
* SBUF_AUTOEXTEND flag on their sbuf, which could cause a
* malloc as we print to it. So we defer actually printing
* to sb until after we drop the spinlock.
*/
/* Where we will store the stacks. */
st = malloc(sizeof(struct stack *) * stacks_to_allocate,
M_TEMP, M_WAITOK);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
st[stack_idx] = stack_create(M_WAITOK);
/* Where we will store the td name, tid, etc. */
td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate,
M_TEMP, M_WAITOK);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
td_infos[stack_idx] = sbuf_new(NULL, NULL,
MAXCOMLEN + sizeof(struct thread *) * 2 + 40,
SBUF_FIXEDLEN);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
if (sq == NULL) {
/* This sleepq does not exist; exit and return ENOENT. */
error = ENOENT;
finished = true;
sleepq_release(wchan);
goto loop_end;
}
stack_idx = 0;
/* Save thread info */
TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq,
td_next) {
if (stack_idx >= stacks_to_allocate)
goto loop_end;
/* Note the td_lock is equal to the sleepq_lock here. */
stack_save_td(st[stack_idx], td);
sbuf_printf(td_infos[stack_idx], "%d: %s %p",
td->td_tid, td->td_name, td);
++stack_idx;
}
finished = true;
sleepq_release(wchan);
/* Print the stacks */
for (i = 0; i < stack_idx; i++) {
sbuf_finish(td_infos[i]);
sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i]));
stack_sbuf_print(sb, st[i]);
sbuf_printf(sb, "\n");
error = sbuf_error(sb);
if (error == 0)
*count_stacks_printed = stack_idx;
}
loop_end:
if (!finished)
sleepq_release(wchan);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
stack_destroy(st[stack_idx]);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
sbuf_delete(td_infos[stack_idx]);
free(st, M_TEMP);
free(td_infos, M_TEMP);
stacks_to_allocate *= 10;
}
if (!finished && error == 0)
error = ENOMEM;
return (error);
}
#endif
#ifdef SLEEPQUEUE_PROFILING
#define SLEEPQ_PROF_LOCATIONS 1024
#define SLEEPQ_SBUFSIZE 512
struct sleepq_prof {
LIST_ENTRY(sleepq_prof) sp_link;
const char *sp_wmesg;
long sp_count;
};
LIST_HEAD(sqphead, sleepq_prof);
struct sqphead sleepq_prof_free;
struct sqphead sleepq_hash[SC_TABLESIZE];
static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
static struct mtx sleepq_prof_lock;
MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
static void
sleepq_profile(const char *wmesg)
{
struct sleepq_prof *sp;
mtx_lock_spin(&sleepq_prof_lock);
if (prof_enabled == 0)
goto unlock;
LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
if (sp->sp_wmesg == wmesg)
goto done;
sp = LIST_FIRST(&sleepq_prof_free);
if (sp == NULL)
goto unlock;
sp->sp_wmesg = wmesg;
LIST_REMOVE(sp, sp_link);
LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
done:
sp->sp_count++;
unlock:
mtx_unlock_spin(&sleepq_prof_lock);
return;
}
static void
sleepq_prof_reset(void)
{
struct sleepq_prof *sp;
int enabled;
int i;
mtx_lock_spin(&sleepq_prof_lock);
enabled = prof_enabled;
prof_enabled = 0;
for (i = 0; i < SC_TABLESIZE; i++)
LIST_INIT(&sleepq_hash[i]);
LIST_INIT(&sleepq_prof_free);
for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
sp = &sleepq_profent[i];
sp->sp_wmesg = NULL;
sp->sp_count = 0;
LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
}
prof_enabled = enabled;
mtx_unlock_spin(&sleepq_prof_lock);
}
static int
enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
{
int error, v;
v = prof_enabled;
error = sysctl_handle_int(oidp, &v, v, req);
if (error)
return (error);
if (req->newptr == NULL)
return (error);
if (v == prof_enabled)
return (0);
if (v == 1)
sleepq_prof_reset();
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = !!v;
mtx_unlock_spin(&sleepq_prof_lock);
return (0);
}
static int
reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
{
int error, v;
v = 0;
error = sysctl_handle_int(oidp, &v, 0, req);
if (error)
return (error);
if (req->newptr == NULL)
return (error);
if (v == 0)
return (0);
sleepq_prof_reset();
return (0);
}
static int
dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
{
struct sleepq_prof *sp;
struct sbuf *sb;
int enabled;
int error;
int i;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
sbuf_printf(sb, "\nwmesg\tcount\n");
enabled = prof_enabled;
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = 0;
mtx_unlock_spin(&sleepq_prof_lock);
for (i = 0; i < SC_TABLESIZE; i++) {
LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
sbuf_printf(sb, "%s\t%ld\n",
sp->sp_wmesg, sp->sp_count);
}
}
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = enabled;
mtx_unlock_spin(&sleepq_prof_lock);
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
NULL, 0, reset_sleepq_prof_stats, "I",
"Reset sleepqueue profiling statistics");
SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
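/*
 * Usage note: with SLEEPQUEUE_PROFILING compiled in, the knobs above are
 * driven from userland, e.g.:
 *	sysctl debug.sleepq.enable=1	(start profiling)
 *	sysctl debug.sleepq.stats	(dump wmesg/count pairs)
 *	sysctl debug.sleepq.reset=1	(clear the table)
 */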
#endif
#ifdef DDB
DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
#ifdef INVARIANTS
struct lock_object *lock;
#endif
struct thread *td;
void *wchan;
int i;
if (!have_addr)
return;
/*
* First, see if there is an active sleep queue for the wait channel
* indicated by the address.
*/
wchan = (void *)addr;
sc = SC_LOOKUP(wchan);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
if (sq->sq_wchan == wchan)
goto found;
/*
* Second, see if there is an active sleep queue at the address
* indicated.
*/
for (i = 0; i < SC_TABLESIZE; i++)
LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
if (sq == (struct sleepqueue *)addr)
goto found;
}
db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
return;
found:
db_printf("Wait channel: %p\n", sq->sq_wchan);
db_printf("Queue type: %d\n", sq->sq_type);
#ifdef INVARIANTS
if (sq->sq_lock) {
lock = sq->sq_lock;
db_printf("Associated Interlock: %p - (%s) %s\n", lock,
LOCK_CLASS(lock)->lc_name, lock->lo_name);
}
#endif
db_printf("Blocked threads:\n");
for (i = 0; i < NR_SLEEPQS; i++) {
db_printf("\nQueue[%d]:\n", i);
if (TAILQ_EMPTY(&sq->sq_blocked[i]))
db_printf("\tempty\n");
else
TAILQ_FOREACH(td, &sq->sq_blocked[i],
td_slpq) {
db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
td->td_tid, td->td_proc->p_pid,
td->td_name);
}
db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
}
}
/* Alias 'show sleepqueue' to 'show sleepq'. */
DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
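/*
 * Usage note: from the ddb(4) prompt the command takes a wait channel or
 * struct sleepqueue address, e.g. "show sleepq <addr>"; "show sleepqueue"
 * is an equivalent alias.
 */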
#endif
Index: head/sys/kern/subr_witness.c
===================================================================
--- head/sys/kern/subr_witness.c (revision 327172)
+++ head/sys/kern/subr_witness.c (revision 327173)
@@ -1,3058 +1,3047 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2008 Isilon Systems, Inc.
* Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
* Copyright (c) 1998 Berkeley Software Design, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Berkeley Software Design Inc's name may not be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
* and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
*/
/*
* Implementation of the `witness' lock verifier. Originally implemented for
* mutexes in BSD/OS. Extended to handle generic lock objects and lock
* classes in FreeBSD.
*/
/*
* Main Entry: witness
* Pronunciation: 'wit-n&s
* Function: noun
* Etymology: Middle English witnesse, from Old English witnes knowledge,
* testimony, witness, from 2wit
* Date: before 12th century
* 1 : attestation of a fact or event : TESTIMONY
* 2 : one that gives evidence; specifically : one who testifies in
* a cause or before a judicial tribunal
* 3 : one asked to be present at a transaction so as to be able to
* testify to its having taken place
* 4 : one who has personal knowledge of something
* 5 a : something serving as evidence or proof : SIGN
* b : public affirmation by word or example of usually
* religious faith or conviction <the heroic witness to divine
* life -- Pilot>
* 6 capitalized : a member of the Jehovah's Witnesses
*/
/*
* Special rules concerning Giant and lock orders:
*
* 1) Giant must be acquired before any other mutexes. Stated another way,
* no other mutex may be held when Giant is acquired.
*
* 2) Giant must be released when blocking on a sleepable lock.
*
* This rule is less obvious, but is a result of Giant providing the same
* semantics as spl(). Basically, when a thread sleeps, it must release
* Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule
* 2).
*
* 3) Giant may be acquired before or after sleepable locks.
*
* This rule is also not quite as obvious. Giant may be acquired after
* a sleepable lock because it is a non-sleepable lock and non-sleepable
* locks may always be acquired while holding a sleepable lock. The second
* case, Giant before a sleepable lock, follows from rule 2) above. Suppose
* you have two threads T1 and T2 and a sleepable lock X. Suppose that T1
* acquires X and blocks on Giant. Then suppose that T2 acquires Giant and
* blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to
* execute. Thus, acquiring Giant both before and after a sleepable lock
* will not result in a lock order reversal.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_hwpmc_hooks.h"
#include "opt_stack.h"
#include "opt_witness.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <machine/stdarg.h>
#if !defined(DDB) && !defined(STACK)
#error "DDB or STACK options are required for WITNESS"
#endif
/* Note that these traces do not work with KTR_ALQ. */
#if 0
#define KTR_WITNESS KTR_SUBSYS
#else
#define KTR_WITNESS 0
#endif
#define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */
#define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */
#define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */
/* Define this to check for blessed mutexes */
#undef BLESSING
#ifndef WITNESS_COUNT
#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
#define WITNESS_PENDLIST (2048 + MAXCPU)
/* Allocate 256 KB of stack data space */
#define WITNESS_LO_DATA_COUNT 2048
/* Prime, gives load factor of ~2 at full load */
#define WITNESS_LO_HASH_SIZE 1021
/*
* XXX: This is somewhat bogus, as we assume here that at most 2048 threads
* will hold LOCK_NCHILDREN locks. We handle failure ok, and we should
* probably be safe for the most part, but it's still a SWAG.
*/
#define LOCK_NCHILDREN 5
#define LOCK_CHILDCOUNT 2048
#define MAX_W_NAME 64
#define FULLGRAPH_SBUF_SIZE 512
/*
* These flags go in the witness relationship matrix and describe the
* relationship between any two struct witness objects.
*/
#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
#define WITNESS_RELATED_MASK \
(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
* observed. */
#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
/* Descendant to ancestor flags */
#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
/* Ancestor to descendant flags */
#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2)
#define WITNESS_INDEX_ASSERT(i) \
MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
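/*
 * Illustrative sketch (not compiled, not part of the original file): how the
 * relationship flags above are consulted.  A bit from WITNESS_ANCESTOR_MASK
 * set in w_rmatrix[a][b] means witness "a" is ordered before (is an ancestor
 * of) witness "b"; w1/w2 are placeholders.
 */
#if 0
if (w_rmatrix[w1->w_index][w2->w_index] & WITNESS_ANCESTOR_MASK)
	/* w1 is a direct or indirect ancestor of w2. */;
/* The mirrored entry w_rmatrix[w2->w_index][w1->w_index] carries the
 * descendant flags; e.g. WITNESS_ATOD(WITNESS_PARENT) == WITNESS_CHILD. */
#endif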
static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
/*
* Lock instances. A lock instance is the data associated with a lock while
* it is held by witness. For example, a lock instance will hold the
* recursion count of a lock. Lock instances are held in lists. Spin locks
* are held in a per-cpu list while sleep locks are held in a per-thread list.
*/
struct lock_instance {
struct lock_object *li_lock;
const char *li_file;
int li_line;
u_int li_flags;
};
/*
* A simple list type used to build the list of locks held by a thread
* or CPU. We can't simply embed the list in struct lock_object since a
* lock may be held by more than one thread if it is a shared lock. Locks
* are added to the head of the list, so we fill up each list entry from
* "the back" logically. To ease some of the arithmetic, we actually fill
* in each list entry the normal way (children[0] then children[1], etc.) but
* when we traverse the list we read children[count-1] as the first entry
* down to children[0] as the final entry.
*/
struct lock_list_entry {
struct lock_list_entry *ll_next;
struct lock_instance ll_children[LOCK_NCHILDREN];
u_int ll_count;
};
/*
* The main witness structure. One of these per named lock type in the system
* (for example, "vnode interlock").
*/
struct witness {
char w_name[MAX_W_NAME];
uint32_t w_index; /* Index in the relationship matrix */
struct lock_class *w_class;
STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
struct witness *w_hash_next; /* Linked list in hash buckets. */
const char *w_file; /* File where last acquired */
uint32_t w_line; /* Line where last acquired */
uint32_t w_refcount;
uint16_t w_num_ancestors; /* direct/indirect
* ancestor count */
uint16_t w_num_descendants; /* direct/indirect
* descendant count */
int16_t w_ddb_level;
unsigned w_displayed:1;
unsigned w_reversed:1;
};
STAILQ_HEAD(witness_list, witness);
/*
* The witness hash table. Keys are witness names (const char *), elements are
* witness objects (struct witness *).
*/
struct witness_hash {
struct witness *wh_array[WITNESS_HASH_SIZE];
uint32_t wh_size;
uint32_t wh_count;
};
/*
* Key type for the lock order data hash table.
*/
struct witness_lock_order_key {
uint16_t from;
uint16_t to;
};
struct witness_lock_order_data {
struct stack wlod_stack;
struct witness_lock_order_key wlod_key;
struct witness_lock_order_data *wlod_next;
};
/*
* The witness lock order data hash table. Keys are witness index tuples
* (struct witness_lock_order_key), elements are lock order data objects
* (struct witness_lock_order_data).
*/
struct witness_lock_order_hash {
struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
u_int wloh_size;
u_int wloh_count;
};
#ifdef BLESSING
struct witness_blessed {
const char *b_lock1;
const char *b_lock2;
};
#endif
struct witness_pendhelp {
const char *wh_type;
struct lock_object *wh_lock;
};
struct witness_order_list_entry {
const char *w_name;
struct lock_class *w_class;
};
/*
* Returns 0 if one of the locks is a spin lock and the other is not.
* Returns 1 otherwise.
*/
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{
return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}
static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
const struct witness_lock_order_key *b)
{
return (a->from == b->from && a->to == b->to);
}
static int _isitmyx(struct witness *w1, struct witness *w2, int rmask,
const char *fname);
static void adopt(struct witness *parent, struct witness *child);
#ifdef BLESSING
static int blessed(struct witness *, struct witness *);
#endif
static void depart(struct witness *w);
static struct witness *enroll(const char *description,
struct lock_class *lock_class);
static struct lock_instance *find_instance(struct lock_list_entry *list,
const struct lock_object *lock);
static int isitmychild(struct witness *parent, struct witness *child);
static int isitmydescendant(struct witness *parent, struct witness *child);
static void itismychild(struct witness *parent, struct witness *child);
static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
#ifdef DDB
static void witness_ddb_compute_levels(void);
static void witness_ddb_display(int(*)(const char *fmt, ...));
static void witness_ddb_display_descendants(int(*)(const char *fmt, ...),
struct witness *, int indent);
static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list);
static void witness_ddb_level_descendants(struct witness *parent, int l);
static void witness_ddb_list(struct thread *td);
#endif
static void witness_debugger(int cond, const char *msg);
static void witness_free(struct witness *m);
static struct witness *witness_get(void);
static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size);
static struct witness *witness_hash_get(const char *key);
static void witness_hash_put(struct witness *w);
static void witness_init_hash_tables(void);
static void witness_increment_graph_generation(void);
static void witness_lock_list_free(struct lock_list_entry *lle);
static struct lock_list_entry *witness_lock_list_get(void);
static int witness_lock_order_add(struct witness *parent,
struct witness *child);
static int witness_lock_order_check(struct witness *parent,
struct witness *child);
static struct witness_lock_order_data *witness_lock_order_get(
struct witness *parent,
struct witness *child);
static void witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...));
static int witness_output(const char *fmt, ...) __printflike(1, 2);
static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
static void witness_setflag(struct lock_object *lock, int flag, int set);
static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
"Witness Locking");
/*
* If set to 0, lock order checking is disabled. If set to -1,
* witness is completely disabled. Otherwise witness performs full
* lock order checking for all locks. At runtime, lock order checking
* may be toggled. However, witness cannot be reenabled once it is
* completely disabled.
*/
static int witness_watch = 1;
SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
sysctl_debug_witness_watch, "I", "witness is watching lock operations");
#ifdef KDB
/*
* When KDB is enabled and witness_kdb is 1, it will cause the system
* to drop into kdebug() when:
* - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
#ifdef WITNESS_KDB
int witness_kdb = 1;
#else
int witness_kdb = 0;
#endif
SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
#endif /* KDB */
#if defined(DDB) || defined(KDB)
/*
* When DDB or KDB is enabled and witness_trace is 1, it will cause the system
* to print a stack trace when:
* - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
int witness_trace = 1;
SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
#endif /* DDB || KDB */
#ifdef WITNESS_SKIPSPIN
int witness_skipspin = 1;
#else
int witness_skipspin = 0;
#endif
SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
int badstack_sbuf_size;
int witness_count = WITNESS_COUNT;
SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
&witness_count, 0, "");
/*
* Output channel for witness messages. By default we print to the console.
*/
enum witness_channel {
WITNESS_CONSOLE,
WITNESS_LOG,
WITNESS_NONE,
};
static enum witness_channel witness_channel = WITNESS_CONSOLE;
SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
"Output channel for warnings");
/*
* Call this to print out the relations between locks.
*/
SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
/*
* Call this to print out the faulty witness stacks.
*/
SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
static struct mtx w_mtx;
/* w_list */
static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
/* w_typelist */
static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
/* lock list */
static struct lock_list_entry *w_lock_list_free = NULL;
static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
static u_int pending_cnt;
static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
"");
static struct witness *w_data;
static uint8_t **w_rmatrix;
static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
static struct witness_hash w_hash; /* The witness hash table. */
/* The lock order data hash */
static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
static struct witness_lock_order_data *w_lofree = NULL;
static struct witness_lock_order_hash w_lohash;
static int w_max_used_index = 0;
static unsigned int w_generation = 0;
static const char w_notrunning[] = "Witness not running\n";
static const char w_stillcold[] = "Witness is still cold\n";
static struct witness_order_list_entry order_lists[] = {
/*
* sx locks
*/
{ "proctree", &lock_class_sx },
{ "allproc", &lock_class_sx },
{ "allprison", &lock_class_sx },
{ NULL, NULL },
/*
* Various mutexes
*/
{ "Giant", &lock_class_mtx_sleep },
{ "pipe mutex", &lock_class_mtx_sleep },
{ "sigio lock", &lock_class_mtx_sleep },
{ "process group", &lock_class_mtx_sleep },
{ "process lock", &lock_class_mtx_sleep },
{ "session", &lock_class_mtx_sleep },
{ "uidinfo hash", &lock_class_rw },
#ifdef HWPMC_HOOKS
{ "pmc-sleep", &lock_class_mtx_sleep },
#endif
{ "time lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* umtx
*/
{ "umtx lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* Sockets
*/
{ "accept", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ "so_rcv", &lock_class_mtx_sleep },
{ "sellck", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* Routing
*/
{ "so_rcv", &lock_class_mtx_sleep },
{ "radix node head", &lock_class_rw },
{ "rtentry", &lock_class_mtx_sleep },
{ "ifaddr", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* IPv4 multicast:
* protocol locks before interface locks, after UDP locks.
*/
{ "udpinp", &lock_class_rw },
{ "in_multi_mtx", &lock_class_mtx_sleep },
{ "igmp_mtx", &lock_class_mtx_sleep },
{ "if_addr_lock", &lock_class_rw },
{ NULL, NULL },
/*
* IPv6 multicast:
* protocol locks before interface locks, after UDP locks.
*/
{ "udpinp", &lock_class_rw },
{ "in6_multi_mtx", &lock_class_mtx_sleep },
{ "mld_mtx", &lock_class_mtx_sleep },
{ "if_addr_lock", &lock_class_rw },
{ NULL, NULL },
/*
* UNIX Domain Sockets
*/
{ "unp_link_rwlock", &lock_class_rw },
{ "unp_list_lock", &lock_class_mtx_sleep },
{ "unp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* UDP/IP
*/
{ "udp", &lock_class_rw },
{ "udpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
{ "tcp", &lock_class_rw },
{ "tcpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* BPF
*/
{ "bpf global lock", &lock_class_mtx_sleep },
{ "bpf interface lock", &lock_class_rw },
{ "bpf cdev lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* NFS server
*/
{ "nfsd_mtx", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* IEEE 802.11
*/
{ "802.11 com lock", &lock_class_mtx_sleep},
{ NULL, NULL },
/*
* Network drivers
*/
{ "network driver", &lock_class_mtx_sleep},
{ NULL, NULL },
/*
* Netgraph
*/
{ "ng_node", &lock_class_mtx_sleep },
{ "ng_worklist", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* VM
*/
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction
*/
{ "kqueue", &lock_class_mtx_sleep },
{ "struct mount mtx", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* VFS namecache
*/
{ "ncvn", &lock_class_mtx_sleep },
{ "ncbuc", &lock_class_rw },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "ncneg", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* ZFS locking
*/
{ "dn->dn_mtx", &lock_class_sx },
{ "dr->dt.di.dr_mtx", &lock_class_sx },
{ "db->db_mtx", &lock_class_sx },
{ NULL, NULL },
/*
* spin locks
*/
#ifdef SMP
{ "ap boot", &lock_class_mtx_spin },
#endif
{ "rm.mutex_mtx", &lock_class_mtx_spin },
{ "sio", &lock_class_mtx_spin },
#ifdef __i386__
{ "cy", &lock_class_mtx_spin },
#endif
#ifdef __sparc64__
{ "pcib_mtx", &lock_class_mtx_spin },
{ "rtc_mtx", &lock_class_mtx_spin },
#endif
{ "scc_hwmtx", &lock_class_mtx_spin },
{ "uart_hwmtx", &lock_class_mtx_spin },
{ "fast_taskqueue", &lock_class_mtx_spin },
{ "intr table", &lock_class_mtx_spin },
#ifdef HWPMC_HOOKS
{ "pmc-per-proc", &lock_class_mtx_spin },
#endif
{ "process slock", &lock_class_mtx_spin },
{ "syscons video lock", &lock_class_mtx_spin },
{ "sleepq chain", &lock_class_mtx_spin },
{ "rm_spinlock", &lock_class_mtx_spin },
{ "turnstile chain", &lock_class_mtx_spin },
{ "turnstile lock", &lock_class_mtx_spin },
{ "sched lock", &lock_class_mtx_spin },
{ "td_contested", &lock_class_mtx_spin },
{ "callout", &lock_class_mtx_spin },
{ "entropy harvest mutex", &lock_class_mtx_spin },
#ifdef SMP
{ "smp rendezvous", &lock_class_mtx_spin },
#endif
#ifdef __powerpc__
{ "tlb0", &lock_class_mtx_spin },
#endif
/*
* leaf locks
*/
{ "intrcnt", &lock_class_mtx_spin },
{ "icu", &lock_class_mtx_spin },
#if defined(SMP) && defined(__sparc64__)
{ "ipi", &lock_class_mtx_spin },
#endif
#ifdef __i386__
{ "allpmaps", &lock_class_mtx_spin },
{ "descriptor tables", &lock_class_mtx_spin },
#endif
{ "clk", &lock_class_mtx_spin },
{ "cpuset", &lock_class_mtx_spin },
{ "mprof lock", &lock_class_mtx_spin },
{ "zombie lock", &lock_class_mtx_spin },
{ "ALD Queue", &lock_class_mtx_spin },
#if defined(__i386__) || defined(__amd64__)
{ "pcicfg", &lock_class_mtx_spin },
{ "NDIS thread lock", &lock_class_mtx_spin },
#endif
{ "tw_osl_io_lock", &lock_class_mtx_spin },
{ "tw_osl_q_lock", &lock_class_mtx_spin },
{ "tw_cl_io_lock", &lock_class_mtx_spin },
{ "tw_cl_intr_lock", &lock_class_mtx_spin },
{ "tw_cl_gen_lock", &lock_class_mtx_spin },
#ifdef HWPMC_HOOKS
{ "pmc-leaf", &lock_class_mtx_spin },
#endif
{ "blocked lock", &lock_class_mtx_spin },
{ NULL, NULL },
{ NULL, NULL }
};
#ifdef BLESSING
/*
* Pairs of locks which have been blessed.
* Don't complain about order problems with blessed locks.
*/
static struct witness_blessed blessed_list[] = {
};
#endif
/*
* This global is set to 0 once it becomes safe to use the witness code.
*/
static int witness_cold = 1;
/*
* This global is set to 1 once the static lock orders have been enrolled
* so that a warning can be issued for any spin locks enrolled later.
*/
static int witness_spin_warn = 0;
/* Trim useless garbage from filenames. */
static const char *
fixup_filename(const char *file)
{
if (file == NULL)
return (NULL);
while (strncmp(file, "../", 3) == 0)
file += 3;
return (file);
}
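/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * function above only strips leading "../" components, e.g.:
 */
#if 0
const char *p = fixup_filename("../../../sys/kern/subr_witness.c");
/* p now points at "sys/kern/subr_witness.c". */
#endif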
/*
* The WITNESS-enabled diagnostic code. Note that the witness code assumes
* that early boot is single-threaded, at least until after this
* routine is completed.
*/
static void
witness_initialize(void *dummy __unused)
{
struct lock_object *lock;
struct witness_order_list_entry *order;
struct witness *w, *w1;
int i;
w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS,
M_WAITOK | M_ZERO);
w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1),
M_WITNESS, M_WAITOK | M_ZERO);
for (i = 0; i < witness_count + 1; i++) {
w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) *
(witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO);
}
badstack_sbuf_size = witness_count * 256;
/*
* We have to release Giant before initializing its witness
* structure so that WITNESS doesn't get confused.
*/
mtx_unlock(&Giant);
mtx_assert(&Giant, MA_NOTOWNED);
CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
MTX_NOWITNESS | MTX_NOPROFILE);
for (i = witness_count - 1; i >= 0; i--) {
w = &w_data[i];
memset(w, 0, sizeof(*w));
w_data[i].w_index = i; /* Witness index never changes. */
witness_free(w);
}
KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
("%s: Invalid list of free witness objects", __func__));
/* The witness with index 0 is left unused, to aid in debugging. */
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
for (i = 0; i < witness_count; i++) {
memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
for (i = 0; i < LOCK_CHILDCOUNT; i++)
witness_lock_list_free(&w_locklistdata[i]);
witness_init_hash_tables();
/* First add in all the specified order lists. */
for (order = order_lists; order->w_name != NULL; order++) {
w = enroll(order->w_name, order->w_class);
if (w == NULL)
continue;
w->w_file = "order list";
for (order++; order->w_name != NULL; order++) {
w1 = enroll(order->w_name, order->w_class);
if (w1 == NULL)
continue;
w1->w_file = "order list";
itismychild(w, w1);
w = w1;
}
}
witness_spin_warn = 1;
/* Iterate through all locks and add them to witness. */
for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
lock = pending_locks[i].wh_lock;
KASSERT(lock->lo_flags & LO_WITNESS,
("%s: lock %s is on pending list but not LO_WITNESS",
__func__, lock->lo_name));
lock->lo_witness = enroll(pending_locks[i].wh_type,
LOCK_CLASS(lock));
}
/* Mark the witness code as being ready for use. */
witness_cold = 0;
mtx_lock(&Giant);
}
SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
NULL);
void
witness_init(struct lock_object *lock, const char *type)
{
struct lock_class *class;
/* Various sanity checks. */
class = LOCK_CLASS(lock);
if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
(class->lc_flags & LC_RECURSABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be recursable",
__func__, class->lc_name, lock->lo_name);
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(class->lc_flags & LC_SLEEPABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be sleepable",
__func__, class->lc_name, lock->lo_name);
if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
(class->lc_flags & LC_UPGRADABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be upgradable",
__func__, class->lc_name, lock->lo_name);
/*
* If we shouldn't watch this lock, then just clear lo_witness.
* Otherwise, if witness_cold is set, then it is too early to
* enroll this lock, so defer it to witness_initialize() by adding
* it to the pending_locks list. If it is not too early, then enroll
* the lock now.
*/
if (witness_watch < 1 || panicstr != NULL ||
(lock->lo_flags & LO_WITNESS) == 0)
lock->lo_witness = NULL;
else if (witness_cold) {
pending_locks[pending_cnt].wh_lock = lock;
pending_locks[pending_cnt++].wh_type = type;
if (pending_cnt > WITNESS_PENDLIST)
panic("%s: pending locks list is too small, "
"increase WITNESS_PENDLIST\n",
__func__);
} else
lock->lo_witness = enroll(type, class);
}
void
witness_destroy(struct lock_object *lock)
{
struct lock_class *class;
struct witness *w;
class = LOCK_CLASS(lock);
if (witness_cold)
panic("lock (%s) %s destroyed while witness_cold",
class->lc_name, lock->lo_name);
/* XXX: need to verify that no one holds the lock */
if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
return;
w = lock->lo_witness;
mtx_lock_spin(&w_mtx);
MPASS(w->w_refcount > 0);
w->w_refcount--;
if (w->w_refcount == 0)
depart(w);
mtx_unlock_spin(&w_mtx);
}
#ifdef DDB
static void
witness_ddb_compute_levels(void)
{
struct witness *w;
/*
* First clear all levels.
*/
STAILQ_FOREACH(w, &w_all, w_list)
w->w_ddb_level = -1;
/*
* Look for locks with no parents and level all their descendants.
*/
STAILQ_FOREACH(w, &w_all, w_list) {
/* If the witness has ancestors (is not a root), skip it. */
if (w->w_num_ancestors > 0)
continue;
witness_ddb_level_descendants(w, 0);
}
}
static void
witness_ddb_level_descendants(struct witness *w, int l)
{
int i;
if (w->w_ddb_level >= l)
return;
w->w_ddb_level = l;
l++;
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_level_descendants(&w_data[i], l);
}
}
static void
witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
struct witness *w, int indent)
{
int i;
for (i = 0; i < indent; i++)
prnt(" ");
prnt("%s (type: %s, depth: %d, active refs: %d)",
w->w_name, w->w_class->lc_name,
w->w_ddb_level, w->w_refcount);
if (w->w_displayed) {
prnt(" -- (already displayed)\n");
return;
}
w->w_displayed = 1;
if (w->w_file != NULL && w->w_line != 0)
prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
w->w_line);
else
prnt(" -- never acquired\n");
indent++;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (db_pager_quit)
return;
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_display_descendants(prnt, &w_data[i],
indent);
}
}
static void
witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list)
{
struct witness *w;
STAILQ_FOREACH(w, list, w_typelist) {
if (w->w_file == NULL || w->w_ddb_level > 0)
continue;
/* This lock has no ancestors - display its descendants. */
witness_ddb_display_descendants(prnt, w, 0);
if (db_pager_quit)
return;
}
}
static void
witness_ddb_display(int(*prnt)(const char *fmt, ...))
{
struct witness *w;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
witness_ddb_compute_levels();
/* Clear all the displayed flags. */
STAILQ_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
/*
* First, handle sleep locks which have been acquired at least
* once.
*/
prnt("Sleep locks:\n");
witness_ddb_display_list(prnt, &w_sleep);
if (db_pager_quit)
return;
/*
* Now do spin locks which have been acquired at least once.
*/
prnt("\nSpin locks:\n");
witness_ddb_display_list(prnt, &w_spin);
if (db_pager_quit)
return;
/*
* Finally, any locks which have not been acquired yet.
*/
prnt("\nLocks which were never acquired:\n");
STAILQ_FOREACH(w, &w_all, w_list) {
if (w->w_file != NULL || w->w_refcount == 0)
continue;
prnt("%s (type: %s, depth: %d)\n", w->w_name,
w->w_class->lc_name, w->w_ddb_level);
if (db_pager_quit)
return;
}
}
#endif /* DDB */
int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{
if (witness_watch == -1 || panicstr != NULL)
return (0);
/* Require locks that witness knows about. */
if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
lock2->lo_witness == NULL)
return (EINVAL);
mtx_assert(&w_mtx, MA_NOTOWNED);
mtx_lock_spin(&w_mtx);
/*
* If we already have either an explicit or implied lock order that
* is the other way around, then return an error.
*/
if (witness_watch &&
isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
mtx_unlock_spin(&w_mtx);
return (EDOOFUS);
}
/* Try to add the new order. */
CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
lock2->lo_witness->w_name, lock1->lo_witness->w_name);
itismychild(lock1->lo_witness, lock2->lo_witness);
mtx_unlock_spin(&w_mtx);
return (0);
}
void
witness_checkorder(struct lock_object *lock, int flags, const char *file,
int line, struct lock_object *interlock)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1, *lock2, *plock;
struct lock_class *class, *iclass;
struct witness *w, *w1;
struct thread *td;
int i, j;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
panicstr != NULL)
return;
w = lock->lo_witness;
class = LOCK_CLASS(lock);
td = curthread;
if (class->lc_flags & LC_SLEEPLOCK) {
/*
* Since spin locks include a critical section, this check
* implicitly enforces a lock order of all sleep locks before
* all spin locks.
*/
if (td->td_critnest != 0 && !kdb_active)
kassert_panic("acquiring blockable sleep lock with "
"spinlock or critical section held (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
/*
* If this is the first lock acquired then just return as
* no order checking is needed.
*/
lock_list = td->td_sleeplocks;
if (lock_list == NULL || lock_list->ll_count == 0)
return;
} else {
/*
* If this is the first lock, just return as no order
* checking is needed. Avoid problems with thread
* migration by pinning the thread while checking if
* spinlocks are held. If at least one spinlock is held,
* the thread is on a safe path and may be unpinned.
*/
sched_pin();
lock_list = PCPU_GET(spinlocks);
if (lock_list == NULL || lock_list->ll_count == 0) {
sched_unpin();
return;
}
sched_unpin();
}
/*
* Check to see if we are recursing on a lock we already own. If
* so, make sure that we don't mismatch exclusive and shared lock
* acquires.
*/
lock1 = find_instance(lock_list, lock);
if (lock1 != NULL) {
if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
witness_output("shared lock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
witness_output("while exclusively locked from %s:%d\n",
fixup_filename(lock1->li_file), lock1->li_line);
kassert_panic("excl->share");
}
if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
witness_output("exclusive lock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
witness_output("while share locked from %s:%d\n",
fixup_filename(lock1->li_file), lock1->li_line);
kassert_panic("share->excl");
}
return;
}
/* Warn if the interlock is not locked exactly once. */
if (interlock != NULL) {
iclass = LOCK_CLASS(interlock);
lock1 = find_instance(lock_list, interlock);
if (lock1 == NULL)
kassert_panic("interlock (%s) %s not locked @ %s:%d",
iclass->lc_name, interlock->lo_name,
fixup_filename(file), line);
else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
kassert_panic("interlock (%s) %s recursed @ %s:%d",
iclass->lc_name, interlock->lo_name,
fixup_filename(file), line);
}
/*
* Find the previously acquired lock, but ignore interlocks.
*/
plock = &lock_list->ll_children[lock_list->ll_count - 1];
if (interlock != NULL && plock->li_lock == interlock) {
if (lock_list->ll_count > 1)
plock =
&lock_list->ll_children[lock_list->ll_count - 2];
else {
lle = lock_list->ll_next;
/*
* The interlock is the only lock we hold, so
* simply return.
*/
if (lle == NULL)
return;
plock = &lle->ll_children[lle->ll_count - 1];
}
}
/*
* Try to perform most checks without a lock. If this succeeds we
* can skip acquiring the lock and return success. Otherwise we redo
* the check with the lock held to handle races with concurrent updates.
*/
w1 = plock->li_lock->lo_witness;
if (witness_lock_order_check(w1, w))
return;
mtx_lock_spin(&w_mtx);
if (witness_lock_order_check(w1, w)) {
mtx_unlock_spin(&w_mtx);
return;
}
witness_lock_order_add(w1, w);
/*
* Check for duplicate locks of the same type. Note that we only
* have to check for this on the last lock we just acquired. Any
* other cases will be caught as lock order violations.
*/
if (w1 == w) {
i = w->w_index;
if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
!(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
w_rmatrix[i][i] |= WITNESS_REVERSAL;
w->w_reversed = 1;
mtx_unlock_spin(&w_mtx);
witness_output(
"acquiring duplicate lock of same type: \"%s\"\n",
w->w_name);
witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
fixup_filename(plock->li_file), plock->li_line);
witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
fixup_filename(file), line);
witness_debugger(1, __func__);
} else
mtx_unlock_spin(&w_mtx);
return;
}
mtx_assert(&w_mtx, MA_OWNED);
/*
* If we know that the lock we are acquiring comes after
* the lock we most recently acquired in the lock order tree,
* then there is no need for any further checks.
*/
if (isitmychild(w1, w))
goto out;
for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
for (i = lle->ll_count - 1; i >= 0; i--, j++) {
MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
lock1 = &lle->ll_children[i];
/*
* Ignore the interlock.
*/
if (interlock == lock1->li_lock)
continue;
/*
* If this lock doesn't undergo witness checking,
* then skip it.
*/
w1 = lock1->li_lock->lo_witness;
if (w1 == NULL) {
KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
("lock missing witness structure"));
continue;
}
/*
* If we are locking Giant and this is a sleepable
* lock, then skip it.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
* is Giant, then skip it.
*/
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
lock1->li_lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
* isn't sleepable, we want to treat it as a lock
* order violation to enforce a general lock order of
* sleepable locks before non-sleepable locks.
*/
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
goto reversal;
/*
* If we are locking Giant and this is a non-sleepable
* lock, then treat it as a reversal.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
lock == &Giant.lock_object)
goto reversal;
/*
* Check the lock order hierarchy for a reversal.
*/
if (!isitmydescendant(w, w1))
continue;
reversal:
/*
* We have a lock order violation, check to see if it
* is allowed or has already been yelled about.
*/
#ifdef BLESSING
/*
* If the lock order is blessed, just bail. We don't
* look for other lock order violations though, which
* may be a bug.
*/
if (blessed(w, w1))
goto out;
#endif
/* Bail if this violation is known */
if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
goto out;
/* Record this as a violation */
w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
w->w_reversed = w1->w_reversed = 1;
witness_increment_graph_generation();
mtx_unlock_spin(&w_mtx);
#ifdef WITNESS_NO_VNODE
/*
* There are known LORs between VNODE locks. They are
* not an indication of a bug. VNODE locks are flagged
* as such (LO_IS_VNODE) and we don't yell if the LOR
* is between 2 VNODE locks.
*/
if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
(lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
return;
#endif
/*
* Ok, yell about it.
*/
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
witness_output(
"lock order reversal: (sleepable after non-sleepable)\n");
else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
&& lock == &Giant.lock_object)
witness_output(
"lock order reversal: (Giant after non-sleepable)\n");
else
witness_output("lock order reversal:\n");
/*
* Try to locate an earlier lock with
* witness w in our list.
*/
do {
lock2 = &lle->ll_children[i];
MPASS(lock2->li_lock != NULL);
if (lock2->li_lock->lo_witness == w)
break;
if (i == 0 && lle->ll_next != NULL) {
lle = lle->ll_next;
i = lle->ll_count - 1;
MPASS(i >= 0 && i < LOCK_NCHILDREN);
} else
i--;
} while (i >= 0);
if (i < 0) {
witness_output(" 1st %p %s (%s) @ %s:%d\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_name, fixup_filename(lock1->li_file),
lock1->li_line);
witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
lock->lo_name, w->w_name,
fixup_filename(file), line);
} else {
witness_output(" 1st %p %s (%s) @ %s:%d\n",
lock2->li_lock, lock2->li_lock->lo_name,
lock2->li_lock->lo_witness->w_name,
fixup_filename(lock2->li_file),
lock2->li_line);
witness_output(" 2nd %p %s (%s) @ %s:%d\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_name, fixup_filename(lock1->li_file),
lock1->li_line);
witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
lock->lo_name, w->w_name,
fixup_filename(file), line);
}
witness_debugger(1, __func__);
return;
}
}
/*
* If requested, build a new lock order. However, don't build a new
* relationship between a sleepable lock and Giant if it is in the
* wrong direction. The correct lock order is that sleepable locks
* always come before Giant.
*/
if (flags & LOP_NEWORDER &&
!(plock->li_lock == &Giant.lock_object &&
(lock->lo_flags & LO_SLEEPABLE) != 0)) {
CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
w->w_name, plock->li_lock->lo_witness->w_name);
itismychild(plock->li_lock->lo_witness, w);
}
out:
mtx_unlock_spin(&w_mtx);
}
void
witness_lock(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct witness *w;
struct thread *td;
if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
panicstr != NULL)
return;
w = lock->lo_witness;
td = curthread;
/* Determine lock list for this lock. */
if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
lock_list = &td->td_sleeplocks;
else
lock_list = PCPU_PTR(spinlocks);
/* Check to see if we are recursing on a lock we already own. */
instance = find_instance(*lock_list, lock);
if (instance != NULL) {
instance->li_flags++;
CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
td->td_proc->p_pid, lock->lo_name,
instance->li_flags & LI_RECURSEMASK);
instance->li_file = file;
instance->li_line = line;
return;
}
/* Update per-witness last file and line acquire. */
w->w_file = file;
w->w_line = line;
/* Find the next open lock instance in the list and fill it. */
lle = *lock_list;
if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
lle = witness_lock_list_get();
if (lle == NULL)
return;
lle->ll_next = *lock_list;
CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
td->td_proc->p_pid, lle);
*lock_list = lle;
}
instance = &lle->ll_children[lle->ll_count++];
instance->li_lock = lock;
instance->li_line = line;
instance->li_file = file;
if ((flags & LOP_EXCLUSIVE) != 0)
instance->li_flags = LI_EXCLUSIVE;
else
instance->li_flags = 0;
CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
}
void
witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_instance *instance;
struct lock_class *class;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
kassert_panic(
"upgrade of non-upgradable lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((class->lc_flags & LC_SLEEPLOCK) == 0)
kassert_panic(
"upgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
}
instance = find_instance(curthread->td_sleeplocks, lock);
if (instance == NULL) {
kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
return;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) != 0)
kassert_panic(
"upgrade of exclusive lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic(
"upgrade of recursed lock (%s) %s r=%d @ %s:%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK,
fixup_filename(file), line);
}
instance->li_flags |= LI_EXCLUSIVE;
}
void
witness_downgrade(struct lock_object *lock, int flags, const char *file,
int line)
{
struct lock_instance *instance;
struct lock_class *class;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
kassert_panic(
"downgrade of non-upgradable lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((class->lc_flags & LC_SLEEPLOCK) == 0)
kassert_panic(
"downgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
}
instance = find_instance(curthread->td_sleeplocks, lock);
if (instance == NULL) {
kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
return;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) == 0)
kassert_panic(
"downgrade of shared lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic(
"downgrade of recursed lock (%s) %s r=%d @ %s:%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK,
fixup_filename(file), line);
}
instance->li_flags &= ~LI_EXCLUSIVE;
}
void
witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct lock_class *class;
struct thread *td;
register_t s;
int i, j;
if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
return;
td = curthread;
class = LOCK_CLASS(lock);
/* Find lock instance associated with this lock. */
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = &td->td_sleeplocks;
else
lock_list = PCPU_PTR(spinlocks);
lle = *lock_list;
for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
for (i = 0; i < (*lock_list)->ll_count; i++) {
instance = &(*lock_list)->ll_children[i];
if (instance->li_lock == lock)
goto found;
}
/*
* When WITNESS is disabled through witness_watch, locks can remain
* registered in the td_sleeplocks queue.
* We have to make sure these queues get flushed, so just search for
* any leftover registered locks and remove them.
*/
if (witness_watch > 0) {
kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
lock->lo_name, fixup_filename(file), line);
return;
} else {
return;
}
found:
/* First, check for shared/exclusive mismatches. */
if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
witness_output("shared unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
witness_output("while exclusively locked from %s:%d\n",
fixup_filename(instance->li_file), instance->li_line);
kassert_panic("excl->ushare");
}
if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
witness_output("while share locked from %s:%d\n",
fixup_filename(instance->li_file),
instance->li_line);
kassert_panic("share->uexcl");
}
/* If we are recursed, unrecurse. */
if ((instance->li_flags & LI_RECURSEMASK) > 0) {
CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
td->td_proc->p_pid, instance->li_lock->lo_name,
instance->li_flags);
instance->li_flags--;
return;
}
/* The lock is now being dropped, check for NORELEASE flag */
if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
kassert_panic("lock marked norelease");
}
/* Otherwise, remove this item from the list. */
s = intr_disable();
CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
td->td_proc->p_pid, instance->li_lock->lo_name,
(*lock_list)->ll_count - 1);
for (j = i; j < (*lock_list)->ll_count - 1; j++)
(*lock_list)->ll_children[j] =
(*lock_list)->ll_children[j + 1];
(*lock_list)->ll_count--;
intr_restore(s);
/*
* In order to reduce contention on w_mtx, we always want to keep a
* head object in the list so that frequent allocation from the
* free witness pool (and the subsequent locking) is avoided.
* To keep the code simple, an empty head object also means that
* there are no further objects in the list, so list ownership needs
* to be handed over to another object whenever the current head is
* freed.
*/
if ((*lock_list)->ll_count == 0) {
if (*lock_list == lle) {
if (lle->ll_next == NULL)
return;
} else
lle = *lock_list;
*lock_list = lle->ll_next;
CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
td->td_proc->p_pid, lle);
witness_lock_list_free(lle);
}
}
void
witness_thread_exit(struct thread *td)
{
struct lock_list_entry *lle;
int i, n;
lle = td->td_sleeplocks;
if (lle == NULL || panicstr != NULL)
return;
if (lle->ll_count != 0) {
for (n = 0; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
if (n == 0)
witness_output(
"Thread %p exiting with the following locks held:\n", td);
n++;
witness_list_lock(&lle->ll_children[i],
witness_output);
}
kassert_panic(
"Thread %p cannot exit while holding sleeplocks\n", td);
}
witness_lock_list_free(lle);
}
/*
* Warn if any locks other than 'lock' are held. Flags can be passed in to
* exempt Giant and sleepable locks from the checks as well. If any
* non-exempt locks are held, then a supplied message is printed to the
* output channel along with a list of the offending locks. If indicated in the
* flags then a failure results in a panic as well.
*/
int
witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1;
struct thread *td;
va_list ap;
int i, n;
if (witness_cold || witness_watch < 1 || panicstr != NULL)
return (0);
n = 0;
td = curthread;
for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
lock1 = &lle->ll_children[i];
if (lock1->li_lock == lock)
continue;
if (flags & WARN_GIANTOK &&
lock1->li_lock == &Giant.lock_object)
continue;
if (flags & WARN_SLEEPOK &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
continue;
if (n == 0) {
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ?
"non-sleepable " : "");
}
n++;
witness_list_lock(lock1, printf);
}
/*
* Pin the thread in order to avoid problems with thread migration.
* Once all the checks on spinlock ownership have passed, the thread
* is on a safe path and can be unpinned.
*/
sched_pin();
lock_list = PCPU_GET(spinlocks);
if (lock_list != NULL && lock_list->ll_count != 0) {
sched_unpin();
/*
* We should only have one spinlock and, since the exemption flags
* cannot apply to this lock class, check whether the first
* spinlock is the one curthread should hold.
*/
lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
lock1->li_lock == lock && n == 0)
return (0);
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : "");
n += witness_list_locks(&lock_list, printf);
} else
sched_unpin();
if (flags & WARN_PANIC && n)
kassert_panic("%s", __func__);
else
witness_debugger(n, __func__);
return (n);
}
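/*
 * Usage sketch (annotation, not part of this diff): callers normally reach
 * witness_warn() through the WITNESS_WARN() wrapper macro.  A sleep path,
 * for instance, might check for stray locks with something like
 *
 *	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 *	    "sleeping on \"%s\"", wmesg);
 *
 * which exempts Giant and sleepable locks and, because WARN_PANIC is not
 * set, only reports any offenders instead of panicking.  The call shape and
 * the wmesg variable are illustrative and not taken from this file.
 */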
const char *
witness_file(struct lock_object *lock)
{
struct witness *w;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
return ("?");
w = lock->lo_witness;
return (w->w_file);
}
int
witness_line(struct lock_object *lock)
{
struct witness *w;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
return (0);
w = lock->lo_witness;
return (w->w_line);
}
static struct witness *
enroll(const char *description, struct lock_class *lock_class)
{
struct witness *w;
- struct witness_list *typelist;
MPASS(description != NULL);
if (witness_watch == -1 || panicstr != NULL)
return (NULL);
if ((lock_class->lc_flags & LC_SPINLOCK)) {
if (witness_skipspin)
return (NULL);
- else
- typelist = &w_spin;
- } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
- typelist = &w_sleep;
- } else {
+ } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) {
kassert_panic("lock class %s is not sleep or spin",
lock_class->lc_name);
return (NULL);
}
mtx_lock_spin(&w_mtx);
w = witness_hash_get(description);
if (w)
goto found;
if ((w = witness_get()) == NULL)
return (NULL);
MPASS(strlen(description) < MAX_W_NAME);
strcpy(w->w_name, description);
w->w_class = lock_class;
w->w_refcount = 1;
STAILQ_INSERT_HEAD(&w_all, w, w_list);
if (lock_class->lc_flags & LC_SPINLOCK) {
STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
w_spin_cnt++;
} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
w_sleep_cnt++;
}
/* Insert new witness into the hash */
witness_hash_put(w);
witness_increment_graph_generation();
mtx_unlock_spin(&w_mtx);
return (w);
found:
w->w_refcount++;
if (w->w_refcount == 1)
w->w_class = lock_class;
mtx_unlock_spin(&w_mtx);
if (lock_class != w->w_class)
kassert_panic(
"lock (%s) %s does not match earlier (%s) lock",
description, lock_class->lc_name,
w->w_class->lc_name);
return (w);
}
static void
depart(struct witness *w)
{
- struct witness_list *list;
MPASS(w->w_refcount == 0);
if (w->w_class->lc_flags & LC_SLEEPLOCK) {
- list = &w_sleep;
w_sleep_cnt--;
} else {
- list = &w_spin;
w_spin_cnt--;
}
/*
* Set file to NULL as it may point into a loadable module.
*/
w->w_file = NULL;
w->w_line = 0;
witness_increment_graph_generation();
}
static void
adopt(struct witness *parent, struct witness *child)
{
int pi, ci, i, j;
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
/* If the relationship is already known, there's no work to be done. */
if (isitmychild(parent, child))
return;
/* When the structure of the graph changes, bump up the generation. */
witness_increment_graph_generation();
/*
* The hard part ... create the direct relationship, then propagate all
* indirect relationships.
*/
pi = parent->w_index;
ci = child->w_index;
WITNESS_INDEX_ASSERT(pi);
WITNESS_INDEX_ASSERT(ci);
MPASS(pi != ci);
w_rmatrix[pi][ci] |= WITNESS_PARENT;
w_rmatrix[ci][pi] |= WITNESS_CHILD;
/*
* If parent was not already an ancestor of child,
* then we increment the descendant and ancestor counters.
*/
if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
parent->w_num_descendants++;
child->w_num_ancestors++;
}
/*
* Find each ancestor of 'pi'. Note that 'pi' itself is counted as
* an ancestor of 'pi' during this loop.
*/
for (i = 1; i <= w_max_used_index; i++) {
if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
(i != pi))
continue;
/* Find each descendant of 'i' and mark it as a descendant. */
for (j = 1; j <= w_max_used_index; j++) {
/*
* Skip children that are already marked as
* descendants of 'i'.
*/
if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
continue;
/*
* We are only interested in descendants of 'ci'. Note
* that 'ci' itself is counted as a descendant of 'ci'.
*/
if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
(j != ci))
continue;
w_rmatrix[i][j] |= WITNESS_ANCESTOR;
w_rmatrix[j][i] |= WITNESS_DESCENDANT;
w_data[i].w_num_descendants++;
w_data[j].w_num_ancestors++;
/*
* Make sure we aren't marking a node as both an
* ancestor and descendant. We should have caught
* this as a lock order reversal earlier.
*/
if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
i, j, w_rmatrix[i][j]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
j, i, w_rmatrix[j][i]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
}
}
}
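/*
 * Worked example (annotation, not in the original source): suppose the
 * matrix already records that A is an ancestor of B, and adopt(B, C) is
 * then called.  Besides setting the direct B/C parent and child bits, the
 * nested loops above also mark A as an ancestor of C and C as a descendant
 * of A, so w_rmatrix always holds the transitive closure of the lock order
 * graph.  A, B and C are hypothetical witnesses used only for illustration.
 */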
static void
itismychild(struct witness *parent, struct witness *child)
{
int unlocked;
MPASS(child != NULL && parent != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
if (!witness_lock_type_equal(parent, child)) {
if (witness_cold == 0) {
unlocked = 1;
mtx_unlock_spin(&w_mtx);
} else {
unlocked = 0;
}
kassert_panic(
"%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
"the same lock type", __func__, parent->w_name,
parent->w_class->lc_name, child->w_name,
child->w_class->lc_name);
if (unlocked)
mtx_lock_spin(&w_mtx);
}
adopt(parent, child);
}
/*
* Generic code for the isitmy*() functions. The rmask parameter is the
* expected relationship of w1 to w2.
*/
static int
_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
{
unsigned char r1, r2;
int i1, i2;
i1 = w1->w_index;
i2 = w2->w_index;
WITNESS_INDEX_ASSERT(i1);
WITNESS_INDEX_ASSERT(i2);
r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
/* The flags on one better be the inverse of the flags on the other */
if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
(WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
/* Don't squawk if we're potentially racing with an update. */
if (!mtx_owned(&w_mtx))
return (0);
printf("%s: rmatrix mismatch between %s (index %d) and %s "
"(index %d): w_rmatrix[%d][%d] == %hhx but "
"w_rmatrix[%d][%d] == %hhx\n",
fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
i2, i1, r2);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
return (r1 & rmask);
}
/*
* Checks if @child is a direct child of @parent.
*/
static int
isitmychild(struct witness *parent, struct witness *child)
{
return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}
/*
* Checks if @descendant is a direct or indirect descendant of @ancestor.
*/
static int
isitmydescendant(struct witness *ancestor, struct witness *descendant)
{
return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
__func__));
}
#ifdef BLESSING
static int
blessed(struct witness *w1, struct witness *w2)
{
int i;
struct witness_blessed *b;
for (i = 0; i < nitems(blessed_list); i++) {
b = &blessed_list[i];
if (strcmp(w1->w_name, b->b_lock1) == 0) {
if (strcmp(w2->w_name, b->b_lock2) == 0)
return (1);
continue;
}
if (strcmp(w1->w_name, b->b_lock2) == 0)
if (strcmp(w2->w_name, b->b_lock1) == 0)
return (1);
}
return (0);
}
#endif
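/*
 * Annotation (assumption, not part of this diff): when BLESSING is enabled,
 * blessed_list is simply an array of witness name pairs defined earlier in
 * this file, conceptually along the lines of
 *
 *	static struct witness_blessed blessed_list[] = {
 *		{ "hypothetical lock A", "hypothetical lock B" },
 *	};
 *
 * and blessed() above treats each pair as order-agnostic, matching either
 * b_lock1/b_lock2 ordering.  The entry shown is a made-up placeholder.
 */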
static struct witness *
witness_get(void)
{
struct witness *w;
int index;
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
if (witness_watch == -1) {
mtx_unlock_spin(&w_mtx);
return (NULL);
}
if (STAILQ_EMPTY(&w_free)) {
witness_watch = -1;
mtx_unlock_spin(&w_mtx);
printf("WITNESS: unable to allocate a new witness object\n");
return (NULL);
}
w = STAILQ_FIRST(&w_free);
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
index = w->w_index;
MPASS(index > 0 && index == w_max_used_index+1 &&
index < witness_count);
bzero(w, sizeof(*w));
w->w_index = index;
if (index > w_max_used_index)
w_max_used_index = index;
return (w);
}
static void
witness_free(struct witness *w)
{
STAILQ_INSERT_HEAD(&w_free, w, w_list);
w_free_cnt++;
}
static struct lock_list_entry *
witness_lock_list_get(void)
{
struct lock_list_entry *lle;
if (witness_watch == -1)
return (NULL);
mtx_lock_spin(&w_mtx);
lle = w_lock_list_free;
if (lle == NULL) {
witness_watch = -1;
mtx_unlock_spin(&w_mtx);
printf("%s: witness exhausted\n", __func__);
return (NULL);
}
w_lock_list_free = lle->ll_next;
mtx_unlock_spin(&w_mtx);
bzero(lle, sizeof(*lle));
return (lle);
}
static void
witness_lock_list_free(struct lock_list_entry *lle)
{
mtx_lock_spin(&w_mtx);
lle->ll_next = w_lock_list_free;
w_lock_list_free = lle;
mtx_unlock_spin(&w_mtx);
}
static struct lock_instance *
find_instance(struct lock_list_entry *list, const struct lock_object *lock)
{
struct lock_list_entry *lle;
struct lock_instance *instance;
int i;
for (lle = list; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
instance = &lle->ll_children[i];
if (instance->li_lock == lock)
return (instance);
}
return (NULL);
}
static void
witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...))
{
struct lock_object *lock;
lock = instance->li_lock;
prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
"exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
if (lock->lo_witness->w_name != lock->lo_name)
prnt(" (%s)", lock->lo_witness->w_name);
prnt(" r = %d (%p) locked @ %s:%d\n",
instance->li_flags & LI_RECURSEMASK, lock,
fixup_filename(instance->li_file), instance->li_line);
}
static int
witness_output(const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
ret = witness_voutput(fmt, ap);
va_end(ap);
return (ret);
}
static int
witness_voutput(const char *fmt, va_list ap)
{
int ret;
ret = 0;
switch (witness_channel) {
case WITNESS_CONSOLE:
ret = vprintf(fmt, ap);
break;
case WITNESS_LOG:
vlog(LOG_NOTICE, fmt, ap);
break;
case WITNESS_NONE:
break;
}
return (ret);
}
#ifdef DDB
static int
witness_thread_has_locks(struct thread *td)
{
if (td->td_sleeplocks == NULL)
return (0);
return (td->td_sleeplocks->ll_count != 0);
}
static int
witness_proc_has_locks(struct proc *p)
{
struct thread *td;
FOREACH_THREAD_IN_PROC(p, td) {
if (witness_thread_has_locks(td))
return (1);
}
return (0);
}
#endif
int
witness_list_locks(struct lock_list_entry **lock_list,
int (*prnt)(const char *fmt, ...))
{
struct lock_list_entry *lle;
int i, nheld;
nheld = 0;
for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
witness_list_lock(&lle->ll_children[i], prnt);
nheld++;
}
return (nheld);
}
/*
* This is a bit risky at best. We call this function when we have timed
* out acquiring a spin lock, and we assume that the other CPU is stuck
* with this lock held. So, we go groveling around in the other CPU's
* per-cpu data to try to find the lock instance for this spin lock to
* see when it was last acquired.
*/
void
witness_display_spinlock(struct lock_object *lock, struct thread *owner,
int (*prnt)(const char *fmt, ...))
{
struct lock_instance *instance;
struct pcpu *pc;
if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
return;
pc = pcpu_find(owner->td_oncpu);
instance = find_instance(pc->pc_spinlocks, lock);
if (instance != NULL)
witness_list_lock(instance, prnt);
}
void
witness_save(struct lock_object *lock, const char **filep, int *linep)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
/*
* This function is used independently in locking code to deal with
* Giant; the SCHEDULER_STOPPED() check can be removed here once Giant
* is gone.
*/
if (SCHEDULER_STOPPED())
return;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL) {
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
return;
}
*filep = instance->li_file;
*linep = instance->li_line;
}
void
witness_restore(struct lock_object *lock, const char *file, int line)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
/*
* This function is used independently in locking code to deal with
* Giant; the SCHEDULER_STOPPED() check can be removed here once Giant
* is gone.
*/
if (SCHEDULER_STOPPED())
return;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL)
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
lock->lo_witness->w_file = file;
lock->lo_witness->w_line = line;
if (instance == NULL)
return;
instance->li_file = file;
instance->li_line = line;
}
void
witness_assert(const struct lock_object *lock, int flags, const char *file,
int line)
{
#ifdef INVARIANT_SUPPORT
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if ((class->lc_flags & LC_SLEEPLOCK) != 0)
instance = find_instance(curthread->td_sleeplocks, lock);
else if ((class->lc_flags & LC_SPINLOCK) != 0)
instance = find_instance(PCPU_GET(spinlocks), lock);
else {
kassert_panic("Lock (%s) %s is not sleep or spin!",
class->lc_name, lock->lo_name);
return;
}
switch (flags) {
case LA_UNLOCKED:
if (instance != NULL)
kassert_panic("Lock (%s) %s locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
case LA_LOCKED:
case LA_LOCKED | LA_RECURSED:
case LA_LOCKED | LA_NOTRECURSED:
case LA_SLOCKED:
case LA_SLOCKED | LA_RECURSED:
case LA_SLOCKED | LA_NOTRECURSED:
case LA_XLOCKED:
case LA_XLOCKED | LA_RECURSED:
case LA_XLOCKED | LA_NOTRECURSED:
if (instance == NULL) {
kassert_panic("Lock (%s) %s not locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
}
if ((flags & LA_XLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) == 0)
kassert_panic(
"Lock (%s) %s not exclusively locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_SLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) != 0)
kassert_panic(
"Lock (%s) %s exclusively locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_RECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) == 0)
kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_NOTRECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic("Lock (%s) %s recursed @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
default:
kassert_panic("Invalid lock assertion at %s:%d.",
fixup_filename(file), line);
}
#endif /* INVARIANT_SUPPORT */
}
static void
witness_setflag(struct lock_object *lock, int flag, int set)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL) {
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
return;
}
if (set)
instance->li_flags |= flag;
else
instance->li_flags &= ~flag;
}
void
witness_norelease(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 1);
}
void
witness_releaseok(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 0);
}
#ifdef DDB
static void
witness_ddb_list(struct thread *td)
{
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
KASSERT(kdb_active, ("%s: not in the debugger", __func__));
if (witness_watch < 1)
return;
witness_list_locks(&td->td_sleeplocks, db_printf);
/*
* We only handle spinlocks if td == curthread. This is somewhat broken
* if td is currently executing on some other CPU and holds spin locks
* as we won't display those locks. If we had a MI way of getting
* the per-cpu data for a given cpu then we could use
* td->td_oncpu to get the list of spinlocks for this thread
* and "fix" this.
*
* That still wouldn't really fix this unless we locked the scheduler
* lock or stopped the other CPU to make sure it wasn't changing the
* list out from under us. It is probably best to just not try to
* handle threads on other CPUs for now.
*/
if (td == curthread && PCPU_GET(spinlocks) != NULL)
witness_list_locks(PCPU_PTR(spinlocks), db_printf);
}
DB_SHOW_COMMAND(locks, db_witness_list)
{
struct thread *td;
if (have_addr)
td = db_lookup_thread(addr, true);
else
td = kdb_thread;
witness_ddb_list(td);
}
DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
{
struct thread *td;
struct proc *p;
/*
* It would be nice to list only threads and processes that actually
* hold sleep locks, but that information is currently not exported
* by WITNESS.
*/
FOREACH_PROC_IN_SYSTEM(p) {
if (!witness_proc_has_locks(p))
continue;
FOREACH_THREAD_IN_PROC(p, td) {
if (!witness_thread_has_locks(td))
continue;
db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
p->p_comm, td, td->td_tid);
witness_ddb_list(td);
if (db_pager_quit)
return;
}
}
}
DB_SHOW_ALIAS(alllocks, db_witness_list_all)
DB_SHOW_COMMAND(witness, db_witness_display)
{
witness_ddb_display(db_printf);
}
#endif
static void
sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
{
struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
struct witness *tmp_w1, *tmp_w2, *w1, *w2;
- u_int w_rmatrix1, w_rmatrix2;
int generation, i, j;
tmp_data1 = NULL;
tmp_data2 = NULL;
tmp_w1 = NULL;
tmp_w2 = NULL;
/* Allocate and init temporary storage space. */
tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
stack_zero(&tmp_data1->wlod_stack);
stack_zero(&tmp_data2->wlod_stack);
restart:
mtx_lock_spin(&w_mtx);
generation = w_generation;
mtx_unlock_spin(&w_mtx);
sbuf_printf(sb, "Number of known direct relationships is %d\n",
w_lohash.wloh_count);
for (i = 1; i < w_max_used_index; i++) {
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/* The graph has changed, try again. */
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
w1 = &w_data[i];
if (w1->w_reversed == 0) {
mtx_unlock_spin(&w_mtx);
continue;
}
/* Copy w1 locally so we can release the spin lock. */
*tmp_w1 = *w1;
mtx_unlock_spin(&w_mtx);
if (tmp_w1->w_reversed == 0)
continue;
for (j = 1; j < w_max_used_index; j++) {
if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
continue;
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/* The graph has changed, try again. */
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
w2 = &w_data[j];
data1 = witness_lock_order_get(w1, w2);
data2 = witness_lock_order_get(w2, w1);
/*
* Copy information locally so we can release the
* spin lock.
*/
*tmp_w2 = *w2;
- w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
- w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
if (data1) {
stack_zero(&tmp_data1->wlod_stack);
stack_copy(&data1->wlod_stack,
&tmp_data1->wlod_stack);
}
if (data2 && data2 != data1) {
stack_zero(&tmp_data2->wlod_stack);
stack_copy(&data2->wlod_stack,
&tmp_data2->wlod_stack);
}
mtx_unlock_spin(&w_mtx);
sbuf_printf(sb,
"\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
if (data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data1->wlod_stack);
sbuf_printf(sb, "\n");
}
if (data2 && data2 != data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
tmp_w2->w_name, tmp_w2->w_class->lc_name,
tmp_w1->w_name, tmp_w1->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data2->wlod_stack);
sbuf_printf(sb, "\n");
}
}
}
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/*
* The graph changed while we were printing stack data,
* try again.
*/
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
mtx_unlock_spin(&w_mtx);
/* Free temporary storage space. */
free(tmp_data1, M_TEMP);
free(tmp_data2, M_TEMP);
free(tmp_w1, M_TEMP);
free(tmp_w2, M_TEMP);
}
static int
sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
int error;
if (witness_watch < 1) {
error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
return (error);
}
if (witness_cold) {
error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
return (error);
}
error = 0;
sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
if (sb == NULL)
return (ENOMEM);
sbuf_print_witness_badstacks(sb, &req->oldidx);
sbuf_finish(sb);
error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
sbuf_delete(sb);
return (error);
}
#ifdef DDB
static int
sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
{
return (db_printf("%.*s", len, data));
}
DB_SHOW_COMMAND(badstacks, db_witness_badstacks)
{
struct sbuf sb;
char buffer[128];
size_t dummy;
sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL);
sbuf_print_witness_badstacks(&sb, &dummy);
sbuf_finish(&sb);
}
#endif
static int
sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
{
static const struct {
enum witness_channel channel;
const char *name;
} channels[] = {
{ WITNESS_CONSOLE, "console" },
{ WITNESS_LOG, "log" },
{ WITNESS_NONE, "none" },
};
char buf[16];
u_int i;
int error;
buf[0] = '\0';
for (i = 0; i < nitems(channels); i++)
if (witness_channel == channels[i].channel) {
snprintf(buf, sizeof(buf), "%s", channels[i].name);
break;
}
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
error = EINVAL;
for (i = 0; i < nitems(channels); i++)
if (strcmp(channels[i].name, buf) == 0) {
witness_channel = channels[i].channel;
error = 0;
break;
}
return (error);
}
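/*
 * Usage note (annotation, not part of the change): this handler backs the
 * witness output-channel sysctl, presumably debug.witness.channel, so the
 * destination can be switched at runtime with something like
 *
 *	sysctl debug.witness.channel=log
 *
 * accepting exactly the strings listed above: "console", "log" or "none".
 */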
static int
sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
{
struct witness *w;
struct sbuf *sb;
int error;
if (witness_watch < 1) {
error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
return (error);
}
if (witness_cold) {
error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
return (error);
}
error = 0;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
if (sb == NULL)
return (ENOMEM);
sbuf_printf(sb, "\n");
mtx_lock_spin(&w_mtx);
STAILQ_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
STAILQ_FOREACH(w, &w_all, w_list)
witness_add_fullgraph(sb, w);
mtx_unlock_spin(&w_mtx);
/*
* Close the sbuf and return to userland.
*/
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
static int
sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = witness_watch;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (value > 1 || value < -1 ||
(witness_watch == -1 && value != witness_watch))
return (EINVAL);
witness_watch = value;
return (0);
}
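/*
 * Annotation (not in the diff): the handler above only accepts values in
 * the range [-1, 1] and rejects any change once witness_watch has been set
 * to -1, so disabling WITNESS completely (e.g. via something like
 * "sysctl debug.witness.watch=-1", assuming that is the MIB name) is a
 * one-way operation for the lifetime of the running kernel.
 */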
static void
witness_add_fullgraph(struct sbuf *sb, struct witness *w)
{
int i;
if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
return;
w->w_displayed = 1;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
w_data[i].w_name);
witness_add_fullgraph(sb, &w_data[i]);
}
}
}
/*
* A simple hash function. Takes a key pointer and a key size. If size == 0,
* interprets the key as a string and reads until the null
* terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
* hash value computed from the key.
*/
static uint32_t
witness_hash_djb2(const uint8_t *key, uint32_t size)
{
unsigned int hash = 5381;
int i;
/* hash = hash * 33 + key[i] */
if (size)
for (i = 0; i < size; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
else
for (i = 0; key[i] != 0; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
return (hash);
}
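/*
 * Worked example (annotation): hashing the two-byte string "ab" starts
 * from 5381, then 5381 * 33 + 'a' (97) = 177670, then
 * 177670 * 33 + 'b' (98) = 5863208; callers such as witness_hash_get()
 * reduce that value modulo the table size to pick a bucket.
 */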
/*
* Initializes the two witness hash tables. Called exactly once from
* witness_initialize().
*/
static void
witness_init_hash_tables(void)
{
int i;
MPASS(witness_cold);
/* Initialize the hash tables. */
for (i = 0; i < WITNESS_HASH_SIZE; i++)
w_hash.wh_array[i] = NULL;
w_hash.wh_size = WITNESS_HASH_SIZE;
w_hash.wh_count = 0;
/* Initialize the lock order data hash. */
w_lofree = NULL;
for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
w_lodata[i].wlod_next = w_lofree;
w_lofree = &w_lodata[i];
}
w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
w_lohash.wloh_count = 0;
for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
w_lohash.wloh_array[i] = NULL;
}
static struct witness *
witness_hash_get(const char *key)
{
struct witness *w;
uint32_t hash;
MPASS(key != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
w = w_hash.wh_array[hash];
while (w != NULL) {
if (strcmp(w->w_name, key) == 0)
goto out;
w = w->w_hash_next;
}
out:
return (w);
}
static void
witness_hash_put(struct witness *w)
{
uint32_t hash;
MPASS(w != NULL);
MPASS(w->w_name != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
KASSERT(witness_hash_get(w->w_name) == NULL,
("%s: trying to add a hash entry that already exists!", __func__));
KASSERT(w->w_hash_next == NULL,
("%s: w->w_hash_next != NULL", __func__));
hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
w->w_hash_next = w_hash.wh_array[hash];
w_hash.wh_array[hash] = w;
w_hash.wh_count++;
}
static struct witness_lock_order_data *
witness_lock_order_get(struct witness *parent, struct witness *child)
{
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
MPASS(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
WITNESS_INDEX_ASSERT(key.from);
WITNESS_INDEX_ASSERT(key.to);
if ((w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN) == 0)
goto out;
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
data = w_lohash.wloh_array[hash];
while (data != NULL) {
if (witness_lock_order_key_equal(&data->wlod_key, &key))
break;
data = data->wlod_next;
}
out:
return (data);
}
/*
* Verify that parent and child have a known relationship, are not the same,
* and child is actually a child of parent. This is done without w_mtx
* to avoid contention in the common case.
*/
static int
witness_lock_order_check(struct witness *parent, struct witness *child)
{
if (parent != child &&
w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN &&
isitmychild(parent, child))
return (1);
return (0);
}
static int
witness_lock_order_add(struct witness *parent, struct witness *child)
{
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
MPASS(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
WITNESS_INDEX_ASSERT(key.from);
WITNESS_INDEX_ASSERT(key.to);
if (w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN)
return (1);
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
data = w_lofree;
if (data == NULL)
return (0);
w_lofree = data->wlod_next;
data->wlod_next = w_lohash.wloh_array[hash];
data->wlod_key = key;
w_lohash.wloh_array[hash] = data;
w_lohash.wloh_count++;
stack_zero(&data->wlod_stack);
stack_save(&data->wlod_stack);
return (1);
}
/* Call this whenever the structure of the witness graph changes. */
static void
witness_increment_graph_generation(void)
{
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
w_generation++;
}
static int
witness_output_drain(void *arg __unused, const char *data, int len)
{
witness_output("%.*s", len, data);
return (len);
}
static void
witness_debugger(int cond, const char *msg)
{
char buf[32];
struct sbuf sb;
struct stack st;
if (!cond)
return;
if (witness_trace) {
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, witness_output_drain, NULL);
stack_zero(&st);
stack_save(&st);
witness_output("stack backtrace:\n");
stack_sbuf_print_ddb(&sb, &st);
sbuf_finish(&sb);
}
#ifdef KDB
if (witness_kdb)
kdb_enter(KDB_WHY_WITNESS, msg);
#endif
}
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c (revision 327172)
+++ head/sys/kern/vfs_aio.c (revision 327173)
@@ -1,3005 +1,3001 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <geom/geom.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
* overflow. (XXX will be removed soon.)
*/
static u_long jobrefid;
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
#endif
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC 256
#endif
#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
#endif
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO 16
#endif
FEATURE(aio, "Asynchronous I/O");
SYSCTL_DECL(_p1003_1b);
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
"Async IO management");
static int enable_aio_unsafe = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
"Permit asynchronous IO on all file types, not just known-safe types");
static unsigned int unsafe_warningcnt = 1;
SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
&unsafe_warningcnt, 0,
"Warnings that will be triggered upon failed IO requests on unsafe files");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel processes to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel processes for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0,
"Preferred number of ready kernel processes for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O processes in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0,
"Maximum active aio requests per process (stored in the process)");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process (stored in the process)");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
/*
* Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
* sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
* vfs.aio.aio_listio_max.
*/
SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
0, "Maximum aio requests for a single lio_listio call");
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
#endif
/*
* Below is a key of the locks used to protect each member of struct kaiocb,
* aioliojob, and kaioinfo, as well as any backends.
*
* * - need not be protected
* a - locked by the kaioinfo lock
* b - locked by the backend lock; the backend lock can be null in some
* cases (for example for BIO, where the proc lock is reused instead)
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* If the routine that services an AIO request blocks while running in an
* AIO kernel process it can starve other I/O requests. BIO requests
* queued via aio_qphysio() complete in GEOM and do not use AIO kernel
* processes at all. Socket I/O requests use a separate pool of
* kprocs and also force non-blocking I/O. Other file I/O requests
* use the generic fo_read/fo_write operations which can block. The
* fsync and mlock operations can also block while executing. Ideally
* none of these requests would block while executing.
*
* Note that the service routines cannot toggle O_NONBLOCK in the file
* structure directly while handling a request due to races with
* userland threads.
*/
/* jobflags */
#define KAIOCB_QUEUEING 0x01
#define KAIOCB_CANCELLED 0x02
#define KAIOCB_CANCELLING 0x04
#define KAIOCB_CHECKSYNC 0x08
#define KAIOCB_CLEARED 0x10
#define KAIOCB_FINISHED 0x20
/*
* AIO process info
*/
#define AIOP_FREE 0x1 /* proc on free queue */
struct aioproc {
int aioprocflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aioproc) list; /* (c) list of processes */
struct proc *aioproc; /* (*) the AIO proc */
};
/*
* data-structure for lio signal management
*/
struct aioliojob {
int lioj_flags; /* (a) listio flags */
int lioj_count; /* (a) count of listio jobs */
int lioj_finished_count; /* (a) count of finished listio jobs */
struct sigevent lioj_signal; /* (a) signal on all I/O done */
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
struct knlist klist; /* (a) list of knotes */
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_maxactive_count; /* (*) maximum number of AIOs */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_qallowed_count; /* (*) maximum size of AIO queue */
int kaio_count; /* (a) size of AIO queue */
int kaio_ballowed_count; /* (*) maximum number of buffers */
int kaio_buffer_count; /* (a) number of physio buffers */
TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */
TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */
TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */
struct task kaio_task; /* (*) task to kick aio processes */
struct task kaio_sync_task; /* (*) task to schedule fsync jobs */
};
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*/
struct aiocb_ops {
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
long (*fetch_status)(struct aiocb *ujob);
long (*fetch_error)(struct aiocb *ujob);
int (*store_status)(struct aiocb *ujob, long status);
int (*store_error)(struct aiocb *ujob, long error);
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
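/*
 * Annotation (assumption, not part of the diff): later in this file the
 * native ABI supplies one of these tables, conceptually something like
 *
 *	static struct aiocb_ops aiocb_ops = {
 *		.copyin = aiocb_copyin,
 *		.fetch_status = aiocb_fetch_status,
 *		...
 *	};
 *
 * with COMPAT_FREEBSD6 and 32-bit ABIs providing their own variants; the
 * member initializers above are illustrative guesses, not text from this
 * revision.
 */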
static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct kaiocb *job);
static void aio_process_rw(struct kaiocb *job);
static void aio_process_sync(struct kaiocb *job);
static void aio_process_mlock(struct kaiocb *job);
static void aio_schedule_fsync(void *context, int pending);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *ujob,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int aio_queue_file(struct file *fp, struct kaiocb *job);
static void aio_physwakeup(struct bio *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct kaiocb *job);
static void aio_daemon(void *param);
static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
static bool aio_clear_cancel_function_locked(struct kaiocb *job);
static int aio_kick(struct proc *userp);
static void aio_kick_nowait(struct proc *userp);
static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
static int filt_lioattach(struct knote *kn);
static void filt_liodetach(struct knote *kn);
static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
* kaio Per process async io info
* aiop async io process data
* aiocb async io jobs
* aiolio list io jobs
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
/* kqueue filters for aio */
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
static eventhandler_tag exit_tag, exec_tag;
TASKQUEUE_DEFINE_THREAD(aiod_kick);
/*
* Main operations function for use as a kernel module.
*/
static int
aio_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
aio_onceonly();
break;
case MOD_SHUTDOWN:
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
NULL, EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_syncqueue);
TAILQ_INIT(&ki->kaio_syncready);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
aio_newproc(NULL);
}
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
struct thread *td;
int error;
error = sigev_findtd(p, sigev, &td);
if (error)
return (error);
if (!KSI_ONQ(ksi)) {
ksiginfo_set_sigev(ksi, sigev);
ksi->ksi_code = SI_ASYNCIO;
ksi->ksi_flags |= KSI_EXT | KSI_INS;
tdsendsignal(p, td, ksi->ksi_signo, ksi);
}
PROC_UNLOCK(p);
return (error);
}
/*
* Free a job entry. Wait for completion if it is currently active, but don't
* delay forever. If we delay, we return a flag that tells the caller to
* restart the queue scan.
*/
static int
aio_free_entry(struct kaiocb *job)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct proc *p;
p = job->userproc;
MPASS(curproc == p);
ki = p->p_aioinfo;
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(job->jobflags & KAIOCB_FINISHED);
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, job, plist);
TAILQ_REMOVE(&ki->kaio_all, job, allist);
lj = job->lio;
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
}
}
/* job is going away, we need to destroy any knotes */
knlist_delete(&job->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&job->ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* a kaiocb from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
if (job->fd_file)
fdrop(job->fd_file, curthread);
crfree(job->cred);
uma_zfree(aiocb_zone, job);
AIO_LOCK(ki);
return (0);
}
static void
aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp __unused)
{
aio_proc_rundown(arg, p);
}
static int
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
{
aio_cancel_fn_t *func;
int cancelled;
AIO_LOCK_ASSERT(ki, MA_OWNED);
if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
return (0);
MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
job->jobflags |= KAIOCB_CANCELLED;
func = job->cancel_fn;
/*
* If there is no cancel routine, just leave the job marked as
* cancelled. The job should be in active use by a caller, which will
* complete it either normally or upon noticing the cancellation when it
* fails to install a cancel routine.
*/
if (func == NULL)
return (0);
/*
* Set the CANCELLING flag so that aio_complete() will defer
* completions of this job. This prevents the job from being
* freed out from under the cancel callback. After the
* callback any deferred completion (whether from the callback
* or any other source) will be completed.
*/
job->jobflags |= KAIOCB_CANCELLING;
AIO_UNLOCK(ki);
func(job);
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_CANCELLING;
if (job->jobflags & KAIOCB_FINISHED) {
cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(p, job);
} else {
/*
* The cancel callback might have scheduled an
* operation to cancel this request, but it is
* only counted as cancelled if the request is
* cancelled when the callback returns.
*/
cancelled = 0;
}
return (cancelled);
}
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct kaiocb *job, *jobn;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
restart:
/*
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
aio_cancel_job(p, ki, job);
}
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
goto restart;
}
/* Free all completed I/O requests. */
while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(job);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
}
AIO_UNLOCK(ki);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct kaiocb *
aio_selectjob(struct aioproc *aiop)
{
struct kaiocb *job;
struct kaioinfo *ki;
struct proc *userp;
mtx_assert(&aio_job_mtx, MA_OWNED);
restart:
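	/*
	 * Pick the first queued job whose owner is still below its
	 * per-process limit on concurrently active jobs
	 * (kaio_maxactive_count, seeded from max_aio_per_proc).
	 */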
TAILQ_FOREACH(job, &aio_jobs, list) {
userp = job->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, job, list);
if (!aio_clear_cancel_function(job))
goto restart;
/* Account for currently active jobs. */
ki->kaio_active_count++;
break;
}
}
return (job);
}
/*
* Move all data to a permanent storage device. This code
* simulates the fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
return (error);
}
/*
* The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
* does the I/O request for the non-physio version of the operations. The
* normal vn operations are used, and this code should work in all instances
* for every type of file, including pipes, sockets, fifos, and regular files.
*
* XXX I don't think this works well for sockets, pipes, and fifos.
*/
static void
aio_process_rw(struct kaiocb *job)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
struct uio auio;
struct iovec aiov;
ssize_t cnt;
long msgsnd_st, msgsnd_end;
long msgrcv_st, msgrcv_end;
long oublock_st, oublock_end;
long inblock_st, inblock_end;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
job->uaiocb.aio_lio_opcode == LIO_WRITE,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = job->cred;
cb = &job->uaiocb;
fp = job->fd_file;
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
msgrcv_st = td->td_ru.ru_msgrcv;
msgsnd_st = td->td_ru.ru_msgsnd;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
msgrcv_end = td->td_ru.ru_msgrcv;
msgsnd_end = td->td_ru.ru_msgsnd;
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
job->msgrcv = msgrcv_end - msgrcv_st;
job->msgsnd = msgsnd_end - msgsnd_st;
job->inblock = inblock_end - inblock_st;
job->outblock = oublock_end - oublock_st;
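	/*
	 * These per-job resource-usage deltas are credited back to the
	 * thread that eventually collects the result via aio_return() or
	 * aio_waitcomplete().
	 */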
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
PROC_LOCK(job->userproc);
kern_psignal(job->userproc, SIGPIPE);
PROC_UNLOCK(job->userproc);
}
}
cnt -= auio.uio_resid;
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, cnt, 0);
}
static void
aio_process_sync(struct kaiocb *job)
{
struct thread *td = curthread;
struct ucred *td_savedcred = td->td_ucred;
struct file *fp = job->fd_file;
int error = 0;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
td->td_ucred = job->cred;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, 0, 0);
}
static void
aio_process_mlock(struct kaiocb *job)
{
struct aiocb *cb = &job->uaiocb;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
error = kern_mlock(job->userproc, job->cred,
__DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
aio_complete(job, error != 0 ? -1 : 0, error);
}
static void
aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
{
struct aioliojob *lj;
struct kaioinfo *ki;
struct kaiocb *sjob, *sjobn;
int lj_done;
bool schedule_fsync;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = job->lio;
lj_done = 0;
if (lj) {
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
MPASS(job->jobflags & KAIOCB_FINISHED);
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
KNOTE_LOCKED(&job->klist, 1);
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
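	/*
	 * If this job was holding back aio_fsync() requests queued after
	 * it on the same file, drop their pending counts and schedule any
	 * fsync whose prerequisite I/O has now completed.
	 */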
if (job->jobflags & KAIOCB_CHECKSYNC) {
schedule_fsync = false;
TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
if (job->fd_file != sjob->fd_file ||
job->seqno >= sjob->seqno)
continue;
if (--sjob->pending > 0)
continue;
TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
if (!aio_clear_cancel_function_locked(sjob))
continue;
TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
schedule_fsync = true;
}
if (schedule_fsync)
taskqueue_enqueue(taskqueue_aiod_kick,
&ki->kaio_sync_task);
}
if (ki->kaio_flags & KAIO_WAKEUP) {
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
static void
aio_schedule_fsync(void *context, int pending)
{
struct kaioinfo *ki;
struct kaiocb *job;
ki = context;
AIO_LOCK(ki);
while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
job = TAILQ_FIRST(&ki->kaio_syncready);
TAILQ_REMOVE(&ki->kaio_syncready, job, list);
AIO_UNLOCK(ki);
aio_schedule(job, aio_process_sync);
AIO_LOCK(ki);
}
AIO_UNLOCK(ki);
}
bool
aio_cancel_cleared(struct kaiocb *job)
{
- struct kaioinfo *ki;
/*
* The caller should hold the same queue lock that was held when
* aio_clear_cancel_function() was called and set this flag,
* ensuring that this check sees an up-to-date value. However,
* there is no way to assert that.
*/
- ki = job->userproc->p_aioinfo;
return ((job->jobflags & KAIOCB_CLEARED) != 0);
}
static bool
aio_clear_cancel_function_locked(struct kaiocb *job)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
MPASS(job->cancel_fn != NULL);
if (job->jobflags & KAIOCB_CANCELLING) {
job->jobflags |= KAIOCB_CLEARED;
return (false);
}
job->cancel_fn = NULL;
return (true);
}
bool
aio_clear_cancel_function(struct kaiocb *job)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_clear_cancel_function_locked(job);
AIO_UNLOCK(ki);
return (ret);
}
static bool
aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
if (job->jobflags & KAIOCB_CANCELLED)
return (false);
job->cancel_fn = func;
return (true);
}
bool
aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_set_cancel_function_locked(job, func);
AIO_UNLOCK(ki);
return (ret);
}
void
aio_complete(struct kaiocb *job, long status, int error)
{
struct kaioinfo *ki;
struct proc *userp;
job->uaiocb._aiocb_private.error = error;
job->uaiocb._aiocb_private.status = status;
userp = job->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
KASSERT(!(job->jobflags & KAIOCB_FINISHED),
("duplicate aio_complete"));
job->jobflags |= KAIOCB_FINISHED;
if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(userp, job);
}
AIO_UNLOCK(ki);
}
void
aio_cancel(struct kaiocb *job)
{
aio_complete(job, -1, ECANCELED);
}
void
aio_switch_vmspace(struct kaiocb *job)
{
vmspace_switch_aio(job->userproc->p_vmspace);
}
/*
* The AIO daemon: most of the actual work is done in aio_process_*(),
* but the setup (and address space management) is done in this routine.
*/
static void
aio_daemon(void *_id)
{
struct kaiocb *job;
struct aioproc *aiop;
struct kaioinfo *ki;
struct proc *p;
struct vmspace *myvm;
struct thread *td = curthread;
int id = (intptr_t)_id;
/*
* Grab an extra reference on the daemon's vmspace so that it
* doesn't get freed by jobs that switch to a different
* vmspace.
*/
p = td->td_proc;
myvm = vmspace_acquire_ref(p);
KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aioproc = p;
aiop->aioprocflags = 0;
/*
* Wake up the parent process. (The parent sleeps to keep from blasting
* away and creating too many daemons.)
*/
sema_post(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* Take daemon off of free queue
*/
if (aiop->aioprocflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((job = aio_selectjob(aiop)) != NULL) {
mtx_unlock(&aio_job_mtx);
ki = job->userproc->p_aioinfo;
job->handle_fn(job);
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
}
/*
* Disconnect from user address space.
*/
if (p->p_vmspace != myvm) {
mtx_unlock(&aio_job_mtx);
vmspace_switch_aio(myvm);
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid a race; we only sleep
* if no job can be selected.
*/
continue;
}
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aioprocflags |= AIOP_FREE;
/*
* If the daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
(aiop->aioprocflags & AIOP_FREE) &&
num_aio_procs > target_aio_procs)
break;
}
TAILQ_REMOVE(&aio_freeproc, aiop, list);
num_aio_procs--;
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
vmspace_free(myvm);
KASSERT(p->p_vmspace == myvm,
("AIOD: bad vmspace for exiting daemon"));
KASSERT(myvm->vm_refcnt > 1,
("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
kproc_exit(0);
}
/*
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
aio_newproc(int *start)
{
int error;
struct proc *p;
int id;
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
(*start)--;
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead physio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct kaiocb *job)
{
struct aiocb *cb;
struct file *fp;
struct bio *bp;
struct buf *pbuf;
struct vnode *vp;
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
int error, ref, poff;
vm_prot_t prot;
cb = &job->uaiocb;
fp = job->fd_file;
if (fp == NULL || fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
if (vp->v_type != VCHR)
return (-1);
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
ref = 0;
csw = devvn_refthread(vp, &dev, &ref);
if (csw == NULL)
return (ENXIO);
if ((csw->d_flags & D_DISK) == 0) {
error = -1;
goto unref;
}
if (cb->aio_nbytes > dev->si_iosize_max) {
error = -1;
goto unref;
}
ki = p->p_aioinfo;
poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
if (cb->aio_nbytes > MAXPHYS) {
error = -1;
goto unref;
}
pbuf = NULL;
} else {
if (cb->aio_nbytes > MAXPHYS - poff) {
error = -1;
goto unref;
}
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
error = -1;
goto unref;
}
job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(pbuf);
AIO_LOCK(ki);
ki->kaio_buffer_count++;
AIO_UNLOCK(ki);
}
job->bp = bp = g_alloc_bio();
bp->bio_length = cb->aio_nbytes;
bp->bio_bcount = cb->aio_nbytes;
bp->bio_done = aio_physwakeup;
bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
bp->bio_offset = cb->aio_offset;
bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
bp->bio_dev = dev;
bp->bio_caller1 = (void *)job;
prot = VM_PROT_READ;
if (cb->aio_lio_opcode == LIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
nitems(job->pages));
if (job->npages < 0) {
error = EFAULT;
goto doerror;
}
if (pbuf != NULL) {
pmap_qenter((vm_offset_t)pbuf->b_data,
job->pages, job->npages);
bp->bio_data = pbuf->b_data + poff;
atomic_add_int(&num_buf_aio, 1);
} else {
bp->bio_ma = job->pages;
bp->bio_ma_n = job->npages;
bp->bio_ma_offset = poff;
bp->bio_data = unmapped_buf;
bp->bio_flags |= BIO_UNMAPPED;
}
/* Perform transfer. */
csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
if (pbuf != NULL) {
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
relpbuf(pbuf, NULL);
job->pbuf = NULL;
}
g_destroy_bio(bp);
job->bp = NULL;
unref:
dev_relthread(dev, ref);
return (error);
}
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb *ojob;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, kjob, sizeof(struct oaiocb));
if (error)
return (error);
ojob = (struct oaiocb *)kjob;
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}
#endif
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
return (copyin(ujob, kjob, sizeof(struct aiocb)));
}
static long
aiocb_fetch_status(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.status));
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
return (suword(&ujob->_aiocb_private.status, status));
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword(ujobp, (long)ujob));
}
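/*
 * Each aiocb_ops vector abstracts one user ABI (the native layout here;
 * the 32-bit compat and old FreeBSD 6 sigevent layouts follow below) so
 * that the generic AIO code can copy in control blocks and store status,
 * error, and kernelinfo values without knowing the userland structure
 * layout.
 */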
static struct aiocb_ops aiocb_ops = {
.copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb_ops_osigevent = {
.copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#endif
/*
* Queue a new AIO request. The choice between the threaded and the direct
* physio (VCHR) technique is made in this code.
*/
int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
cap_rights_t rights;
struct file *fp;
struct kaiocb *job;
struct kaioinfo *ki;
struct kevent kev;
int opcode;
int error;
int fd, kqfd;
int jid;
u_short evflags;
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
ops->store_status(ujob, -1);
ops->store_error(ujob, 0);
ops->store_kernelinfo(ujob, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= ki->kaio_qallowed_count) {
ops->store_error(ujob, EAGAIN);
return (EAGAIN);
}
job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
knlist_init_mtx(&job->klist, AIO_MTX(ki));
error = ops->copyin(ujob, &job->uaiocb);
if (error) {
ops->store_error(ujob, error);
uma_zfree(aiocb_zone, job);
return (error);
}
if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(ujob, EINVAL);
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
ksiginfo_init(&job->ksi);
/* Save userspace address of the job info. */
job->ujob = ujob;
/* Get the opcode. */
if (type != LIO_NOP)
job->uaiocb.aio_lio_opcode = type;
opcode = job->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capability
* should be.
*/
fd = job->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
error = fget_write(td, fd,
cap_rights_init(&rights, CAP_PWRITE), &fp);
break;
case LIO_READ:
error = fget_read(td, fd,
cap_rights_init(&rights, CAP_PREAD), &fp);
break;
case LIO_SYNC:
error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
break;
case LIO_MLOCK:
fp = NULL;
break;
case LIO_NOP:
error = fget(td, fd, cap_rights_init(&rights), &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
job->uaiocb.aio_offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
error = EINVAL;
goto aqueue_fail;
}
job->fd_file = fp;
mtx_lock(&aio_job_mtx);
jid = jobrefid++;
job->seqno = jobseqno++;
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(ujob, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
return (0);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
error = EINVAL;
goto aqueue_fail;
}
kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)job->ujob;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
kev.data = (intptr_t)job;
kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, 1);
if (error)
goto aqueue_fail;
no_kqueue:
ops->store_error(ujob, EINPROGRESS);
job->uaiocb._aiocb_private.error = EINPROGRESS;
job->userproc = p;
job->cred = crhold(td->td_ucred);
job->jobflags = KAIOCB_QUEUEING;
job->lio = lj;
if (opcode == LIO_MLOCK) {
aio_schedule(job, aio_process_mlock);
error = 0;
} else if (fp->f_ops->fo_aio_queue == NULL)
error = aio_queue_file(fp, job);
else
error = fo_aio_queue(fp, job);
if (error)
goto aqueue_fail;
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_QUEUEING;
TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
ki->kaio_count++;
if (lj)
lj->lioj_count++;
atomic_add_int(&num_queue_count, 1);
if (job->jobflags & KAIOCB_FINISHED) {
/*
* The queue callback completed the request synchronously.
* The bulk of the completion is deferred in that case
* until this point.
*/
aio_bio_done_notify(p, job);
} else
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
AIO_UNLOCK(ki);
return (0);
aqueue_fail:
knlist_delete(&job->klist, curthread, 0);
if (fp)
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
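/*
 * Illustrative userland sketch (not part of this file): a request whose
 * completion is delivered through kqueue, matching the SIGEV_KEVENT path
 * above. Names such as fd, kq, and buf are assumed to exist; kq comes
 * from kqueue(2).
 *
 *	struct aiocb cb;
 *	struct kevent ev;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *	    ev.filter == EVFILT_AIO)
 *		(void)aio_return((struct aiocb *)ev.ident);
 */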
static void
aio_cancel_daemon_job(struct kaiocb *job)
{
mtx_lock(&aio_job_mtx);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&aio_jobs, job, list);
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
}
void
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
{
mtx_lock(&aio_job_mtx);
if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
return;
}
job->handle_fn = func;
TAILQ_INSERT_TAIL(&aio_jobs, job, list);
aio_kick_nowait(job->userproc);
mtx_unlock(&aio_job_mtx);
}
static void
aio_cancel_sync(struct kaiocb *job)
{
struct kaioinfo *ki;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
aio_cancel(job);
}
int
aio_queue_file(struct file *fp, struct kaiocb *job)
{
- struct aioliojob *lj;
struct kaioinfo *ki;
struct kaiocb *job2;
struct vnode *vp;
struct mount *mp;
int error, opcode;
bool safe;
- lj = job->lio;
ki = job->userproc->p_aioinfo;
opcode = job->uaiocb.aio_lio_opcode;
if (opcode == LIO_SYNC)
goto queueit;
if ((error = aio_qphysio(job->userproc, job)) == 0)
goto done;
#if 0
/*
* XXX: This means qphysio() failed with EFAULT. The current
* behavior is to retry the operation via fo_read/fo_write.
* Wouldn't it be better to just complete the request with an
* error here?
*/
if (error > 0)
goto done;
#endif
queueit:
safe = false;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vp->v_type == VREG || vp->v_type == VDIR) {
mp = fp->f_vnode->v_mount;
if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
safe = true;
}
}
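	/*
	 * Requests on anything other than a local regular file or
	 * directory are refused unless the administrator has enabled
	 * unsafe AIO (the enable_aio_unsafe knob, typically the
	 * vfs.aio.enable_unsafe sysctl), since such requests may block an
	 * AIO daemon for an unbounded time.
	 */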
if (!(safe || enable_aio_unsafe)) {
counted_warning(&unsafe_warningcnt,
"is attempting to use unsafe AIO requests");
return (EOPNOTSUPP);
}
if (opcode == LIO_SYNC) {
AIO_LOCK(ki);
TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
if (job2->fd_file == job->fd_file &&
job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
job2->seqno < job->seqno) {
job2->jobflags |= KAIOCB_CHECKSYNC;
job->pending++;
}
}
if (job->pending != 0) {
if (!aio_set_cancel_function_locked(job,
aio_cancel_sync)) {
AIO_UNLOCK(ki);
aio_cancel(job);
return (0);
}
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
return (0);
}
AIO_UNLOCK(ki);
}
switch (opcode) {
case LIO_READ:
case LIO_WRITE:
aio_schedule(job, aio_process_rw);
error = 0;
break;
case LIO_SYNC:
aio_schedule(job, aio_process_sync);
error = 0;
break;
default:
error = EINVAL;
}
done:
return (error);
}
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start <
ki->kaio_maxactive_count) {
taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
}
}
static int
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
int error, ret = 0;
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start <
ki->kaio_maxactive_count) {
num_aio_resv_start++;
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
}
} else {
ret = -1;
}
return (ret);
}
static void
aio_kick_helper(void *context, int pending)
{
struct proc *userp = context;
mtx_lock(&aio_job_mtx);
while (--pending >= 0) {
if (aio_kick(userp))
break;
}
mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call; as a side effect, kernel resources
* are released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
long status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_done, plist) {
if (job->ujob == ujob)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else {
error = EINVAL;
AIO_UNLOCK(ki);
}
return (error);
}
int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
/*
* Allow a process to wake up when any of its I/O requests have completed.
*/
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *firstjob, *job;
int error, i, timo;
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
firstjob = NULL;
error = 0;
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
if (job->ujob == ujoblist[i]) {
if (firstjob == NULL)
firstjob = job;
if (job->jobflags & KAIOCB_FINISHED)
goto RETURN;
}
}
}
/* All tasks were finished. */
if (firstjob == NULL)
break;
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
free(ujoblist, M_AIOS);
return (error);
}
/*
* aio_cancel cancels any non-physio aio operations not currently in
* progress.
*/
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct kaiocb *job, *jobn;
struct file *fp;
cap_rights_t rights;
int error;
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
if (error)
return (error);
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vn_isdisk(vp, &error)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
}
AIO_LOCK(ki);
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
if ((uap->fd == job->uaiocb.aio_fildes) &&
((uap->aiocbp == NULL) ||
(uap->aiocbp == job->ujob))) {
if (aio_cancel_job(p, ki, job)) {
cancelled++;
} else {
notcancelled++;
}
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
done:
fdrop(fp, td);
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
/*
* aio_error is implemented at the kernel level for compatibility purposes
* only. For a user-mode async implementation, it would be best done in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
int status;
ki = p->p_aioinfo;
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
}
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
if (job->ujob == ujob) {
if (job->jobflags & KAIOCB_FINISHED)
td->td_retval[0] =
job->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
}
}
AIO_UNLOCK(ki);
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(ujob);
if (status == -1) {
td->td_retval[0] = ops->fetch_error(ujob);
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
int
sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
/* syscall - asynchronous read from a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
/* syscall - asynchronous write to a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
int
sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
}
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *job;
struct kaioinfo *ki;
struct aioliojob *lj;
struct kevent kev;
int error;
int nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
lj = uma_zalloc(aiolio_zone, M_WAITOK);
lj->lioj_flags = 0;
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
/*
* Setup signal.
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
* Hold an extra count on the lio so that it cannot be freed by other
* threads doing aio_waitcomplete() or aio_return(), and so that no
* event is sent until we have queued all tasks.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
/*
* Get pointers to the list of I/O requests.
*/
nerror = 0;
for (i = 0; i < nent; i++) {
job = acb_list[i];
if (job != NULL) {
error = aio_aqueue(td, job, lj, LIO_NOP, ops);
if (error != 0)
nerror++;
}
}
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
if (nerror)
return (EIO);
return (error);
}
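/*
 * Illustrative userland sketch (not part of this file): batching two
 * requests and waiting for both with LIO_WAIT. rcb and wcb are assumed
 * to be aiocbs already filled in as for aio_read()/aio_write().
 *
 *	struct aiocb *list[2] = { &rcb, &wcb };
 *
 *	rcb.aio_lio_opcode = LIO_READ;
 *	wcb.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */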
/* syscall - list directed I/O (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
/* syscall - list directed I/O (REALTIME) */
int
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
static void
aio_physwakeup(struct bio *bp)
{
struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
struct proc *userp;
struct kaioinfo *ki;
size_t nbytes;
int error, nblks;
/* Release mapping into kernel space. */
userp = job->userproc;
ki = userp->p_aioinfo;
if (job->pbuf) {
pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
relpbuf(job->pbuf, NULL);
job->pbuf = NULL;
atomic_subtract_int(&num_buf_aio, 1);
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
}
vm_page_unhold_pages(job->pages, job->npages);
bp = job->bp;
job->bp = NULL;
nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
error = 0;
if (bp->bio_flags & BIO_ERROR)
error = bp->bio_error;
nblks = btodb(nbytes);
if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
job->outblock += nblks;
else
job->inblock += nblks;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, nbytes, 0);
g_destroy_bio(bp);
}
/* syscall - wait for the next completion of an aio request */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *job;
struct aiocb *ujob;
long error, status;
int timo;
ops->store_aiocb(ujobp, NULL);
if (ts == NULL) {
timo = 0;
} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
timo = -1;
} else {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
error = 0;
job = NULL;
AIO_LOCK(ki);
while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
if (timo == -1) {
error = EWOULDBLOCK;
break;
}
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiowc", timo);
if (timo && error == ERESTART)
error = EINTR;
if (error)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
ujob = job->ujob;
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_aiocb(ujobp, ujob);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
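/*
 * Illustrative userland sketch (not part of this file): collecting
 * whichever request finishes next with the FreeBSD-specific
 * aio_waitcomplete(2); a NULL timeout means wait indefinitely.
 * handle_completion() is a placeholder for application code.
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);
 *	if (n == -1)
 *		err(1, "aio_waitcomplete");
 *	else
 *		handle_completion(done, n);
 */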
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
struct aiocb_ops *ops)
{
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
}
int
sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
}
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
struct kaiocb *job;
job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
/*
* The job pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_aio = job;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&job->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_aio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct kaiocb *job = kn->kn_ptr.p_aio;
kn->kn_data = job->uaiocb._aiocb_private.error;
if (!(job->jobflags & KAIOCB_FINISHED))
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
struct aioliojob *lj;
lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
/*
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_liodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_lio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
#endif
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
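/*
 * The 64-bit aio_offset members above are marked __packed so that these
 * compat structures keep the 4-byte alignment that the 32-bit ABIs use
 * for 64-bit values, rather than the host's natural 8-byte alignment.
 */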
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
CP(*osig, *nsig, sigev_notify);
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
#endif
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword32(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb32_ops = {
.copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb32_ops_osigevent = {
.copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#endif
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct aiocb **ujoblist;
uint32_t *ujoblist32;
int error, i;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
ujoblist32 = (uint32_t *)ujoblist;
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
sizeof(ujoblist32[0]));
if (error == 0) {
for (i = uap->nent - 1; i >= 0; i--)
ujoblist[i] = PTRIN(ujoblist32[i]);
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
}
free(ujoblist, M_AIOS);
return (error);
}
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_read(struct thread *td,
struct freebsd6_freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_write(struct thread *td,
struct freebsd6_freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops));
}
int
freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
&aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_lio_listio(struct thread *td,
struct freebsd6_freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c (revision 327172)
+++ head/sys/kern/vfs_subr.c (revision 327173)
@@ -1,5566 +1,5565 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
*/
/*
* External virtual filesystem routines
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void delmntque(struct vnode *vp);
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
int slpflag, int slptimeo);
static void syncer_shutdown(void *arg, int howto);
static int vtryrecycle(struct vnode *vp);
static void v_init_counters(struct vnode *);
static void v_incr_usecount(struct vnode *);
static void v_incr_usecount_locked(struct vnode *);
static void v_incr_devcount(struct vnode *);
static void v_decr_devcount(struct vnode *);
static void vgonel(struct vnode *);
static void vfs_knllock(void *arg);
static void vfs_knlunlock(void *arg);
static void vfs_knl_assert_locked(void *arg);
static void vfs_knl_assert_unlocked(void *arg);
static void vnlru_return_batches(struct vfsops *mnt_op);
static void destroy_vpollinfo(struct vpollinfo *vi);
/*
* Number of vnodes in existence. Increased whenever getnewvnode()
* allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
*/
static unsigned long numvnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
"Number of vnodes in existence");
static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
"Number of vnodes created by getnewvnode");
static u_long mnt_free_list_batch = 128;
SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
&mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
/*
* Conversion tables for conversion from vnode types to inode formats
* and back.
*/
enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[10] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
/*
* List of vnodes that are ready for recycling.
*/
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
/*
* "Free" vnode target. Free vnodes are rarely completely free, but are
* just ones that are cheap to recycle. Usually they are for files which
* have been stat'd but not read; these usually have inode and namecache
* data attached to them. This target is the preferred minimum size of a
* sub-cache consisting mostly of such files. The system balances the size
* of this sub-cache with its complement to try to prevent either from
* thrashing while the other is relatively inactive. The targets express
* a preference for the best balance.
*
* "Above" this target there are 2 further targets (watermarks) related
* to recycling of free vnodes. In the best-operating case, the cache is
* exactly full, the free list has size between vlowat and vhiwat above the
* free target, and recycling from it and normal use maintains this state.
* Sometimes the free list is below vlowat or even empty, but this state
* is even better for immediate use provided the cache is not full.
* Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
* ones) to reach one of these states. The watermarks are currently hard-
* coded as 4% and 9% of the available space higher. These and the default
* of 25% for wantfreevnodes are too large if the memory size is large.
* E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
* whenever vnlru_proc() becomes active.
*/
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
&freevnodes, 0, "Number of \"free\" vnodes");
static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
"Number of vnodes recycled to meet vnode cache targets");
/*
* Various variables used for debugging the new implementation of
* reassignbuf().
* XXX these are probably of (very) limited utility now.
*/
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
"Number of calls to reassignbuf");
static counter_u64_t free_owe_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
"Number of times free vnodes kept on active list due to VFS "
"owing inactivation");
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;
/*
* Lock for any access to the following:
* vnode_free_list
* numvnodes
* freevnodes
*/
static struct mtx vnode_free_list_mtx;
/* Publicly exported FS */
struct nfs_public nfs_pub;
static uma_zone_t buf_trie_zone;
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;
/*
* The workitem queue.
*
* It is useful to delay writes of file data and filesystem metadata
* for tens of seconds so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To realize this,
* we append vnodes to a "workitem" queue. When running with a soft
* updates implementation, most pending metadata dependencies should
* not wait for more than a few seconds. Thus, the block devices that
* filesystems are mounted on are delayed only about half the time that
* file data is delayed.
* Similarly, directory updates are more critical, so are only delayed
* about a third the time that file data is delayed. Thus, there are
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
* one each second (driven off the filesystem syncer process). The
* syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
*/
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
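/*
 * [Editor's sketch, not part of the original file] A minimal standalone
 * illustration of the slot arithmetic described in the comment above,
 * assuming a 32-entry table (so the mask is 31); the base and delay
 * values are hypothetical, not taken from a running system.
 */
#if 0
static int
syncer_slot_sketch(void)
{
int mask = 32 - 1; /* syncer_mask for a 32-slot ring */
int base = 30; /* hypothetical current syncer_delayno */
int delay = 15; /* request a 15-second delay */

/* (30 + 15) & 31 == 13: the request wraps around the ring. */
return ((base + delay) & mask);
}
#endif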
/*
* The sync_mtx protects:
* bo->bo_synclist
* sync_vnode_count
* syncer_delayno
* syncer_state
* syncer_workitem_pending
* syncer_worklist_len
* rushjob
*/
static struct mtx sync_mtx;
static struct cv sync_wakeup;
#define SYNCER_MAXDELAY 32
static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
static int syncdelay = 30; /* max time to delay syncing data */
static int filedelay = 30; /* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
"Time to delay syncing files (in seconds)");
static int dirdelay = 29; /* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
"Time to delay syncing directories (in seconds)");
static int metadelay = 28; /* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
"Time to delay syncing metadata (in seconds)");
static int rushjob; /* number of slots to run ASAP */
static int stat_rush_requests; /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
"Number of times I/O speeded up (rush requests)");
/*
* When shutting down the syncer, run it at four times normal speed.
*/
#define SYNCER_SHUTDOWN_SPEEDUP 4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
syncer_state;
/* Target for maximum number of vnodes. */
int desiredvnodes;
static int gapvnodes; /* gap between wanted and desired */
static int vhiwat; /* enough extras after expansion */
static int vlowat; /* minimal extras before expansion */
static int vstir; /* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
{
int error, old_desiredvnodes;
old_desiredvnodes = desiredvnodes;
if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
return (error);
if (old_desiredvnodes != desiredvnodes) {
wantfreevnodes = desiredvnodes / 4;
/* XXX locking seems to be incomplete. */
vfs_hash_changesize(desiredvnodes);
cache_changesize(desiredvnodes);
}
return (0);
}
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;
/*
* Support for the bufobj clean & dirty pctrie.
*/
static void *
buf_trie_alloc(struct pctrie *ptree)
{
return uma_zalloc(buf_trie_zone, M_NOWAIT);
}
static void
buf_trie_free(struct pctrie *ptree, void *node)
{
uma_zfree(buf_trie_zone, node);
}
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
/*
* Initialize the vnode management data structures.
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
* grows, the ratio of the memory size in KB to vnodes approaches 64:1.
*/
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
#endif
/*
* Initialize a vnode as it first enters the zone.
*/
static int
vnode_init(void *mem, int size, int flags)
{
struct vnode *vp;
struct bufobj *bo;
vp = mem;
bzero(vp, size);
/*
* Setup locks.
*/
vp->v_vnlock = &vp->v_lock;
mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
LK_NOSHARE | LK_IS_VNODE);
/*
* Initialize bufobj.
*/
bo = &vp->v_bufobj;
rw_init(BO_LOCKPTR(bo), "bufobj interlock");
bo->bo_private = vp;
TAILQ_INIT(&bo->bo_clean.bv_hd);
TAILQ_INIT(&bo->bo_dirty.bv_hd);
/*
* Initialize namecache.
*/
LIST_INIT(&vp->v_cache_src);
TAILQ_INIT(&vp->v_cache_dst);
/*
* Initialize rangelocks.
*/
rangelock_init(&vp->v_rl);
return (0);
}
/*
* Free a vnode when it is cleared from the zone.
*/
static void
vnode_fini(void *mem, int size)
{
struct vnode *vp;
struct bufobj *bo;
vp = mem;
rangelock_destroy(&vp->v_rl);
lockdestroy(vp->v_vnlock);
mtx_destroy(&vp->v_interlock);
bo = &vp->v_bufobj;
rw_destroy(BO_LOCKPTR(bo));
}
/*
* Provide the size of NFS nclnode and NFS fh for calculation of the
* vnode memory consumption. The size is specified directly to
* eliminate dependency on NFS-private header.
*
* Other filesystems may use bigger or smaller (like UFS and ZFS)
* private inode data, but the NFS-based estimation is ample enough.
* Still, we care about differences in the size between 64- and 32-bit
* platforms.
*
* Namecache structure size is heuristically
* sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
*/
#ifdef _LP64
#define NFS_NCLNODE_SZ (528 + 64)
#define NC_SZ 148
#else
#define NFS_NCLNODE_SZ (360 + 32)
#define NC_SZ 92
#endif
static void
vntblinit(void *dummy __unused)
{
u_int i;
int physvnodes, virtvnodes;
/*
* Desiredvnodes is a function of the physical memory size and the
* kernel's heap size. Generally speaking, it scales with the
* physical memory size. The ratio of desiredvnodes to the physical
* memory size is 1:16 until desiredvnodes exceeds 98,304. Thereafter,
* the marginal ratio of desiredvnodes to the physical memory size is
* 1:64. However, desiredvnodes is limited by the kernel's heap
* size. The memory required by desiredvnodes vnodes and vm objects
* must not exceed 1/10th of the kernel's heap size.
*/
physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
desiredvnodes = min(physvnodes, virtvnodes);
if (desiredvnodes > MAXVNODES_MAX) {
if (bootverbose)
printf("Reducing kern.maxvnodes %d -> %d\n",
desiredvnodes, MAXVNODES_MAX);
desiredvnodes = MAXVNODES_MAX;
}
wantfreevnodes = desiredvnodes / 4;
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
TAILQ_INIT(&vnode_free_list);
mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
* Preallocate enough nodes to support one node per buf so that
* we cannot fail an insert. reassignbuf() callers cannot
* tolerate the insertion failure.
*/
buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
UMA_ZONE_NOFREE | UMA_ZONE_VM);
uma_prealloc(buf_trie_zone, nbuf);
vnodes_created = counter_u64_alloc(M_WAITOK);
recycles_count = counter_u64_alloc(M_WAITOK);
free_owe_inact = counter_u64_alloc(M_WAITOK);
/*
* Initialize the filesystem syncer.
*/
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
&syncer_mask);
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
for (i = 1; i <= sizeof(struct vnode); i <<= 1)
vnsz2log++;
vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
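/*
 * [Editor's sketch, not part of the original file] A rough worked example
 * of the physvnodes formula in vntblinit() above for a hypothetical 8 GiB
 * machine with 4 KiB pages, so pgtok(vm_cnt.v_page_count) is about
 * 8388608 KB; maxproc is left out because it only adds a comparatively
 * small constant.
 */
#if 0
static long
physvnodes_sketch(void)
{
long memkb = 8L * 1024 * 1024; /* 8 GiB expressed in KB */
long cap = 98304L * 16; /* the second term saturates at 1.5 GiB */
long lo = memkb < cap ? memkb : cap;

/* 1:16 ratio below the cap, 1:64 marginal ratio above it. */
return (memkb / 64 + 3 * lo / 64); /* = 131072 + 73728 = 204800 */
}
#endif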
/*
* Mark a mount point as busy. Used to synchronize access and to delay
* unmounting. Note that mountlist_mtx is not released on failure.
*
* vfs_busy() is a custom lock, it can block the caller.
* vfs_busy() only sleeps if the unmount is active on the mount point.
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any
* vnode belonging to mp.
*
* Lookup uses vfs_busy() to traverse mount points.
* root fs var fs
* / vnode lock A / vnode lock (/var) D
* /var vnode lock B /log vnode lock(/var/log) E
* vfs_busy lock C vfs_busy lock F
*
* Within each file system, the lock order is C->A->B and F->D->E.
*
* When traversing across mounts, the system follows that lock order:
*
* C->A->B
* |
* +->F->D->E
*
* The lookup() process for namei("/var") illustrates the process:
* VOP_LOOKUP() obtains B while A is held
* vfs_busy() obtains a shared lock on F while A and B are held
* vput() releases lock on B
* vput() releases lock on A
* VFS_ROOT() obtains lock on D while shared lock on F is held
* vfs_unbusy() releases shared lock on F
* vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
* Attempt to lock A (instead of vp_crossmp) while D is held would
* violate the global order, causing deadlocks.
*
* dounmount() locks B while F is drained.
*/
int
vfs_busy(struct mount *mp, int flags)
{
MPASS((flags & ~MBF_MASK) == 0);
CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
MNT_ILOCK(mp);
MNT_REF(mp);
/*
* If mount point is currently being unmounted, sleep until the
* mount point fate is decided. If thread doing the unmounting fails,
* it will clear MNTK_UNMOUNT flag before waking us up, indicating
* that this mount point has survived the unmount attempt and vfs_busy
* should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
* flag in addition to MNTK_UNMOUNT, indicating that mount point is
* about to be really destroyed. vfs_busy needs to release its
* reference on the mount point in this case and return with ENOENT,
* telling the caller that the mount it tried to busy is no longer
* valid.
*/
while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
CTR1(KTR_VFS, "%s: failed busying before sleeping",
__func__);
return (ENOENT);
}
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_kern_flag |= MNTK_MWAIT;
msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
if (flags & MBF_MNTLSTLOCK)
mtx_lock(&mountlist_mtx);
MNT_ILOCK(mp);
}
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_lockref++;
MNT_IUNLOCK(mp);
return (0);
}
/*
* Free a busy filesystem.
*/
void
vfs_unbusy(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REL(mp);
KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
mp->mnt_lockref--;
if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
CTR1(KTR_VFS, "%s: waking up waiters", __func__);
mp->mnt_kern_flag &= ~MNTK_DRAINING;
wakeup(&mp->mnt_lockref);
}
MNT_IUNLOCK(mp);
}
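/*
 * [Editor's sketch, not part of the original file] A minimal illustration
 * of the caller pattern the lock order above is designed for: busy the
 * mount before dereferencing it, take the covered root vnode, then
 * unbusy. Error handling and the MBF_MNTLSTLOCK variant are omitted;
 * this is illustrative only, not code from this change.
 */
#if 0
static int
busy_root_sketch(struct mount *mp, struct vnode **vpp)
{
int error;

error = vfs_busy(mp, 0); /* blocks out an unmount of mp */
if (error != 0)
return (error);
error = VFS_ROOT(mp, LK_EXCLUSIVE, vpp); /* returns a locked root vnode */
vfs_unbusy(mp); /* an unmount may proceed again */
return (error);
}
#endif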
/*
* Lookup a mount point by filesystem identifier.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
struct mount *mp;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
vfs_ref(mp);
mtx_unlock(&mountlist_mtx);
return (mp);
}
}
mtx_unlock(&mountlist_mtx);
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
return ((struct mount *) 0);
}
/*
* Lookup a mount point by filesystem identifier, busying it before
* returning.
*
* To avoid congestion on mountlist_mtx, implement a simple direct-mapped
* cache for popular filesystem identifiers. The cache is lockless, relying
* on the fact that struct mount's are never freed. In the worst case we
* may get a pointer to an unmounted or even a different filesystem, so we
* have to check what we got and take the slow path if so.
*/
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define FSID_CACHE_SIZE 256
typedef struct mount * volatile vmp_t;
static vmp_t cache[FSID_CACHE_SIZE];
struct mount *mp;
int error;
uint32_t hash;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
hash = fsid->val[0] ^ fsid->val[1];
hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
mp = cache[hash];
if (mp == NULL ||
mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
goto slow;
if (vfs_busy(mp, 0) != 0) {
cache[hash] = NULL;
goto slow;
}
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
return (mp);
else
vfs_unbusy(mp);
slow:
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
error = vfs_busy(mp, MBF_MNTLSTLOCK);
if (error) {
cache[hash] = NULL;
mtx_unlock(&mountlist_mtx);
return (NULL);
}
cache[hash] = mp;
return (mp);
}
}
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
mtx_unlock(&mountlist_mtx);
return ((struct mount *) 0);
}
/*
* Check if a user can access privileged mount options.
*/
int
vfs_suser(struct mount *mp, struct thread *td)
{
int error;
/*
* If the thread is jailed, but this is not a jail-friendly file
* system, deny immediately.
*/
if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
return (EPERM);
/*
* If the file system was mounted outside the jail of the calling
* thread, deny immediately.
*/
if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
return (EPERM);
/*
* If file system supports delegated administration, we don't check
* for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
* by the file system itself.
* If this is not the user that did original mount, we check for
* the PRIV_VFS_MOUNT_OWNER privilege.
*/
if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
return (error);
}
return (0);
}
/*
* Get a new unique fsid. Try to make its val[0] unique, since this value
* will be used to create fake device numbers for stat(). Also try (but
* not so hard) to make its val[0] unique mod 2^16, since some emulators only
* support 16-bit device numbers. We end up with unique val[0]'s for the
* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
*
* Keep in mind that several mounts may be running in parallel. Starting
* the search one past where the previous search terminated is both a
* micro-optimization and a defense against returning the same fsid to
* different mounts.
*/
void
vfs_getnewfsid(struct mount *mp)
{
static uint16_t mntid_base;
struct mount *nmp;
fsid_t tfsid;
int mtype;
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
mtx_lock(&mntid_mtx);
mtype = mp->mnt_vfc->vfc_typenum;
tfsid.val[1] = mtype;
mtype = (mtype & 0xFF) << 24;
for (;;) {
tfsid.val[0] = makedev(255,
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
mntid_base++;
if ((nmp = vfs_getvfs(&tfsid)) == NULL)
break;
vfs_rel(nmp);
}
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
mtx_unlock(&mntid_mtx);
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
&timestamp_precision, 0, "File timestamp precision (0: seconds, "
"1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
"3+: sec + ns (max. precision))");
/*
* Get a current timestamp.
*/
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
vap->va_type = VNON;
vap->va_size = VNOVAL;
vap->va_bytes = VNOVAL;
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_atime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec = VNOVAL;
vap->va_mtime.tv_sec = VNOVAL;
vap->va_mtime.tv_nsec = VNOVAL;
vap->va_ctime.tv_sec = VNOVAL;
vap->va_ctime.tv_nsec = VNOVAL;
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_vaflags = 0;
}
/*
* This routine is called when we have too many vnodes. It attempts
* to free <count> vnodes and will potentially free vnodes that still
* have VM backing store (VM backing store is typically the cause
* of a vnode blowout so we want to do this). Therefore, this operation
* is not considered cheap.
*
* A number of conditions may prevent a vnode from being reclaimed.
* the buffer cache may have references on the vnode, a directory
* vnode may still have references due to the namei cache representing
* underlying files, or the vnode may be in active use. It is not
* desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*/
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
{
struct vnode *vp;
int count, done, target;
done = 0;
vn_start_write(NULL, &mp, V_WAIT);
MNT_ILOCK(mp);
count = mp->mnt_nvnodelistsize;
target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
target = target / 10 + 1;
while (count != 0 && done < target) {
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
if (vp == NULL)
break;
/*
* XXX LRU is completely broken for non-free vnodes. First
* by calling here in mountpoint order, then by moving
* unselected vnodes to the end here, and most grossly by
* removing the vlruvp() function that was supposed to
* maintain the order. (This function was born broken
* since syncer problems prevented it doing anything.) The
* order is closer to LRC (C = Created).
*
* LRU reclaiming of vnodes seems to have last worked in
* FreeBSD-3 where LRU wasn't mentioned under any spelling.
* Then there was no hold count, and inactive vnodes were
* simply put on the free list in LRU order. The separate
* lists also break LRU. We prefer to reclaim from the
* free list for technical reasons. This tends to thrash
* the free list to keep very unrecently used held vnodes.
* The problem is mitigated by keeping the free list large.
*/
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
--count;
if (!VI_TRYLOCK(vp))
goto next_iter;
/*
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
* Also skip free vnodes. We are trying to make space
* to expand the free list, not reduce it.
*/
if (vp->v_usecount ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
((vp->v_iflag & VI_FREE) != 0) ||
(vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VI_UNLOCK(vp);
goto next_iter;
}
MNT_IUNLOCK(mp);
vholdl(vp);
if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
vdrop(vp);
goto next_iter_mntunlocked;
}
VI_LOCK(vp);
/*
* v_usecount may have been bumped after VOP_LOCK() dropped
* the vnode interlock and before it was locked again.
*
* It is not necessary to recheck VI_DOOMED because it can
* only be set by another thread that holds both the vnode
* lock and vnode interlock. If another thread has the
* vnode lock before we get to VOP_LOCK() and obtains the
* vnode interlock after VOP_LOCK() drops the vnode
* interlock, the other thread will be unable to drop the
* vnode lock before our VOP_LOCK() call fails.
*/
if (vp->v_usecount ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
(vp->v_iflag & VI_FREE) != 0 ||
(vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK);
vdrop(vp);
goto next_iter_mntunlocked;
}
KASSERT((vp->v_iflag & VI_DOOMED) == 0,
("VI_DOOMED unexpectedly detected in vlrureclaim()"));
counter_u64_add(recycles_count, 1);
vgonel(vp);
VOP_UNLOCK(vp, 0);
vdropl(vp);
done++;
next_iter_mntunlocked:
if (!should_yield())
goto relock_mnt;
goto yield;
next_iter:
if (!should_yield())
continue;
MNT_IUNLOCK(mp);
yield:
kern_yield(PRI_USER);
relock_mnt:
MNT_ILOCK(mp);
}
MNT_IUNLOCK(mp);
vn_finished_write(mp);
return done;
}
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
0,
"limit on vnode free requests per call to the vnlru_free routine");
/*
* Attempt to reduce the free list by the requested amount.
*/
static void
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
struct vnode *vp;
struct mount *mp;
bool tried_batches;
tried_batches = false;
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
if (count > max_vnlru_free)
count = max_vnlru_free;
for (; count > 0; count--) {
vp = TAILQ_FIRST(&vnode_free_list);
/*
* The list can be modified while the free_list_mtx
* has been dropped and vp could be NULL here.
*/
if (vp == NULL) {
if (tried_batches)
break;
mtx_unlock(&vnode_free_list_mtx);
vnlru_return_batches(mnt_op);
tried_batches = true;
mtx_lock(&vnode_free_list_mtx);
continue;
}
VNASSERT(vp->v_op != NULL, vp,
("vnlru_free: vnode already reclaimed."));
KASSERT((vp->v_iflag & VI_FREE) != 0,
("Removing vnode not on freelist"));
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Mangling active vnode"));
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
/*
* Don't recycle if our vnode is from a different type
* of mount point. Note that mp is type-safe, so the
* check does not reach an unmapped address even if the
* vnode is reclaimed.
* Don't recycle if we can't get the interlock without
* blocking.
*/
if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
continue;
}
VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
vp, ("vp inconsistent on freelist"));
/*
* The clear of VI_FREE prevents activation of the
* vnode. There is no sense in putting the vnode on
* the mount point active list, only to remove it
* later during recycling. Inline the relevant part
* of vholdl(), to avoid triggering assertions or
* activating.
*/
freevnodes--;
vp->v_iflag &= ~VI_FREE;
refcount_acquire(&vp->v_holdcnt);
mtx_unlock(&vnode_free_list_mtx);
VI_UNLOCK(vp);
vtryrecycle(vp);
/*
* If the recycle succeeded, this vdrop will actually free
* the vnode. If not it will simply place it back on
* the free list.
*/
vdrop(vp);
mtx_lock(&vnode_free_list_mtx);
}
}
void
vnlru_free(int count, struct vfsops *mnt_op)
{
mtx_lock(&vnode_free_list_mtx);
vnlru_free_locked(count, mnt_op);
mtx_unlock(&vnode_free_list_mtx);
}
/* XXX some names and initialization are bad for limits and watermarks. */
static int
vspace(void)
{
int space;
gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
vlowat = vhiwat / 2;
if (numvnodes > desiredvnodes)
return (0);
space = desiredvnodes - numvnodes;
if (freevnodes > wantfreevnodes)
space += freevnodes - wantfreevnodes;
return (space);
}
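/*
 * [Editor's sketch, not part of the original file] The arithmetic behind
 * the 4%/9% watermarks and the "more than 566000 vnodes" figure quoted in
 * the comment near the top of the file, using the defaults computed in
 * vspace() above (wantfreevnodes = desiredvnodes / 4) and desiredvnodes
 * at its MAXVNODES_MAX cap.
 */
#if 0
static long
watermark_sketch(void)
{
long maxvnodes = 512L * 1024 * 1024 / 64; /* MAXVNODES_MAX, 8M */
long wantfree = maxvnodes / 4; /* default 25% */
long gap = maxvnodes - wantfree; /* 75%, the "available space" */

/* vhiwat = gap / 11 (~9%), vlowat = vhiwat / 2 (~4.5%). */
return (gap / 11); /* 571950, i.e. "more than 566000" to reclaim */
}
#endif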
static void
vnlru_return_batch_locked(struct mount *mp)
{
struct vnode *vp;
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
if (mp->mnt_tmpfreevnodelistsize == 0)
return;
TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
}
mtx_lock(&vnode_free_list_mtx);
TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
freevnodes += mp->mnt_tmpfreevnodelistsize;
mtx_unlock(&vnode_free_list_mtx);
mp->mnt_tmpfreevnodelistsize = 0;
}
static void
vnlru_return_batch(struct mount *mp)
{
mtx_lock(&mp->mnt_listmtx);
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
}
static void
vnlru_return_batches(struct vfsops *mnt_op)
{
struct mount *mp, *nmp;
bool need_unbusy;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
need_unbusy = false;
if (mnt_op != NULL && mp->mnt_op != mnt_op)
goto next;
if (mp->mnt_tmpfreevnodelistsize == 0)
goto next;
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
vnlru_return_batch(mp);
need_unbusy = true;
mtx_lock(&mountlist_mtx);
}
next:
nmp = TAILQ_NEXT(mp, mnt_list);
if (need_unbusy)
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
}
/*
* Attempt to recycle vnodes in a context that is always safe to block.
* Calling vlrurecycle() from the bowels of filesystem code has some
* interesting deadlock problems.
*/
static struct proc *vnlruproc;
static int vnlruproc_sig;
static void
vnlru_proc(void)
{
struct mount *mp, *nmp;
- unsigned long ofreevnodes, onumvnodes;
+ unsigned long onumvnodes;
int done, force, reclaim_nc_src, trigger, usevnodes;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
SHUTDOWN_PRI_FIRST);
force = 0;
for (;;) {
kproc_suspend_check(vnlruproc);
mtx_lock(&vnode_free_list_mtx);
/*
* If numvnodes is too large (due to desiredvnodes being
* adjusted using its sysctl, or emergency growth), first
* try to reduce it by discarding from the free list.
*/
if (numvnodes > desiredvnodes)
vnlru_free_locked(numvnodes - desiredvnodes, NULL);
/*
* Sleep if the vnode cache is in a good state. This is
* when it is not over-full and has space for about a 4%
* or 9% expansion (by growing its size or moderately
* reducing its free list). Otherwise, try to reclaim
* space for a 10% expansion.
*/
if (vstir && force == 0) {
force = 1;
vstir = 0;
}
if (vspace() >= vlowat && force == 0) {
vnlruproc_sig = 0;
wakeup(&vnlruproc_sig);
msleep(vnlruproc, &vnode_free_list_mtx,
PVFS|PDROP, "vlruwt", hz);
continue;
}
mtx_unlock(&vnode_free_list_mtx);
done = 0;
- ofreevnodes = freevnodes;
onumvnodes = numvnodes;
/*
* Calculate parameters for recycling. These are the same
* throughout the loop to give some semblance of fairness.
* The trigger point is to avoid recycling vnodes with lots
* of resident pages. We aren't trying to free memory; we
* are trying to recycle or at least free vnodes.
*/
if (numvnodes <= desiredvnodes)
usevnodes = numvnodes - freevnodes;
else
usevnodes = numvnodes;
if (usevnodes <= 0)
usevnodes = 1;
/*
* The trigger value is chosen to give a conservatively
* large value to ensure that it alone doesn't prevent
* making progress. The value can easily be so large that
* it is effectively infinite in some congested and
* misconfigured cases, and this is necessary. Normally
* it is about 8 to 100 (pages), which is quite large.
*/
trigger = vm_cnt.v_page_count * 2 / usevnodes;
if (force < 2)
trigger = vsmalltrigger;
reclaim_nc_src = force >= 3;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
done += vlrureclaim(mp, reclaim_nc_src, trigger);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
uma_reclaim();
if (done == 0) {
if (force == 0 || force == 1) {
force = 2;
continue;
}
if (force == 2) {
force = 3;
continue;
}
force = 0;
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
kern_yield(PRI_USER);
/*
* After becoming active to expand above low water, keep
* active until above high water.
*/
force = vspace() < vhiwat;
}
}
static struct kproc_desc vnlru_kp = {
"vnlru",
vnlru_proc,
&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
&vnlru_kp);
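/*
 * [Editor's sketch, not part of the original file] A standalone
 * illustration of the trigger computation in vnlru_proc() above, for a
 * hypothetical machine with 2M physical pages (8 GiB with 4 KiB pages)
 * and roughly 200000 in-use vnodes; the result sits inside the "about 8
 * to 100 (pages)" range that the comment mentions.
 */
#if 0
static long
vlru_trigger_sketch(void)
{
long page_count = 2L * 1024 * 1024; /* hypothetical vm_cnt.v_page_count */
long usevnodes = 200000; /* hypothetical numvnodes - freevnodes */

return (page_count * 2 / usevnodes); /* = 20 resident pages per vnode */
}
#endif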
/*
* Routines having to do with the management of the vnode table.
*/
/*
* Try to recycle a freed vnode. We abort if anyone picks up a reference
* before we actually vgone(). This function must be called with the vnode
* held to prevent the vnode from being returned to the free list midway
* through vgone().
*/
static int
vtryrecycle(struct vnode *vp)
{
struct mount *vnmp;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
VNASSERT(vp->v_holdcnt, vp,
("vtryrecycle: Recycling vp %p without a reference.", vp));
/*
* This vnode may be found and locked via some other list; if so we
* can't recycle it yet.
*/
if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
CTR2(KTR_VFS,
"%s: impossible to recycle, vp %p lock is already held",
__func__, vp);
return (EWOULDBLOCK);
}
/*
* Don't recycle if its filesystem is being suspended.
*/
if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
VOP_UNLOCK(vp, 0);
CTR2(KTR_VFS,
"%s: impossible to recycle, cannot start the write for %p",
__func__, vp);
return (EBUSY);
}
/*
* If we got this far, we need to acquire the interlock and see if
* anyone picked up this vnode from another list. If not, we will
* mark it with DOOMED via vgonel() so that anyone who does find it
* will skip over it.
*/
VI_LOCK(vp);
if (vp->v_usecount) {
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
CTR2(KTR_VFS,
"%s: impossible to recycle, %p is already referenced",
__func__, vp);
return (EBUSY);
}
if ((vp->v_iflag & VI_DOOMED) == 0) {
counter_u64_add(recycles_count, 1);
vgonel(vp);
}
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
return (0);
}
static void
vcheckspace(void)
{
if (vspace() < vlowat && vnlruproc_sig == 0) {
vnlruproc_sig = 1;
wakeup(vnlruproc);
}
}
/*
* Wait if necessary for space for a new vnode.
*/
static int
getnewvnode_wait(int suspended)
{
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
if (numvnodes >= desiredvnodes) {
if (suspended) {
/*
* The file system is being suspended. We cannot
* risk a deadlock here, so allow allocation of
* another vnode even if this would give too many.
*/
return (0);
}
if (vnlruproc_sig == 0) {
vnlruproc_sig = 1; /* avoid unnecessary wakeups */
wakeup(vnlruproc);
}
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
"vlruwk", hz);
}
/* Post-adjust like the pre-adjust in getnewvnode(). */
if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
vnlru_free_locked(1, NULL);
return (numvnodes >= desiredvnodes ? ENFILE : 0);
}
/*
* This hack is fragile, and probably not needed any more now that the
* watermark handling works.
*/
void
getnewvnode_reserve(u_int count)
{
struct thread *td;
/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
/* XXX no longer so quick, but this part is not racy. */
mtx_lock(&vnode_free_list_mtx);
if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
freevnodes - wantfreevnodes), NULL);
mtx_unlock(&vnode_free_list_mtx);
td = curthread;
/* First try to be quick and racy. */
if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
td->td_vp_reserv += count;
vcheckspace(); /* XXX no longer so quick, but more racy */
return;
} else
atomic_subtract_long(&numvnodes, count);
mtx_lock(&vnode_free_list_mtx);
while (count > 0) {
if (getnewvnode_wait(0) == 0) {
count--;
td->td_vp_reserv++;
atomic_add_long(&numvnodes, 1);
}
}
vcheckspace();
mtx_unlock(&vnode_free_list_mtx);
}
/*
* This hack is fragile, especially if desiredvnodes or wantfreevnodes are
* misconfigured or changed significantly. Reducing desiredvnodes below
* the reserved amount should cause bizarre behaviour like reducing it
* below the number of active vnodes -- the system will try to reduce
* numvnodes to match, but should fail, so the subtraction below should
* not overflow.
*/
void
getnewvnode_drop_reserve(void)
{
struct thread *td;
td = curthread;
atomic_subtract_long(&numvnodes, td->td_vp_reserv);
td->td_vp_reserv = 0;
}
/*
* Return the next vnode from the free list.
*/
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
struct vnode **vpp)
{
struct vnode *vp;
struct thread *td;
struct lock_object *lo;
static int cyclecount;
int error;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
vp = NULL;
td = curthread;
if (td->td_vp_reserv > 0) {
td->td_vp_reserv -= 1;
goto alloc;
}
mtx_lock(&vnode_free_list_mtx);
if (numvnodes < desiredvnodes)
cyclecount = 0;
else if (cyclecount++ >= freevnodes) {
cyclecount = 0;
vstir = 1;
}
/*
* Grow the vnode cache if it will not be above its target max
* after growing. Otherwise, if the free list is nonempty, try
* to reclaim 1 item from it before growing the cache (possibly
* above its target max if the reclamation failed or is delayed).
* Otherwise, wait for some space. In all cases, schedule
* vnlru_proc() if we are getting short of space. The watermarks
* should be chosen so that we never wait or even reclaim from
* the free list to below its target minimum.
*/
if (numvnodes + 1 <= desiredvnodes)
;
else if (freevnodes > 0)
vnlru_free_locked(1, NULL);
else {
error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
MNTK_SUSPEND));
#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
if (error != 0) {
mtx_unlock(&vnode_free_list_mtx);
return (error);
}
#endif
}
vcheckspace();
atomic_add_long(&numvnodes, 1);
mtx_unlock(&vnode_free_list_mtx);
alloc:
counter_u64_add(vnodes_created, 1);
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
/*
* Locks are given the generic name "vnode" when created.
* Follow the historic practice of using the filesystem
* name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
*
* Locks live in a witness group keyed on their name. Thus,
* when a lock is renamed, it must also move from the witness
* group of its old name to the witness group of its new name.
*
* The change only needs to be made when the vnode moves
* from one filesystem type to another. We ensure that each
* filesystem uses a single static name pointer for its tag so
* that we can compare pointers rather than doing a strcmp().
*/
lo = &vp->v_vnlock->lock_object;
if (lo->lo_name != tag) {
lo->lo_name = tag;
WITNESS_DESTROY(lo);
WITNESS_INIT(lo, tag);
}
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
/*
* Finalize various vnode identity bits.
*/
KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
vp->v_type = VNON;
vp->v_tag = tag;
vp->v_op = vops;
v_init_counters(vp);
vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
if (mp == NULL && vops != &dead_vnodeops)
printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
mac_vnode_init(vp);
if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
mac_vnode_associate_singlelabel(mp, vp);
#endif
if (mp != NULL) {
vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
}
/*
* For the filesystems which do not use vfs_hash_insert(),
* still initialize v_hash to have vfs_hash_index() useful.
* E.g., nullfs uses vfs_hash_index() on the lower vnode for
* its own hashing.
*/
vp->v_hash = (uintptr_t)vp >> vnsz2log;
*vpp = vp;
return (0);
}
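/*
 * [Editor's sketch, not part of the original file] Why shifting the vnode
 * address by vnsz2log gives a usable v_hash: vnsz2log is
 * floor(log2(sizeof(struct vnode))) (computed in vntblinit() above), so
 * addresses of objects that are at least sizeof(struct vnode) apart still
 * shift to distinct values. The structure size below is hypothetical.
 */
#if 0
static int
vnsz2log_sketch(void)
{
unsigned long sz = 488; /* hypothetical sizeof(struct vnode) */
unsigned long i;
int log2floor = 0;

/* Same loop shape as vntblinit(): compute floor(log2(sz)). */
for (i = 1; i <= sz; i <<= 1)
log2floor++;
return (log2floor - 1); /* 8, since 256 <= 488 < 512 */
}
#endif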
/*
* Delete from old mount point vnode list, if on one.
*/
static void
delmntque(struct vnode *vp)
{
struct mount *mp;
int active;
mp = vp->v_mount;
if (mp == NULL)
return;
MNT_ILOCK(mp);
VI_LOCK(vp);
KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
("Active vnode list size %d > Vnode list size %d",
mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
active = vp->v_iflag & VI_ACTIVE;
vp->v_iflag &= ~VI_ACTIVE;
if (active) {
mtx_lock(&mp->mnt_listmtx);
TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
mtx_unlock(&mp->mnt_listmtx);
}
vp->v_mount = NULL;
VI_UNLOCK(vp);
VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
("bad mount point vnode list size"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
mp->mnt_nvnodelistsize--;
MNT_REL(mp);
MNT_IUNLOCK(mp);
}
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{
vp->v_data = NULL;
vp->v_op = &dead_vnodeops;
vgone(vp);
vput(vp);
}
/*
* Insert into list of vnodes for the new mount point, if available.
*/
int
insmntque1(struct vnode *vp, struct mount *mp,
void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
KASSERT(vp->v_mount == NULL,
("insmntque: vnode already on per mount vnode list"));
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
/*
* We acquire the vnode interlock early to ensure that the
* vnode cannot be recycled by another process releasing a
* holdcnt on it before we get it on both the vnode list
* and the active vnode list. The mount mutex protects only
* manipulation of the vnode list and the vnode freelist
* mutex protects only manipulation of the active vnode list.
* Hence the need to hold the vnode interlock throughout.
*/
MNT_ILOCK(mp);
VI_LOCK(vp);
if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
mp->mnt_nvnodelistsize == 0)) &&
(vp->v_vflag & VV_FORCEINSMQ) == 0) {
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
if (dtr != NULL)
dtr(vp, dtr_arg);
return (EBUSY);
}
vp->v_mount = mp;
MNT_REF(mp);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
("neg mount point vnode list size"));
mp->mnt_nvnodelistsize++;
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Activating already active vnode"));
vp->v_iflag |= VI_ACTIVE;
mtx_lock(&mp->mnt_listmtx);
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize++;
mtx_unlock(&mp->mnt_listmtx);
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
return (0);
}
int
insmntque(struct vnode *vp, struct mount *mp)
{
return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}
/*
* Flush out and invalidate all buffers associated with a bufobj
* Called with the underlying object locked.
*/
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
int error;
BO_LOCK(bo);
if (flags & V_SAVE) {
error = bufobj_wwait(bo, slpflag, slptimeo);
if (error) {
BO_UNLOCK(bo);
return (error);
}
if (bo->bo_dirty.bv_cnt > 0) {
BO_UNLOCK(bo);
if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
return (error);
/*
* XXX We could save a lock/unlock if this was only
* enabled under INVARIANTS
*/
BO_LOCK(bo);
if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
panic("vinvalbuf: dirty bufs");
}
}
/*
* If you alter this loop please notice that interlock is dropped and
* reacquired in flushbuflist. Special care is needed to ensure that
* no race conditions occur from this.
*/
do {
error = flushbuflist(&bo->bo_clean,
flags, bo, slpflag, slptimeo);
if (error == 0 && !(flags & V_CLEANONLY))
error = flushbuflist(&bo->bo_dirty,
flags, bo, slpflag, slptimeo);
if (error != 0 && error != EAGAIN) {
BO_UNLOCK(bo);
return (error);
}
} while (error != 0);
/*
* Wait for I/O to complete. XXX needs cleaning up. The vnode can
* have write I/O in-progress but if there is a VM object then the
* VM object can also have read-I/O in-progress.
*/
do {
bufobj_wwait(bo, 0, 0);
if ((flags & V_VMIO) == 0) {
BO_UNLOCK(bo);
if (bo->bo_object != NULL) {
VM_OBJECT_WLOCK(bo->bo_object);
vm_object_pip_wait(bo->bo_object, "bovlbx");
VM_OBJECT_WUNLOCK(bo->bo_object);
}
BO_LOCK(bo);
}
} while (bo->bo_numoutput > 0);
BO_UNLOCK(bo);
/*
* Destroy the copy in the VM cache, too.
*/
if (bo->bo_object != NULL &&
(flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
VM_OBJECT_WLOCK(bo->bo_object);
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
OBJPR_CLEANONLY : 0);
VM_OBJECT_WUNLOCK(bo->bo_object);
}
#ifdef INVARIANTS
BO_LOCK(bo);
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
bo->bo_clean.bv_cnt > 0))
panic("vinvalbuf: flush failed");
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
bo->bo_dirty.bv_cnt > 0)
panic("vinvalbuf: flush dirty failed");
BO_UNLOCK(bo);
#endif
return (0);
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying object locked.
*/
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
ASSERT_VOP_LOCKED(vp, "vinvalbuf");
if (vp->v_object != NULL && vp->v_object->handle != vp)
return (0);
return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}
/*
* Flush out buffers on the specified list.
*
*/
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
int slptimeo)
{
struct buf *bp, *nbp;
int retval, error;
daddr_t lblkno;
b_xflags_t xflags;
ASSERT_BO_WLOCKED(bo);
retval = 0;
TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
continue;
}
if (nbp != NULL) {
lblkno = nbp->b_lblkno;
xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
}
retval = EAGAIN;
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
"flushbuf", slpflag, slptimeo);
if (error) {
BO_LOCK(bo);
return (error != ENOLCK ? error : EAGAIN);
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
/*
* XXX Since there are no node locks for NFS, I
* believe there is a slight chance that a delayed
* write will occur while sleeping just above, so
* check for it.
*/
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
(flags & V_SAVE)) {
bremfree(bp);
bp->b_flags |= B_ASYNC;
bwrite(bp);
BO_LOCK(bo);
return (EAGAIN); /* XXX: why not loop ? */
}
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
BO_LOCK(bo);
if (nbp == NULL)
break;
nbp = gbincore(bo, lblkno);
if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
!= xflags)
break; /* nbp invalid */
}
return (retval);
}
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
struct buf *bp;
int error;
daddr_t lblkno;
ASSERT_BO_LOCKED(bo);
for (lblkno = startn;;) {
again:
bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
if (bp == NULL || bp->b_lblkno >= endn ||
bp->b_lblkno < startn)
break;
error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
if (error != 0) {
BO_RLOCK(bo);
if (error == ENOLCK)
goto again;
return (error);
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
lblkno = bp->b_lblkno + 1;
if ((bp->b_flags & B_MANAGED) == 0)
bremfree(bp);
bp->b_flags |= B_RELBUF;
/*
* In the VMIO case, use the B_NOREUSE flag to hint that the
* pages backing each buffer in the range are unlikely to be
* reused. Dirty buffers will have the hint applied once
* they've been written.
*/
if (bp->b_vp->v_object != NULL)
bp->b_flags |= B_NOREUSE;
brelse(bp);
BO_RLOCK(bo);
}
return (0);
}
/*
* Truncate a file's buffer and pages to a specified length. This
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
* sync activity.
*/
int
vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
{
struct buf *bp, *nbp;
int anyfreed;
int trunclbn;
struct bufobj *bo;
CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
vp, cred, blksize, (uintmax_t)length);
/*
* Round up to the *next* lbn.
*/
trunclbn = howmany(length, blksize);
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
bo = &vp->v_bufobj;
BO_LOCK(bo);
anyfreed = 1;
for (;anyfreed;) {
anyfreed = 0;
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = 1;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNCLEAN) == 0) ||
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI))) {
BO_UNLOCK(bo);
goto restart;
}
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = 1;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNDIRTY) == 0) ||
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI) == 0)) {
BO_UNLOCK(bo);
goto restart;
}
}
}
if (length > 0) {
restartsync:
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno > 0)
continue;
/*
* Since we hold the vnode lock this should only
* fail if we're racing with the buf daemon.
*/
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK) {
goto restart;
}
VNASSERT((bp->b_flags & B_DELWRI), vp,
("buf(%p) on dirty queue without DELWRI", bp));
bremfree(bp);
bawrite(bp);
BO_LOCK(bo);
goto restartsync;
}
}
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
vnode_pager_setsize(vp, length);
return (0);
}
static void
buf_vlist_remove(struct buf *bp)
{
struct bufv *bv;
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
ASSERT_BO_WLOCKED(bp->b_bufobj);
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
(BX_VNDIRTY|BX_VNCLEAN),
("buf_vlist_remove: Buf %p is on two lists", bp));
if (bp->b_xflags & BX_VNDIRTY)
bv = &bp->b_bufobj->bo_dirty;
else
bv = &bp->b_bufobj->bo_clean;
BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
bv->bv_cnt--;
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
/*
* Add the buffer to the sorted clean or dirty block list.
*
* NOTE: xflags is passed as a constant, optimizing this inline function!
*/
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
struct bufv *bv;
struct buf *n;
int error;
ASSERT_BO_WLOCKED(bo);
KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
("dead bo %p", bo));
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
bp->b_xflags |= xflags;
if (xflags & BX_VNDIRTY)
bv = &bo->bo_dirty;
else
bv = &bo->bo_clean;
/*
* Keep the list ordered. Optimize empty list insertion. Assume
* we tend to grow at the tail so lookup_le should usually be cheaper
* than _ge.
*/
if (bv->bv_cnt == 0 ||
bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
else
TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
if (error)
panic("buf_vlist_add: Preallocated nodes insufficient.");
bv->bv_cnt++;
}
/*
* Look up a buffer using the buffer tries.
*/
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
struct buf *bp;
ASSERT_BO_LOCKED(bo);
bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
if (bp != NULL)
return (bp);
return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
}
/*
* Associate a buffer with a vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
struct bufobj *bo;
bo = &vp->v_bufobj;
ASSERT_BO_WLOCKED(bo);
VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
("bgetvp: bp already attached! %p", bp));
vhold(vp);
bp->b_vp = vp;
bp->b_bufobj = bo;
/*
* Insert onto list for new vnode.
*/
buf_vlist_add(bp, bo, BX_VNCLEAN);
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
{
struct bufobj *bo;
struct vnode *vp;
CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
/*
* Delete from old vnode list, if on one.
*/
vp = bp->b_vp; /* XXX */
bo = bp->b_bufobj;
BO_LOCK(bo);
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic("brelvp: Buffer %p not on queue.", bp);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
bo->bo_flag &= ~BO_ONWORKLST;
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
}
bp->b_vp = NULL;
bp->b_bufobj = NULL;
BO_UNLOCK(bo);
vdrop(vp);
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
int slot;
ASSERT_BO_WLOCKED(bo);
mtx_lock(&sync_mtx);
if (bo->bo_flag & BO_ONWORKLST)
LIST_REMOVE(bo, bo_synclist);
else {
bo->bo_flag |= BO_ONWORKLST;
syncer_worklist_len++;
}
if (delay > syncer_maxdelay - 2)
delay = syncer_maxdelay - 2;
slot = (syncer_delayno + delay) & syncer_mask;
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
mtx_unlock(&sync_mtx);
}
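/*
 * Worked example (illustrative, assuming syncer_mask == syncer_maxdelay - 1
 * with syncer_maxdelay a power of two): the pending table is a ring of
 * one-second buckets.  A request made at syncer_delayno == 30 with
 * delay == 5 and syncer_maxdelay == 32 lands in slot (30 + 5) & 0x1f == 3,
 * i.e. it wraps past the end of the table and still fires about five
 * passes later.
 */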
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
int error, len;
mtx_lock(&sync_mtx);
len = syncer_worklist_len - sync_vnode_count;
mtx_unlock(&sync_mtx);
error = SYSCTL_OUT(req, &len, sizeof(len));
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
"syncer",
sched_sync,
&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
struct vnode *vp;
struct mount *mp;
*bo = LIST_FIRST(slp);
if (*bo == NULL)
return (0);
vp = bo2vnode(*bo);
if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
return (1);
/*
* We use vhold in case the vnode does not
* successfully sync. vhold prevents the vnode from
* going away when we unlock the sync_mtx so that
* we can acquire the vnode interlock.
*/
vholdl(vp);
mtx_unlock(&sync_mtx);
VI_UNLOCK(vp);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
vdrop(vp);
mtx_lock(&sync_mtx);
return (*bo == LIST_FIRST(slp));
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
(void) VOP_FSYNC(vp, MNT_LAZY, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
BO_LOCK(*bo);
if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/
vn_syncer_add_to_worklist(*bo, syncdelay);
}
BO_UNLOCK(*bo);
vdrop(vp);
mtx_lock(&sync_mtx);
return (0);
}
static int first_printf = 1;
/*
* System filesystem synchronizer daemon.
*/
static void
sched_sync(void)
{
struct synclist *next, *slp;
struct bufobj *bo;
long starttime;
struct thread *td = curthread;
int last_work_seen;
int net_worklist_len;
int syncer_final_iter;
int error;
last_work_seen = 0;
syncer_final_iter = 0;
syncer_state = SYNCER_RUNNING;
starttime = time_uptime;
td->td_pflags |= TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
SHUTDOWN_PRI_LAST);
mtx_lock(&sync_mtx);
for (;;) {
if (syncer_state == SYNCER_FINAL_DELAY &&
syncer_final_iter == 0) {
mtx_unlock(&sync_mtx);
kproc_suspend_check(td->td_proc);
mtx_lock(&sync_mtx);
}
net_worklist_len = syncer_worklist_len - sync_vnode_count;
if (syncer_state != SYNCER_RUNNING &&
starttime != time_uptime) {
if (first_printf) {
printf("\nSyncing disks, vnodes remaining... ");
first_printf = 0;
}
printf("%d ", net_worklist_len);
}
starttime = time_uptime;
/*
* Push files whose dirty time has expired. Be careful
* of interrupt race on slp queue.
*
* Skip over empty worklist slots when shutting down.
*/
do {
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno == syncer_maxdelay)
syncer_delayno = 0;
next = &syncer_workitem_pending[syncer_delayno];
/*
* If the worklist has wrapped since it was
* emptied of all but syncer vnodes,
* switch to the FINAL_DELAY state and run
* for one more second.
*/
if (syncer_state == SYNCER_SHUTTING_DOWN &&
net_worklist_len == 0 &&
last_work_seen == syncer_delayno) {
syncer_state = SYNCER_FINAL_DELAY;
syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
}
} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
syncer_worklist_len > 0);
/*
* Keep track of the last time there was anything
* on the worklist other than syncer vnodes.
* Return to the SHUTTING_DOWN state if any
* new work appears.
*/
if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
last_work_seen = syncer_delayno;
if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
syncer_state = SYNCER_SHUTTING_DOWN;
while (!LIST_EMPTY(slp)) {
error = sync_vnode(slp, &bo, td);
if (error == 1) {
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(next, bo, bo_synclist);
continue;
}
if (first_printf == 0) {
/*
* Drop the sync mutex, because some watchdog
* drivers need to sleep while patting the watchdog.
*/
mtx_unlock(&sync_mtx);
wdog_kern_pat(WD_LASTVAL);
mtx_lock(&sync_mtx);
}
}
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
syncer_final_iter--;
/*
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
* value of N tells the filesystem syncer to process the next
* N seconds worth of work on its queue ASAP. Currently rushjob
* is used by the soft update code to speed up the filesystem
* syncer process when the incore state is getting so far
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
*/
if (rushjob > 0) {
rushjob -= 1;
continue;
}
/*
* Just sleep for a short period of time between
* iterations when shutting down to allow some I/O
* to happen.
*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (syncer_state != SYNCER_RUNNING ||
time_uptime == starttime) {
thread_lock(td);
sched_prio(td, PPAUSE);
thread_unlock(td);
}
if (syncer_state != SYNCER_RUNNING)
cv_timedwait(&sync_wakeup, &sync_mtx,
hz / SYNCER_SHUTDOWN_SPEEDUP);
else if (time_uptime == starttime)
cv_timedwait(&sync_wakeup, &sync_mtx, hz);
}
}
/*
* Request the syncer daemon to speed up its work.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
int
speedup_syncer(void)
{
int ret = 0;
mtx_lock(&sync_mtx);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;
ret = 1;
}
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
return (ret);
}
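/*
 * Usage sketch: a subsystem that sees dirty state accumulating faster
 * than the syncer drains it can nudge the syncer along, guarded by a
 * hypothetical check of its own, e.g.
 *
 *	if (dirty_backlog_is_large())
 *		(void)speedup_syncer();
 *
 * Each granted request makes the syncer process one extra second of its
 * worklist without pausing, and at most syncdelay / 2 requests may be
 * outstanding at once.
 */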
/*
* Tell the syncer to speed up its work and run though its work
* list several times, then tell it to shut down.
*/
static void
syncer_shutdown(void *arg, int howto)
{
if (howto & RB_NOSYNC)
return;
mtx_lock(&sync_mtx);
syncer_state = SYNCER_SHUTTING_DOWN;
rushjob = 0;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_shutdown(arg, howto);
}
void
syncer_suspend(void)
{
syncer_shutdown(updateproc, 0);
}
void
syncer_resume(void)
{
mtx_lock(&sync_mtx);
first_printf = 1;
syncer_state = SYNCER_RUNNING;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_resume(updateproc);
}
/*
* Reassign a buffer from one vnode to another.
* Used to assign file specific control information
* (indirect blocks) to the vnode to which they belong.
*/
void
reassignbuf(struct buf *bp)
{
struct vnode *vp;
struct bufobj *bo;
int delay;
#ifdef INVARIANTS
struct bufv *bv;
#endif
vp = bp->b_vp;
bo = bp->b_bufobj;
++reassignbufcalls;
CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
/*
* B_PAGING flagged buffers cannot be reassigned because their vp
* is not fully linked in.
*/
if (bp->b_flags & B_PAGING)
panic("cannot reassign paging buffer");
/*
* Delete from old vnode list, if on one.
*/
BO_LOCK(bo);
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic("reassignbuf: Buffer %p not on queue.", bp);
/*
* If dirty, put on list of dirty buffers; otherwise insert onto list
* of clean buffers.
*/
if (bp->b_flags & B_DELWRI) {
if ((bo->bo_flag & BO_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delay = dirdelay;
break;
case VCHR:
delay = metadelay;
break;
default:
delay = filedelay;
}
vn_syncer_add_to_worklist(bo, delay);
}
buf_vlist_add(bp, bo, BX_VNDIRTY);
} else {
buf_vlist_add(bp, bo, BX_VNCLEAN);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
bo->bo_flag &= ~BO_ONWORKLST;
}
}
#ifdef INVARIANTS
bv = &bo->bo_clean;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bv = &bo->bo_dirty;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
#endif
BO_UNLOCK(bo);
}
/*
* A temporary hack until refcount_* APIs are sorted out.
*/
static __inline int
vfs_refcount_acquire_if_not_zero(volatile u_int *count)
{
u_int old;
old = *count;
for (;;) {
if (old == 0)
return (0);
if (atomic_fcmpset_int(count, &old, old + 1))
return (1);
}
}
static __inline int
vfs_refcount_release_if_not_last(volatile u_int *count)
{
u_int old;
old = *count;
for (;;) {
if (old == 1)
return (0);
if (atomic_fcmpset_int(count, &old, old - 1))
return (1);
}
}
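/*
 * Sketch (userland analogue, not kernel code): both helpers above are
 * conditional reference-count updates built on a compare-and-swap loop.
 * The same acquire-if-not-zero idea expressed with C11 atomics:
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static bool
acquire_if_not_zero(atomic_uint *count)
{
	unsigned int old;

	old = atomic_load(count);
	while (old != 0) {
		/* On failure the current value is reloaded into 'old'. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return (true);
	}
	return (false);
}
#endif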
static void
v_init_counters(struct vnode *vp)
{
VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
vp, ("%s called for an initialized vnode", __FUNCTION__));
ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
refcount_init(&vp->v_holdcnt, 1);
refcount_init(&vp->v_usecount, 1);
}
static void
v_incr_usecount_locked(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
if ((vp->v_iflag & VI_OWEINACT) != 0) {
VNASSERT(vp->v_usecount == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
vp->v_iflag &= ~VI_OWEINACT;
}
refcount_acquire(&vp->v_usecount);
v_incr_devcount(vp);
}
/*
* Increment the use count on the vnode, taking care to reference
* the driver's usecount if this is a chardev.
*/
static void
v_incr_usecount(struct vnode *vp)
{
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (vp->v_type != VCHR &&
vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
} else {
VI_LOCK(vp);
v_incr_usecount_locked(vp);
VI_UNLOCK(vp);
}
}
/*
* Increment si_usecount of the associated device, if any.
*/
static void
v_incr_devcount(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __FUNCTION__);
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
vp->v_rdev->si_usecount++;
dev_unlock();
}
}
/*
* Decrement si_usecount of the associated device, if any.
*/
static void
v_decr_devcount(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __FUNCTION__);
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
vp->v_rdev->si_usecount--;
dev_unlock();
}
}
/*
* Grab a particular vnode from the free list, increment its
* reference count and lock it. VI_DOOMED is set if the vnode
* is being destroyed. Only callers who specify LK_RETRY will
* see doomed vnodes. If inactive processing was delayed in
* vput try to do it here.
*
* Notes on lockless counter manipulation:
* _vhold, vputx and other routines make various decisions based
* on either holdcnt or usecount being 0. As long as either counter
* is not transitioning 0->1 nor 1->0, the manipulation can be done
* with atomic operations. Otherwise the interlock is taken covering
* both the atomic and additional actions.
*/
int
vget(struct vnode *vp, int flags, struct thread *td)
{
int error, oweinact;
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vget: invalid lock operation"));
if ((flags & LK_INTERLOCK) != 0)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
if ((flags & LK_VNHELD) != 0)
VNASSERT((vp->v_holdcnt > 0), vp,
("vget: LK_VNHELD passed but vnode not held"));
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
if ((flags & LK_VNHELD) == 0)
_vhold(vp, (flags & LK_INTERLOCK) != 0);
if ((error = vn_lock(vp, flags)) != 0) {
vdrop(vp);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
}
if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
panic("vget: vn_lock failed to return ENOENT\n");
/*
* We don't guarantee that any particular close will
* trigger inactive processing so just make a best effort
* here at preventing a reference to a removed file. If
* we don't succeed no harm is done.
*
* Upgrade our holdcnt to a usecount.
*/
if (vp->v_type == VCHR ||
!vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_OWEINACT) == 0) {
oweinact = 0;
} else {
oweinact = 1;
vp->v_iflag &= ~VI_OWEINACT;
}
refcount_acquire(&vp->v_usecount);
v_incr_devcount(vp);
if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
(flags & LK_NOWAIT) == 0)
vinactive(vp, td);
VI_UNLOCK(vp);
}
return (0);
}
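/*
 * Usage sketch (hypothetical caller): the common pattern is to pair
 * vget() with vput(), which unlocks and drops the use reference in one
 * step.  The sketch assumes the caller already has some guarantee (a
 * hold, the interlock, or a lookup result) that vp remains valid.
 */
#if 0
static int
example_with_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (error != 0)
		return (error);
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
	return (0);
}
#endif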
/*
* Increase the reference (use) and hold count of a vnode.
* This will also remove the vnode from the free list if it is presently free.
*/
void
vref(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
_vhold(vp, false);
v_incr_usecount(vp);
}
void
vrefl(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
_vhold(vp, true);
v_incr_usecount_locked(vp);
}
void
vrefact(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (__predict_false(vp->v_type == VCHR)) {
VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
("%s: wrong ref counts", __func__));
vref(vp);
return;
}
#ifdef INVARIANTS
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
old = atomic_fetchadd_int(&vp->v_usecount, 1);
VNASSERT(old > 0, vp, ("%s: wrong use count", __func__));
#else
refcount_acquire(&vp->v_holdcnt);
refcount_acquire(&vp->v_usecount);
#endif
}
/*
* Return reference count of a vnode.
*
* The results of this call are only guaranteed when some mechanism is used to
* stop other processes from gaining references to the vnode. This may be the
* case if the caller holds the only reference. This is also useful when stale
* data is acceptable as race conditions may be accounted for by some other
* means.
*/
int
vrefcnt(struct vnode *vp)
{
return (vp->v_usecount);
}
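/*
 * Usage sketch: the value is only meaningful when the caller can rule
 * out concurrent references, for example when deciding whether it is
 * the sole user of a vnode it already references:
 *
 *	if (vrefcnt(vp) > 1)
 *		error = EBUSY;
 */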
#define VPUTX_VRELE 1
#define VPUTX_VPUT 2
#define VPUTX_VUNREF 3
/*
* Decrement the use and hold counts for a vnode.
*
* See an explanation near vget() as to why atomic operation is safe.
*/
static void
vputx(struct vnode *vp, int func)
{
int error;
KASSERT(vp != NULL, ("vputx: null vp"));
if (func == VPUTX_VUNREF)
ASSERT_VOP_LOCKED(vp, "vunref");
else if (func == VPUTX_VPUT)
ASSERT_VOP_LOCKED(vp, "vput");
else
KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (vp->v_type != VCHR &&
vfs_refcount_release_if_not_last(&vp->v_usecount)) {
if (func == VPUTX_VPUT)
VOP_UNLOCK(vp, 0);
vdrop(vp);
return;
}
VI_LOCK(vp);
/*
* We want to hold the vnode until the inactive finishes to
* prevent vgone() races. We drop the use count here and the
* hold count below when we're done.
*/
if (!refcount_release(&vp->v_usecount) ||
(vp->v_iflag & VI_DOINGINACT)) {
if (func == VPUTX_VPUT)
VOP_UNLOCK(vp, 0);
v_decr_devcount(vp);
vdropl(vp);
return;
}
v_decr_devcount(vp);
error = 0;
if (vp->v_usecount != 0) {
vn_printf(vp, "vputx: usecount not zero for vnode ");
panic("vputx: usecount not zero");
}
CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
/*
* We must call VOP_INACTIVE with the node locked. Mark
* as VI_DOINGINACT to avoid recursion.
*/
vp->v_iflag |= VI_OWEINACT;
switch (func) {
case VPUTX_VRELE:
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
VI_LOCK(vp);
break;
case VPUTX_VPUT:
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
LK_NOWAIT);
VI_LOCK(vp);
}
break;
case VPUTX_VUNREF:
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
VI_LOCK(vp);
}
break;
}
VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
if (error == 0) {
if (vp->v_iflag & VI_OWEINACT)
vinactive(vp, curthread);
if (func != VPUTX_VUNREF)
VOP_UNLOCK(vp, 0);
}
vdropl(vp);
}
/*
* Vnode put/release.
* If count drops to zero, call inactive routine and return to freelist.
*/
void
vrele(struct vnode *vp)
{
vputx(vp, VPUTX_VRELE);
}
/*
* Release an already locked vnode. This gives the same effects as
* unlock+vrele(), but takes less time and avoids releasing and
* re-acquiring the lock (as vrele() acquires the lock internally).
*/
void
vput(struct vnode *vp)
{
vputx(vp, VPUTX_VPUT);
}
/*
* Release an exclusively locked vnode. Do not unlock the vnode lock.
*/
void
vunref(struct vnode *vp)
{
vputx(vp, VPUTX_VUNREF);
}
/*
* Increase the hold count and activate if this is the first reference.
*/
void
_vhold(struct vnode *vp, bool locked)
{
struct mount *mp;
if (locked)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("_vhold: vnode with holdcnt is free"));
return;
}
if (!locked)
VI_LOCK(vp);
if ((vp->v_iflag & VI_FREE) == 0) {
refcount_acquire(&vp->v_holdcnt);
if (!locked)
VI_UNLOCK(vp);
return;
}
VNASSERT(vp->v_holdcnt == 0, vp,
("%s: wrong hold count", __func__));
VNASSERT(vp->v_op != NULL, vp,
("%s: vnode already reclaimed.", __func__));
/*
* Remove a vnode from the free list, mark it as in use,
* and put it on the active list.
*/
VNASSERT(vp->v_mount != NULL, vp,
("_vhold: vnode not on per mount vnode list"));
mp = vp->v_mount;
mtx_lock(&mp->mnt_listmtx);
if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize--;
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
} else {
mtx_lock(&vnode_free_list_mtx);
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
freevnodes--;
mtx_unlock(&vnode_free_list_mtx);
}
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Activating already active vnode"));
vp->v_iflag &= ~VI_FREE;
vp->v_iflag |= VI_ACTIVE;
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize++;
mtx_unlock(&mp->mnt_listmtx);
refcount_acquire(&vp->v_holdcnt);
if (!locked)
VI_UNLOCK(vp);
}
/*
* Drop the hold count of the vnode. If this is the last reference to
* the vnode we place it on the free list unless it has been vgone'd
* (marked VI_DOOMED) in which case we will free it.
*
* Because the vnode vm object keeps a hold reference on the vnode if
* there is at least one resident non-cached page, the vnode cannot
* leave the active list without the page cleanup done.
*/
void
_vdrop(struct vnode *vp, bool locked)
{
struct bufobj *bo;
struct mount *mp;
int active;
if (locked)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if ((int)vp->v_holdcnt <= 0)
panic("vdrop: holdcnt %d", vp->v_holdcnt);
if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
if (locked)
VI_UNLOCK(vp);
return;
}
if (!locked)
VI_LOCK(vp);
if (refcount_release(&vp->v_holdcnt) == 0) {
VI_UNLOCK(vp);
return;
}
if ((vp->v_iflag & VI_DOOMED) == 0) {
/*
* Mark a vnode as free: remove it from its active list
* and put it up for recycling on the freelist.
*/
VNASSERT(vp->v_op != NULL, vp,
("vdropl: vnode already reclaimed."));
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("vnode already free"));
VNASSERT(vp->v_holdcnt == 0, vp,
("vdropl: freeing when we shouldn't"));
active = vp->v_iflag & VI_ACTIVE;
if ((vp->v_iflag & VI_OWEINACT) == 0) {
vp->v_iflag &= ~VI_ACTIVE;
mp = vp->v_mount;
if (mp != NULL) {
mtx_lock(&mp->mnt_listmtx);
if (active) {
TAILQ_REMOVE(&mp->mnt_activevnodelist,
vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
}
TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist,
vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize++;
vp->v_iflag |= VI_FREE;
vp->v_mflag |= VMP_TMPMNTFREELIST;
VI_UNLOCK(vp);
if (mp->mnt_tmpfreevnodelistsize >=
mnt_free_list_batch)
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
} else {
VNASSERT(active == 0, vp,
("vdropl: active vnode not on per mount "
"vnode list"));
mtx_lock(&vnode_free_list_mtx);
TAILQ_INSERT_TAIL(&vnode_free_list, vp,
v_actfreelist);
freevnodes++;
vp->v_iflag |= VI_FREE;
VI_UNLOCK(vp);
mtx_unlock(&vnode_free_list_mtx);
}
} else {
VI_UNLOCK(vp);
counter_u64_add(free_owe_inact, 1);
}
return;
}
/*
* The vnode has been marked for destruction, so free it.
*
* The vnode will be returned to the zone where it will
* normally remain until it is needed for another vnode. We
* need to cleanup (or verify that the cleanup has already
* been done) any residual data left from its current use
* so as not to contaminate the freshly allocated vnode.
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
atomic_subtract_long(&numvnodes, 1);
bo = &vp->v_bufobj;
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("cleaned vnode still on the free list."));
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
("clean blk trie not empty"));
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
("dirty blk trie not empty"));
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
("Dangling rangelock waiters"));
VI_UNLOCK(vp);
#ifdef MAC
mac_vnode_destroy(vp);
#endif
if (vp->v_pollinfo != NULL) {
destroy_vpollinfo(vp->v_pollinfo);
vp->v_pollinfo = NULL;
}
#ifdef INVARIANTS
/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
vp->v_op = NULL;
#endif
vp->v_mountedhere = NULL;
vp->v_unpcb = NULL;
vp->v_rdev = NULL;
vp->v_fifoinfo = NULL;
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
vp->v_iflag = 0;
vp->v_vflag = 0;
bo->bo_flag = 0;
uma_zfree(vnode_zone, vp);
}
/*
* Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
* flags. DOINGINACT prevents us from recursing in calls to vinactive.
* OWEINACT tracks whether a vnode missed a call to inactive due to a
* failed lock upgrade.
*/
void
vinactive(struct vnode *vp, struct thread *td)
{
struct vm_object *obj;
ASSERT_VOP_ELOCKED(vp, "vinactive");
ASSERT_VI_LOCKED(vp, "vinactive");
VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
("vinactive: recursed on VI_DOINGINACT"));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
vp->v_iflag |= VI_DOINGINACT;
vp->v_iflag &= ~VI_OWEINACT;
VI_UNLOCK(vp);
/*
* Before moving off the active list, we must be sure that any
* modified pages are converted into the vnode's dirty
* buffers, since these will no longer be checked once the
* vnode is on the inactive list.
*
* The write-out of the dirty pages is asynchronous. At the
* point that VOP_INACTIVE() is called, there could still be
* pending I/O and dirty pages in the object.
*/
if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
(obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0, 0);
VM_OBJECT_WUNLOCK(obj);
}
VOP_INACTIVE(vp, td);
VI_LOCK(vp);
VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
("vinactive: lost VI_DOINGINACT"));
vp->v_iflag &= ~VI_DOINGINACT;
}
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active ones,
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*
* `rootrefs' specifies the base reference count for the root vnode
* of this filesystem. The root vnode is considered busy if its
* v_usecount exceeds this value. On a successful return, vflush()
* will call vrele() on the root vnode exactly rootrefs times.
* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
* be zero.
*/
#ifdef DIAGNOSTIC
static int busyprt = 0; /* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
#endif
int
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
{
struct vnode *vp, *mvp, *rootvp = NULL;
struct vattr vattr;
int busy = 0, error;
CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
rootrefs, flags);
if (rootrefs > 0) {
KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
("vflush: bad args"));
/*
* Get the filesystem root vnode. We can vput() it
* immediately, since with rootrefs > 0, it won't go away.
*/
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
__func__, error);
return (error);
}
vput(rootvp);
}
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
vholdl(vp);
error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
if (error) {
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
/*
* Skip over vnodes marked VV_SYSTEM.
*/
if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
VOP_UNLOCK(vp, 0);
vdrop(vp);
continue;
}
/*
* If WRITECLOSE is set, flush out unlinked but still open
* files (even if open only for reading) and regular file
* vnodes open for writing.
*/
if (flags & WRITECLOSE) {
if (vp->v_object != NULL) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
if (error != 0) {
VOP_UNLOCK(vp, 0);
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
VI_LOCK(vp);
if ((vp->v_type == VNON ||
(error == 0 && vattr.va_nlink > 0)) &&
(vp->v_writecount == 0 || vp->v_type != VREG)) {
VOP_UNLOCK(vp, 0);
vdropl(vp);
continue;
}
} else
VI_LOCK(vp);
/*
* With v_usecount == 0, all we need to do is clear out the
* vnode data structures and we are done.
*
* If FORCECLOSE is set, forcibly close the vnode.
*/
if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
vgonel(vp);
} else {
busy++;
#ifdef DIAGNOSTIC
if (busyprt)
vn_printf(vp, "vflush: busy vnode ");
#endif
}
VOP_UNLOCK(vp, 0);
vdropl(vp);
}
if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
/*
* If just the root vnode is busy, and if its refcount
* is equal to `rootrefs', then go ahead and kill it.
*/
VI_LOCK(rootvp);
KASSERT(busy > 0, ("vflush: not busy"));
VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
("vflush: usecount %d < rootrefs %d",
rootvp->v_usecount, rootrefs));
if (busy == 1 && rootvp->v_usecount == rootrefs) {
VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
vgone(rootvp);
VOP_UNLOCK(rootvp, 0);
busy = 0;
} else
VI_UNLOCK(rootvp);
}
if (busy) {
CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
busy);
return (EBUSY);
}
for (; rootrefs > 0; rootrefs--)
vrele(rootvp);
return (0);
}
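/*
 * Usage sketch (hypothetical, simplified): a filesystem's unmount path
 * typically flushes its vnodes with a call along these lines, passing
 * FORCECLOSE only for forced unmounts.
 */
#if 0
static int
example_flushfiles(struct mount *mp, int mntflags, struct thread *td)
{

	return (vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td));
}
#endif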
/*
* Recycle an unused vnode to the front of the free list.
*/
int
vrecycle(struct vnode *vp)
{
int recycled;
VI_LOCK(vp);
recycled = vrecyclel(vp);
VI_UNLOCK(vp);
return (recycled);
}
/*
* vrecycle, with the vp interlock held.
*/
int
vrecyclel(struct vnode *vp)
{
int recycled;
ASSERT_VOP_ELOCKED(vp, __func__);
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
recycled = 0;
if (vp->v_usecount == 0) {
recycled = 1;
vgonel(vp);
}
return (recycled);
}
/*
* Eliminate all activity associated with a vnode
* in preparation for reuse.
*/
void
vgone(struct vnode *vp)
{
VI_LOCK(vp);
vgonel(vp);
VI_UNLOCK(vp);
}
static void
notify_lowervp_vfs_dummy(struct mount *mp __unused,
struct vnode *lowervp __unused)
{
}
/*
* Notify upper mounts about reclaimed or unlinked vnode.
*/
void
vfs_notify_upper(struct vnode *vp, int event)
{
static struct vfsops vgonel_vfsops = {
.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
};
struct mount *mp, *ump, *mmp;
mp = vp->v_mount;
if (mp == NULL)
return;
MNT_ILOCK(mp);
if (TAILQ_EMPTY(&mp->mnt_uppers))
goto unlock;
MNT_IUNLOCK(mp);
mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
mmp->mnt_op = &vgonel_vfsops;
mmp->mnt_kern_flag |= MNTK_MARKER;
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
ump = TAILQ_NEXT(ump, mnt_upper_link);
continue;
}
TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
MNT_IUNLOCK(mp);
switch (event) {
case VFS_NOTIFY_UPPER_RECLAIM:
VFS_RECLAIM_LOWERVP(ump, vp);
break;
case VFS_NOTIFY_UPPER_UNLINK:
VFS_UNLINK_LOWERVP(ump, vp);
break;
default:
KASSERT(0, ("invalid event %d", event));
break;
}
MNT_ILOCK(mp);
ump = TAILQ_NEXT(mmp, mnt_upper_link);
TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
}
free(mmp, M_TEMP);
mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
wakeup(&mp->mnt_uppers);
}
unlock:
MNT_IUNLOCK(mp);
}
/*
* vgone, with the vp interlock held.
*/
static void
vgonel(struct vnode *vp)
{
struct thread *td;
int oweinact;
int active;
struct mount *mp;
ASSERT_VOP_ELOCKED(vp, "vgonel");
ASSERT_VI_LOCKED(vp, "vgonel");
VNASSERT(vp->v_holdcnt, vp,
("vgonel: vp %p has no reference.", vp));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
td = curthread;
/*
* Don't vgonel if we're already doomed.
*/
if (vp->v_iflag & VI_DOOMED)
return;
vp->v_iflag |= VI_DOOMED;
/*
* Check to see if the vnode is in use. If so, we have to call
* VOP_CLOSE() and VOP_INACTIVE().
*/
active = vp->v_usecount;
oweinact = (vp->v_iflag & VI_OWEINACT);
VI_UNLOCK(vp);
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
/*
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed.
*/
if (active)
VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
if (oweinact || active) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOINGINACT) == 0)
vinactive(vp, td);
VI_UNLOCK(vp);
}
if (vp->v_type == VSOCK)
vfs_unp_reclaim(vp);
/*
* Clean out any buffers associated with the vnode.
* If the flush fails, just toss the buffers.
*/
mp = NULL;
if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
(void) vn_start_secondary_write(vp, &mp, V_WAIT);
if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
while (vinvalbuf(vp, 0, 0, 0) != 0)
;
}
BO_LOCK(&vp->v_bufobj);
KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
vp->v_bufobj.bo_clean.bv_cnt == 0,
("vp %p bufobj not invalidated", vp));
/*
* For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
* after the object's page queue is flushed.
*/
if (vp->v_bufobj.bo_object == NULL)
vp->v_bufobj.bo_flag |= BO_DEAD;
BO_UNLOCK(&vp->v_bufobj);
/*
* Reclaim the vnode.
*/
if (VOP_RECLAIM(vp, td))
panic("vgone: cannot reclaim");
if (mp != NULL)
vn_finished_secondary_write(mp);
VNASSERT(vp->v_object == NULL, vp,
("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
/*
* Clear the advisory locks and wake up waiting threads.
*/
(void)VOP_ADVLOCKPURGE(vp);
vp->v_lockf = NULL;
/*
* Delete from old mount point vnode list.
*/
delmntque(vp);
cache_purge(vp);
/*
* Done with purge, reset to the standard lock and invalidate
* the vnode.
*/
VI_LOCK(vp);
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_tag = "none";
vp->v_type = VBAD;
}
/*
* Calculate the total number of references to a special device.
*/
int
vcount(struct vnode *vp)
{
int count;
dev_lock();
count = vp->v_rdev->si_usecount;
dev_unlock();
return (count);
}
/*
* Same as above, but using the struct cdev * as the argument.
*/
int
count_dev(struct cdev *dev)
{
int count;
dev_lock();
count = dev->si_usecount;
dev_unlock();
return (count);
}
/*
* Print out a description of a vnode.
*/
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
"VMARKER"};
void
vn_printf(struct vnode *vp, const char *fmt, ...)
{
va_list ap;
char buf[256], buf2[16];
u_long flags;
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("%p: ", (void *)vp);
printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
buf[0] = '\0';
buf[1] = '\0';
if (vp->v_vflag & VV_ROOT)
strlcat(buf, "|VV_ROOT", sizeof(buf));
if (vp->v_vflag & VV_ISTTY)
strlcat(buf, "|VV_ISTTY", sizeof(buf));
if (vp->v_vflag & VV_NOSYNC)
strlcat(buf, "|VV_NOSYNC", sizeof(buf));
if (vp->v_vflag & VV_ETERNALDEV)
strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
if (vp->v_vflag & VV_CACHEDLABEL)
strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
if (vp->v_vflag & VV_TEXT)
strlcat(buf, "|VV_TEXT", sizeof(buf));
if (vp->v_vflag & VV_COPYONWRITE)
strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
if (vp->v_vflag & VV_SYSTEM)
strlcat(buf, "|VV_SYSTEM", sizeof(buf));
if (vp->v_vflag & VV_PROCDEP)
strlcat(buf, "|VV_PROCDEP", sizeof(buf));
if (vp->v_vflag & VV_NOKNOTE)
strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
if (vp->v_vflag & VV_DELETED)
strlcat(buf, "|VV_DELETED", sizeof(buf));
if (vp->v_vflag & VV_MD)
strlcat(buf, "|VV_MD", sizeof(buf));
if (vp->v_vflag & VV_FORCEINSMQ)
strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
if (vp->v_iflag & VI_MOUNT)
strlcat(buf, "|VI_MOUNT", sizeof(buf));
if (vp->v_iflag & VI_DOOMED)
strlcat(buf, "|VI_DOOMED", sizeof(buf));
if (vp->v_iflag & VI_FREE)
strlcat(buf, "|VI_FREE", sizeof(buf));
if (vp->v_iflag & VI_ACTIVE)
strlcat(buf, "|VI_ACTIVE", sizeof(buf));
if (vp->v_iflag & VI_DOINGINACT)
strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
if (vp->v_iflag & VI_OWEINACT)
strlcat(buf, "|VI_OWEINACT", sizeof(buf));
flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
printf(" flags (%s)\n", buf + 1);
if (mtx_owned(VI_MTX(vp)))
printf(" VI_LOCKed");
if (vp->v_object != NULL)
printf(" v_object %p ref %d pages %d "
"cleanbuf %d dirtybuf %d\n",
vp->v_object, vp->v_object->ref_count,
vp->v_object->resident_page_count,
vp->v_bufobj.bo_clean.bv_cnt,
vp->v_bufobj.bo_dirty.bv_cnt);
printf(" ");
lockmgr_printinfo(vp->v_vnlock);
if (vp->v_data != NULL)
VOP_PRINT(vp);
}
#ifdef DDB
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
*/
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
struct mount *mp;
struct vnode *vp;
/*
* Note: because this is DDB, we can't obey the locking semantics
* for these structures, which means we could catch an inconsistent
* state and dereference a nasty pointer. Not much to be done
* about that.
*/
db_printf("Locked vnodes\n");
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
vn_printf(vp, "vnode ");
}
}
}
/*
* Show details about the given vnode.
*/
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
struct vnode *vp;
if (!have_addr)
return;
vp = (struct vnode *)addr;
vn_printf(vp, "vnode ");
}
/*
* Show details about the given mount point.
*/
DB_SHOW_COMMAND(mount, db_show_mount)
{
struct mount *mp;
struct vfsopt *opt;
struct statfs *sp;
struct vnode *vp;
char buf[512];
uint64_t mflags;
u_int flags;
if (!have_addr) {
/* No address given, print short info about all mount points. */
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
db_printf("%p %s on %s (%s)\n", mp,
mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
if (db_pager_quit)
break;
}
db_printf("\nMore info: show mount <addr>\n");
return;
}
mp = (struct mount *)addr;
db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
buf[0] = '\0';
mflags = mp->mnt_flag;
#define MNT_FLAG(flag) do { \
if (mflags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 4, sizeof(buf)); \
mflags &= ~(flag); \
} \
} while (0)
MNT_FLAG(MNT_RDONLY);
MNT_FLAG(MNT_SYNCHRONOUS);
MNT_FLAG(MNT_NOEXEC);
MNT_FLAG(MNT_NOSUID);
MNT_FLAG(MNT_NFS4ACLS);
MNT_FLAG(MNT_UNION);
MNT_FLAG(MNT_ASYNC);
MNT_FLAG(MNT_SUIDDIR);
MNT_FLAG(MNT_SOFTDEP);
MNT_FLAG(MNT_NOSYMFOLLOW);
MNT_FLAG(MNT_GJOURNAL);
MNT_FLAG(MNT_MULTILABEL);
MNT_FLAG(MNT_ACLS);
MNT_FLAG(MNT_NOATIME);
MNT_FLAG(MNT_NOCLUSTERR);
MNT_FLAG(MNT_NOCLUSTERW);
MNT_FLAG(MNT_SUJ);
MNT_FLAG(MNT_EXRDONLY);
MNT_FLAG(MNT_EXPORTED);
MNT_FLAG(MNT_DEFEXPORTED);
MNT_FLAG(MNT_EXPORTANON);
MNT_FLAG(MNT_EXKERB);
MNT_FLAG(MNT_EXPUBLIC);
MNT_FLAG(MNT_LOCAL);
MNT_FLAG(MNT_QUOTA);
MNT_FLAG(MNT_ROOTFS);
MNT_FLAG(MNT_USER);
MNT_FLAG(MNT_IGNORE);
MNT_FLAG(MNT_UPDATE);
MNT_FLAG(MNT_DELEXPORT);
MNT_FLAG(MNT_RELOAD);
MNT_FLAG(MNT_FORCE);
MNT_FLAG(MNT_SNAPSHOT);
MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
if (mflags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%016jx", mflags);
}
db_printf(" mnt_flag = %s\n", buf);
buf[0] = '\0';
flags = mp->mnt_kern_flag;
#define MNT_KERN_FLAG(flag) do { \
if (flags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 5, sizeof(buf)); \
flags &= ~(flag); \
} \
} while (0)
MNT_KERN_FLAG(MNTK_UNMOUNTF);
MNT_KERN_FLAG(MNTK_ASYNC);
MNT_KERN_FLAG(MNTK_SOFTDEP);
MNT_KERN_FLAG(MNTK_NOINSMNTQ);
MNT_KERN_FLAG(MNTK_DRAINING);
MNT_KERN_FLAG(MNTK_REFEXPIRE);
MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
MNT_KERN_FLAG(MNTK_SHARED_WRITES);
MNT_KERN_FLAG(MNTK_NO_IOPF);
MNT_KERN_FLAG(MNTK_VGONE_UPPER);
MNT_KERN_FLAG(MNTK_VGONE_WAITER);
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
MNT_KERN_FLAG(MNTK_SUSPEND);
MNT_KERN_FLAG(MNTK_SUSPEND2);
MNT_KERN_FLAG(MNTK_SUSPENDED);
MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
MNT_KERN_FLAG(MNTK_NOKNOTE);
#undef MNT_KERN_FLAG
if (flags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%08x", flags);
}
db_printf(" mnt_kern_flag = %s\n", buf);
db_printf(" mnt_opt = ");
opt = TAILQ_FIRST(mp->mnt_opt);
if (opt != NULL) {
db_printf("%s", opt->name);
opt = TAILQ_NEXT(opt, link);
while (opt != NULL) {
db_printf(", %s", opt->name);
opt = TAILQ_NEXT(opt, link);
}
}
db_printf("\n");
sp = &mp->mnt_stat;
db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
"bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
"ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
"asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
(u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
(uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
(uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
(intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
(intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
(uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
(uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
(u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
db_printf(" mnt_cred = { uid=%u ruid=%u",
(u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
if (jailed(mp->mnt_cred))
db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
db_printf(" }\n");
db_printf(" mnt_ref = %d\n", mp->mnt_ref);
db_printf(" mnt_gen = %d\n", mp->mnt_gen);
db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
db_printf(" mnt_activevnodelistsize = %d\n",
mp->mnt_activevnodelistsize);
db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
db_printf(" mnt_secondary_accwrites = %d\n",
mp->mnt_secondary_accwrites);
db_printf(" mnt_gjprovider = %s\n",
mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
db_printf("\n\nList of active vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
if (vp->v_type != VMARKER) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
db_printf("\n\nList of inactive vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
}
#endif /* DDB */
/*
* Fill in a struct xvfsconf based on a struct vfsconf.
*/
static int
vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
/*
* These are unused in userland; we keep them
* to avoid breaking binary compatibility.
*/
xvfsp.vfc_vfsops = NULL;
xvfsp.vfc_next = NULL;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#ifdef COMPAT_FREEBSD32
struct xvfsconf32 {
uint32_t vfc_vfsops;
char vfc_name[MFSNAMELEN];
int32_t vfc_typenum;
int32_t vfc_refcount;
int32_t vfc_flags;
uint32_t vfc_next;
};
static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf32 xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif
/*
* Top level filesystem related information gathering.
*/
static int
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
{
struct vfsconf *vfsp;
int error;
error = 0;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
error = vfsconf2x32(req, vfsp);
else
#endif
error = vfsconf2x(req, vfsp);
if (error)
break;
}
vfsconf_sunlock();
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
"S,xvfsconf", "List of all configured filesystems");
#ifndef BURN_BRIDGES
static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1 - 1; /* XXX */
u_int namelen = arg2 + 1; /* XXX */
struct vfsconf *vfsp;
log(LOG_WARNING, "userland calling deprecated sysctl, "
"please rebuild world\n");
#if 1 || defined(COMPAT_PRELITE2)
/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
if (namelen == 1)
return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif
switch (name[1]) {
case VFS_MAXTYPENUM:
if (namelen != 2)
return (ENOTDIR);
return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
case VFS_CONF:
if (namelen != 3)
return (ENOTDIR); /* overloaded */
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (vfsp->vfc_typenum == name[2])
break;
}
vfsconf_sunlock();
if (vfsp == NULL)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
return (vfsconf2x32(req, vfsp));
else
#endif
return (vfsconf2x(req, vfsp));
}
return (EOPNOTSUPP);
}
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
CTLFLAG_MPSAFE, vfs_sysctl,
"Generic filesystem");
#if 1 || defined(COMPAT_PRELITE2)
static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
int error;
struct vfsconf *vfsp;
struct ovfsconf ovfs;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
bzero(&ovfs, sizeof(ovfs));
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
strcpy(ovfs.vfc_name, vfsp->vfc_name);
ovfs.vfc_index = vfsp->vfc_typenum;
ovfs.vfc_refcount = vfsp->vfc_refcount;
ovfs.vfc_flags = vfsp->vfc_flags;
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
if (error != 0) {
vfsconf_sunlock();
return (error);
}
}
vfsconf_sunlock();
return (0);
}
#endif /* 1 || COMPAT_PRELITE2 */
#endif /* !BURN_BRIDGES */
#define KINFO_VNODESLOP 10
#ifdef notyet
/*
* Dump vnode list (via sysctl).
*/
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
struct xvnode *xvn;
struct mount *mp;
struct vnode *vp;
int error, len, n;
/*
* Stale numvnodes access is not fatal here.
*/
req->lock = 0;
len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
if (!req->oldptr)
/* Make an estimate */
return (SYSCTL_OUT(req, 0, len));
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
n = 0;
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
continue;
MNT_ILOCK(mp);
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (n == len)
break;
vref(vp);
xvn[n].xv_size = sizeof *xvn;
xvn[n].xv_vnode = vp;
xvn[n].xv_id = 0; /* XXX compat */
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
XV_COPY(usecount);
XV_COPY(writecount);
XV_COPY(holdcnt);
XV_COPY(mount);
XV_COPY(numoutput);
XV_COPY(type);
#undef XV_COPY
xvn[n].xv_flag = vp->v_vflag;
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
break;
case VBLK:
case VCHR:
if (vp->v_rdev == NULL) {
vrele(vp);
continue;
}
xvn[n].xv_dev = dev2udev(vp->v_rdev);
break;
case VSOCK:
xvn[n].xv_socket = vp->v_socket;
break;
case VFIFO:
xvn[n].xv_fifo = vp->v_fifoinfo;
break;
case VNON:
case VBAD:
default:
/* shouldn't happen? */
vrele(vp);
continue;
}
vrele(vp);
++n;
}
MNT_IUNLOCK(mp);
mtx_lock(&mountlist_mtx);
vfs_unbusy(mp);
if (n == len)
break;
}
mtx_unlock(&mountlist_mtx);
error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
free(xvn, M_TEMP);
return (error);
}
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
"");
#endif
static void
unmount_or_warn(struct mount *mp)
{
int error;
error = dounmount(mp, MNT_FORCE, curthread);
if (error != 0) {
printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
if (error == EBUSY)
printf("BUSY)\n");
else
printf("%d)\n", error);
}
}
/*
* Unmount all filesystems. The list is traversed in reverse order
* of mounting to avoid dependencies.
*/
void
vfs_unmountall(void)
{
struct mount *mp, *tmp;
CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
/*
* Since this only runs when rebooting, it is not interlocked.
*/
TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
vfs_ref(mp);
/*
* Forcibly unmounting "/dev" before "/" would prevent clean
* unmount of the latter.
*/
if (mp == rootdevmp)
continue;
unmount_or_warn(mp);
}
if (rootdevmp != NULL)
unmount_or_warn(rootdevmp);
}
/*
* Perform msync on all vnodes under a mount point.
* The mount point must be locked.
*/
void
vfs_msync(struct mount *mp, int flags)
{
struct vnode *vp, *mvp;
struct vm_object *obj;
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
vnlru_return_batch(mp);
MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
obj = vp->v_object;
if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
(flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
if (!vget(vp,
LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
curthread)) {
if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
vput(vp);
continue;
}
obj = vp->v_object;
if (obj != NULL) {
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0,
flags == MNT_WAIT ?
OBJPC_SYNC : OBJPC_NOSYNC);
VM_OBJECT_WUNLOCK(obj);
}
vput(vp);
}
} else
VI_UNLOCK(vp);
}
}
static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{
knlist_destroy(&vi->vpi_selinfo.si_note);
mtx_destroy(&vi->vpi_lock);
uma_zfree(vnodepoll_zone, vi);
}
static void
destroy_vpollinfo(struct vpollinfo *vi)
{
knlist_clear(&vi->vpi_selinfo.si_note, 1);
seldrain(&vi->vpi_selinfo);
destroy_vpollinfo_free(vi);
}
/*
* Initialize per-vnode helper structure to hold poll-related state.
*/
void
v_addpollinfo(struct vnode *vp)
{
struct vpollinfo *vi;
if (vp->v_pollinfo != NULL)
return;
vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
destroy_vpollinfo_free(vi);
return;
}
vp->v_pollinfo = vi;
VI_UNLOCK(vp);
}
/*
* Record a process's interest in events which might happen to
* a vnode. Because poll uses the historic select-style interface
* internally, this routine serves as both the ``check for any
* pending events'' and the ``record my interest in future events''
* functions. (These are done together, while the lock is held,
* to avoid race conditions.)
*/
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
v_addpollinfo(vp);
mtx_lock(&vp->v_pollinfo->vpi_lock);
if (vp->v_pollinfo->vpi_revents & events) {
/*
* This leaves events we are not interested
* in available for the other process which
* presumably had requested them
* (otherwise they would never have been
* recorded).
*/
events &= vp->v_pollinfo->vpi_revents;
vp->v_pollinfo->vpi_revents &= ~events;
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (events);
}
vp->v_pollinfo->vpi_events |= events;
selrecord(td, &vp->v_pollinfo->vpi_selinfo);
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (0);
}
/*
* Routine to create and manage a filesystem syncer vnode.
*/
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);
static struct vop_vector sync_vnodeops = {
.vop_bypass = VOP_EOPNOTSUPP,
.vop_close = sync_close, /* close */
.vop_fsync = sync_fsync, /* fsync */
.vop_inactive = sync_inactive, /* inactive */
.vop_reclaim = sync_reclaim, /* reclaim */
.vop_lock1 = vop_stdlock, /* lock */
.vop_unlock = vop_stdunlock, /* unlock */
.vop_islocked = vop_stdislocked, /* islocked */
};
/*
* Create a new filesystem syncer vnode for the specified mount point.
*/
void
vfs_allocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
struct bufobj *bo;
static long start, incr, next;
int error;
/* Allocate a new vnode */
error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
if (error != 0)
panic("vfs_allocate_syncvnode: getnewvnode() failed");
vp->v_type = VNON;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_FORCEINSMQ;
error = insmntque(vp, mp);
if (error != 0)
panic("vfs_allocate_syncvnode: insmntque() failed");
vp->v_vflag &= ~VV_FORCEINSMQ;
VOP_UNLOCK(vp, 0);
/*
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
* at evenly distributed times even if all the filesystems
* are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
mtx_lock(&sync_mtx);
sync_vnode_count++;
if (mp->mnt_syncer == NULL) {
mp->mnt_syncer = vp;
vp = NULL;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
if (vp != NULL) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vgone(vp);
vput(vp);
}
}
void
vfs_deallocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
mtx_lock(&sync_mtx);
vp = mp->mnt_syncer;
if (vp != NULL)
mp->mnt_syncer = NULL;
mtx_unlock(&sync_mtx);
if (vp != NULL)
vrele(vp);
}
/*
* Do a lazy sync of the filesystem.
*/
static int
sync_fsync(struct vop_fsync_args *ap)
{
struct vnode *syncvp = ap->a_vp;
struct mount *mp = syncvp->v_mount;
int error, save;
struct bufobj *bo;
/*
* We only need to do something if this is a lazy evaluation.
*/
if (ap->a_waitfor != MNT_LAZY)
return (0);
/*
* Move ourselves to the back of the sync list.
*/
bo = &syncvp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay);
BO_UNLOCK(bo);
/*
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
*/
if (vfs_busy(mp, MBF_NOWAIT) != 0)
return (0);
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
vfs_unbusy(mp);
return (0);
}
save = curthread_pflags_set(TDP_SYNCIO);
vfs_msync(mp, MNT_NOWAIT);
error = VFS_SYNC(mp, MNT_LAZY);
curthread_pflags_restore(save);
vn_finished_write(mp);
vfs_unbusy(mp);
return (error);
}
/*
* The syncer vnode is no longer referenced.
*/
static int
sync_inactive(struct vop_inactive_args *ap)
{
vgone(ap->a_vp);
return (0);
}
/*
* The syncer vnode is no longer needed and is being decommissioned.
*
* Modifications to the worklist must be protected by sync_mtx.
*/
static int
sync_reclaim(struct vop_reclaim_args *ap)
{
struct vnode *vp = ap->a_vp;
struct bufobj *bo;
bo = &vp->v_bufobj;
BO_LOCK(bo);
mtx_lock(&sync_mtx);
if (vp->v_mount->mnt_syncer == vp)
vp->v_mount->mnt_syncer = NULL;
if (bo->bo_flag & BO_ONWORKLST) {
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
sync_vnode_count--;
bo->bo_flag &= ~BO_ONWORKLST;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
return (0);
}
/*
* Check if vnode represents a disk device
*/
int
vn_isdisk(struct vnode *vp, int *errp)
{
int error;
if (vp->v_type != VCHR) {
error = ENOTBLK;
goto out;
}
error = 0;
dev_lock();
if (vp->v_rdev == NULL)
error = ENXIO;
else if (vp->v_rdev->si_devsw == NULL)
error = ENXIO;
else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
error = ENOTBLK;
dev_unlock();
out:
if (errp != NULL)
*errp = error;
return (error == 0);
}
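/*
 * Usage sketch: callers typically reject non-disk vnodes up front and
 * propagate the error that vn_isdisk() fills in:
 *
 *	if (!vn_isdisk(vp, &error))
 *		return (error);
 */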
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
* and optional call-by-reference privused argument allowing vaccess()
* to indicate to the caller whether privilege was used to satisfy the
* request (obsoleted). Returns 0 on success, or an errno on failure.
*/
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
accmode_t accmode, struct ucred *cred, int *privused)
{
accmode_t dac_granted;
accmode_t priv_granted;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
("invalid bit in accmode"));
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
("VAPPEND without VWRITE"));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
if (privused != NULL)
*privused = 0;
dac_granted = 0;
/* Check the owner. */
if (cred->cr_uid == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check the groups (first match) */
if (groupmember(file_gid, cred)) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
privcheck:
/*
* Build a privilege mask to determine if the set of privileges
* satisfies the requirements when combined with the granted mask
* from above. For each privilege, if the privilege is required,
* bitwise or the request type onto the priv_granted mask.
*/
priv_granted = 0;
if (type == VDIR) {
/*
* For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
* requests, instead of PRIV_VFS_EXEC.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
!priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
priv_granted |= VEXEC;
} else {
/*
* Ensure that at least one execute bit is on. Otherwise,
* a privileged user will always succeed, and we don't want
* this to happen unless the file really is executable.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
!priv_check_cred(cred, PRIV_VFS_EXEC, 0))
priv_granted |= VEXEC;
}
if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
!priv_check_cred(cred, PRIV_VFS_READ, 0))
priv_granted |= VREAD;
if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
!priv_check_cred(cred, PRIV_VFS_WRITE, 0))
priv_granted |= (VWRITE | VAPPEND);
if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
!priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
priv_granted |= VADMIN;
if ((accmode & (priv_granted | dac_granted)) == accmode) {
/* XXX audit: privilege used */
if (privused != NULL)
*privused = 1;
return (0);
}
return ((accmode & VADMIN) ? EPERM : EACCES);
}
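/*
 * Illustrative sketch (not part of the original change): a typical
 * filesystem VOP_ACCESS method simply translates its own inode fields and
 * defers the DAC and privilege logic to vaccess().  The "myfs_node"
 * structure and its n_mode/n_uid/n_gid fields are hypothetical.
 */
#if 0
static int
myfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct myfs_node *np = vp->v_data;

	/* Delegate the owner/group/other and privilege checks. */
	return (vaccess(vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
#endif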
/*
* Credential check based on process requesting service, and per-attribute
* permissions.
*/
int
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
struct thread *td, accmode_t accmode)
{
/*
* Kernel-invoked always succeeds.
*/
if (cred == NOCRED)
return (0);
/*
* Do not allow privileged processes in jail to directly manipulate
* system attributes.
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
/* Potentially should be: return (EPERM); */
return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
case EXTATTR_NAMESPACE_USER:
return (VOP_ACCESS(vp, accmode, cred, td));
default:
return (EPERM);
}
}
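/*
 * Illustrative sketch (not part of the original change): extended
 * attribute VOPs typically gate on this helper before touching attribute
 * data, passing VREAD for retrieval and VWRITE for set/delete.  "error"
 * is assumed to be declared by the surrounding (hypothetical) VOP.
 */
#if 0
	/* Fragment of a hypothetical VOP_GETEXTATTR implementation. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
#endif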
#ifdef DEBUG_VFS_LOCKS
/*
* This only exists to suppress warnings from unlocked specfs accesses. It is
* no longer ok to have an unlocked VFS.
*/
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
(vp)->v_type == VCHR || (vp)->v_type == VBAD)
int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
"Drop into debugger on lock violation");
int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
0, "Check for interlock across VOPs");
int vfs_badlock_print = 1; /* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
0, "Print lock violations");
int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
0, "Print vnode details on lock violations");
#ifdef KDB
int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
&vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{
#ifdef KDB
if (vfs_badlock_backtrace)
kdb_backtrace();
#endif
if (vfs_badlock_vnode)
vn_printf(vp, "vnode ");
if (vfs_badlock_print)
printf("%s: %p %s\n", str, (void *)vp, msg);
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
void
assert_vi_locked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is not locked but should be", str, vp);
}
void
assert_vi_unlocked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is locked but should not be", str, vp);
}
void
assert_vop_locked(struct vnode *vp, const char *str)
{
int locked;
if (!IGNORE_LOCK(vp)) {
locked = VOP_ISLOCKED(vp);
if (locked == 0 || locked == LK_EXCLOTHER)
vfs_badlock("is not locked but should be", str, vp);
}
}
void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
vfs_badlock("is locked but should not be", str, vp);
}
void
assert_vop_elocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
vfs_badlock("is not exclusive locked but should be", str, vp);
}
#endif /* DEBUG_VFS_LOCKS */
void
vop_rename_fail(struct vop_rename_args *ap)
{
if (ap->a_tvp != NULL)
vput(ap->a_tvp);
if (ap->a_tdvp == ap->a_tvp)
vrele(ap->a_tdvp);
else
vput(ap->a_tdvp);
vrele(ap->a_fdvp);
vrele(ap->a_fvp);
}
void
vop_rename_pre(void *ap)
{
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
/* Check the source (from). */
if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
(a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
/* Check the target. */
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vhold(a->a_fvp);
vhold(a->a_tdvp);
if (a->a_tvp)
vhold(a->a_tvp);
}
#ifdef DEBUG_VFS_LOCKS
void
vop_strategy_pre(void *ap)
{
struct vop_strategy_args *a;
struct buf *bp;
a = ap;
bp = a->a_bp;
/*
* Cluster ops lock their component buffers but not the IO container.
*/
if ((bp->b_flags & B_CLUSTER) != 0)
return;
if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
if (vfs_badlock_print)
printf(
"VOP_STRATEGY: bp is not locked but should be\n");
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
}
void
vop_lock_pre(void *ap)
{
struct vop_lock1_args *a = ap;
if ((a->a_flags & LK_INTERLOCK) == 0)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
else
ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_lock_post(void *ap, int rc)
{
struct vop_lock1_args *a = ap;
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_unlock_pre(void *ap)
{
struct vop_unlock_args *a = ap;
if (a->a_flags & LK_INTERLOCK)
ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
}
void
vop_unlock_post(void *ap, int rc)
{
struct vop_unlock_args *a = ap;
if (a->a_flags & LK_INTERLOCK)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
}
#endif
void
vop_create_post(void *ap, int rc)
{
struct vop_create_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_deleteextattr_post(void *ap, int rc)
{
struct vop_deleteextattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_link_post(void *ap, int rc)
{
struct vop_link_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
}
}
void
vop_mkdir_post(void *ap, int rc)
{
struct vop_mkdir_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
}
void
vop_mknod_post(void *ap, int rc)
{
struct vop_mknod_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_reclaim_post(void *ap, int rc)
{
struct vop_reclaim_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
}
void
vop_remove_post(void *ap, int rc)
{
struct vop_remove_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
}
}
void
vop_rename_post(void *ap, int rc)
{
struct vop_rename_args *a = ap;
long hint;
if (!rc) {
hint = NOTE_WRITE;
if (a->a_fdvp == a->a_tdvp) {
if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
} else {
hint |= NOTE_EXTEND;
if (a->a_fvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
a->a_tvp->v_type == VDIR)
hint &= ~NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
}
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
}
if (a->a_tdvp != a->a_fdvp)
vdrop(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vdrop(a->a_fvp);
vdrop(a->a_tdvp);
if (a->a_tvp)
vdrop(a->a_tvp);
}
void
vop_rmdir_post(void *ap, int rc)
{
struct vop_rmdir_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
}
}
void
vop_setattr_post(void *ap, int rc)
{
struct vop_setattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_setextattr_post(void *ap, int rc)
{
struct vop_setextattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_symlink_post(void *ap, int rc)
{
struct vop_symlink_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_open_post(void *ap, int rc)
{
struct vop_open_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
}
void
vop_close_post(void *ap, int rc)
{
struct vop_close_args *a = ap;
if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
(a->a_vp->v_iflag & VI_DOOMED) == 0)) {
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
NOTE_CLOSE_WRITE : NOTE_CLOSE);
}
}
void
vop_read_post(void *ap, int rc)
{
struct vop_read_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
void
vop_readdir_post(void *ap, int rc)
{
struct vop_readdir_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
static struct knlist fs_knlist;
static void
vfs_event_init(void *arg)
{
knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{
KNOTE_UNLOCKED(&fs_knlist, event);
}
static int filt_fsattach(struct knote *kn);
static void filt_fsdetach(struct knote *kn);
static int filt_fsevent(struct knote *kn, long hint);
struct filterops fs_filtops = {
.f_isfd = 0,
.f_attach = filt_fsattach,
.f_detach = filt_fsdetach,
.f_event = filt_fsevent
};
static int
filt_fsattach(struct knote *kn)
{
kn->kn_flags |= EV_CLEAR;
knlist_add(&fs_knlist, kn, 0);
return (0);
}
static void
filt_fsdetach(struct knote *kn)
{
knlist_remove(&fs_knlist, kn, 0);
}
static int
filt_fsevent(struct knote *kn, long hint)
{
kn->kn_fflags |= hint;
return (kn->kn_fflags != 0);
}
static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
struct vfsidctl vc;
int error;
struct mount *mp;
error = SYSCTL_IN(req, &vc, sizeof(vc));
if (error)
return (error);
if (vc.vc_vers != VFS_CTL_VERS1)
return (EINVAL);
mp = vfs_getvfs(&vc.vc_fsid);
if (mp == NULL)
return (ENOENT);
/* ensure that a specific sysctl goes to the right filesystem. */
if (strcmp(vc.vc_fstypename, "*") != 0 &&
strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
vfs_rel(mp);
return (EINVAL);
}
VCTLTOREQ(&vc, req);
error = VFS_SYSCTL(mp, vc.vc_op, req);
vfs_rel(mp);
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
NULL, 0, sysctl_vfs_ctl, "",
"Sysctl by fsid");
/*
* Function to initialize a va_filerev field sensibly.
* XXX: Wouldn't a random number make a lot more sense ??
*/
u_quad_t
init_va_filerev(void)
{
struct bintime bt;
getbinuptime(&bt);
return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}
static int filt_vfsread(struct knote *kn, long hint);
static int filt_vfswrite(struct knote *kn, long hint);
static int filt_vfsvnode(struct knote *kn, long hint);
static void filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsvnode
};
static void
vfs_knllock(void *arg)
{
struct vnode *vp = arg;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
static void
vfs_knlunlock(void *arg)
{
struct vnode *vp = arg;
VOP_UNLOCK(vp, 0);
}
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
struct vnode *vp = arg;
ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
struct vnode *vp = arg;
ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
struct vnode *vp = ap->a_vp;
struct knote *kn = ap->a_kn;
struct knlist *knl;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &vfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &vfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &vfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = (caddr_t)vp;
v_addpollinfo(vp);
if (vp->v_pollinfo == NULL)
return (ENOMEM);
knl = &vp->v_pollinfo->vpi_selinfo.si_note;
vhold(vp);
knlist_add(knl, kn, 0);
return (0);
}
/*
* Detach knote from vnode
*/
static void
filt_vfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
vdrop(vp);
}
/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
struct vattr va;
int res;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
VI_LOCK(vp);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
VI_UNLOCK(vp);
return (1);
}
if (VOP_GETATTR(vp, &va, curthread->td_ucred))
return (0);
VI_LOCK(vp);
kn->kn_data = va.va_size - kn->kn_fp->f_offset;
res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
VI_UNLOCK(vp);
return (res);
}
/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
VI_LOCK(vp);
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = 0;
VI_UNLOCK(vp);
return (1);
}
static int
filt_vfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int res;
VI_LOCK(vp);
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
kn->kn_flags |= EV_EOF;
VI_UNLOCK(vp);
return (1);
}
res = (kn->kn_fflags != 0);
VI_UNLOCK(vp);
return (res);
}
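/*
 * Illustrative sketch (not part of the original change): the NOTE_* hints
 * posted by the vop_*_post() handlers above reach userland through
 * EVFILT_VNODE.  This is a minimal userland monitor, assuming "fd" is an
 * already-opened descriptor for the file of interest.
 */
#if 0
	struct kevent kev;
	int kq;

	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_ATTRIB | NOTE_DELETE | NOTE_RENAME, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	/* Later: kevent(kq, NULL, 0, &kev, 1, NULL) blocks for a change. */
#endif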
int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
int error;
if (dp->d_reclen > ap->a_uio->uio_resid)
return (ENAMETOOLONG);
error = uiomove(dp, dp->d_reclen, ap->a_uio);
if (error) {
if (ap->a_ncookies != NULL) {
if (ap->a_cookies != NULL)
free(ap->a_cookies, M_TEMP);
ap->a_cookies = NULL;
*ap->a_ncookies = 0;
}
return (error);
}
if (ap->a_ncookies == NULL)
return (0);
KASSERT(ap->a_cookies,
("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
*ap->a_cookies = realloc(*ap->a_cookies,
(*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
(*ap->a_cookies)[*ap->a_ncookies] = off;
*ap->a_ncookies += 1;
return (0);
}
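/*
 * Illustrative sketch (not part of the original change): a VOP_READDIR
 * implementation typically fills one struct dirent per entry and lets
 * vfs_read_dirent() copy it out and record the seek cookie.  The "name",
 * "fileno" and "next_off" variables are hypothetical.
 */
#if 0
	struct dirent dp;

	dp.d_fileno = fileno;
	dp.d_type = DT_REG;
	dp.d_namlen = strlen(name);
	strlcpy(dp.d_name, name, sizeof(dp.d_name));
	dp.d_reclen = GENERIC_DIRSIZ(&dp);
	error = vfs_read_dirent(ap, &dp, next_off);
	if (error != 0)		/* uio full (ENAMETOOLONG) or copy error */
		return (error == ENAMETOOLONG ? 0 : error);
#endif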
/*
* Mark for update the access time of the file if the filesystem
* supports VOP_MARKATIME. This functionality is used by execve and
* mmap, so we want to avoid the I/O implied by directly setting
* va_atime for the sake of efficiency.
*/
void
vfs_mark_atime(struct vnode *vp, struct ucred *cred)
{
struct mount *mp;
mp = vp->v_mount;
ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
(void)VOP_MARKATIME(vp);
}
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
 * There is no way to specify an explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
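/*
 * Illustrative sketch (not part of the original change): a caller holding
 * NFSv4-style accmode bits coarsens them first and only then performs the
 * usual check; the request may legitimately collapse to nothing.  The
 * input value is just an example, and "vp" and "cred" are assumed to be
 * in scope.
 */
#if 0
	accmode_t accmode = VREAD | VREAD_ATTRIBUTES;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error == 0 && accmode != 0)
		error = VOP_ACCESS(vp, accmode, cred, curthread);
#endif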
/*
* These are helper functions for filesystems to traverse all
* their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
*
* This interface replaces MNT_VNODE_FOREACH.
*/
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
if (should_yield())
kern_yield(PRI_USER);
MNT_ILOCK(mp);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
continue;
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOOMED) != 0) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
__mnt_vnode_markerfree_all(mvp, mp);
/* MNT_IUNLOCK(mp); -- done in above function */
mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
return (NULL);
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
MNT_REF(mp);
(*mvp)->v_mount = mp;
(*mvp)->v_type = VMARKER;
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
continue;
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOOMED) != 0) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL) {
MNT_IUNLOCK(mp);
return;
}
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
}
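/*
 * Illustrative sketch (not part of the original change): filesystems do
 * not call the __mnt_vnode_*() functions directly but go through the
 * MNT_VNODE_FOREACH_ALL() macro, which hands back each vnode with its
 * interlock held.  "mp" is assumed to be the mount point in scope.
 */
#if 0
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
		    curthread) != 0)
			continue;
		/* ... per-vnode work ... */
		vput(vp);
	}
#endif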
/*
* These are helper functions for filesystems to traverse their
* active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
*/
static void
mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
}
/*
* Relock the mp mount vnode list lock with the vp vnode interlock in the
* conventional lock order during mnt_vnode_next_active iteration.
*
* On entry, the mount vnode list lock is held and the vnode interlock is not.
* The list lock is dropped and reacquired. On success, both locks are held.
* On failure, the mount vnode list lock is held but the vnode interlock is
* not, and the procedure may have yielded.
*/
static bool
mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
struct vnode *vp)
{
const struct vnode *tmp;
bool held, ret;
VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
("%s: bad marker", __func__));
VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
("%s: inappropriate vnode", __func__));
ASSERT_VI_UNLOCKED(vp, __func__);
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
ret = false;
TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
/*
* Use a hold to prevent vp from disappearing while the mount vnode
* list lock is dropped and reacquired. Normally a hold would be
* acquired with vhold(), but that might try to acquire the vnode
* interlock, which would be a LOR with the mount vnode list lock.
*/
held = vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt);
mtx_unlock(&mp->mnt_listmtx);
if (!held)
goto abort;
VI_LOCK(vp);
if (!vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
vdropl(vp);
goto abort;
}
mtx_lock(&mp->mnt_listmtx);
/*
* Determine whether the vnode is still the next one after the marker,
* excepting any other markers. If the vnode has not been doomed by
* vgone() then the hold should have ensured that it remained on the
* active list. If it has been doomed but is still on the active list,
* don't abort, but rather skip over it (avoid spinning on doomed
* vnodes).
*/
tmp = mvp;
do {
tmp = TAILQ_NEXT(tmp, v_actfreelist);
} while (tmp != NULL && tmp->v_type == VMARKER);
if (tmp != vp) {
mtx_unlock(&mp->mnt_listmtx);
VI_UNLOCK(vp);
goto abort;
}
ret = true;
goto out;
abort:
maybe_yield();
mtx_lock(&mp->mnt_listmtx);
out:
if (ret)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
return (ret);
}
static struct vnode *
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp, *nvp;
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
vp = TAILQ_NEXT(*mvp, v_actfreelist);
while (vp != NULL) {
if (vp->v_type == VMARKER) {
vp = TAILQ_NEXT(vp, v_actfreelist);
continue;
}
/*
* Try-lock because this is the wrong lock order. If that does
* not succeed, drop the mount vnode list lock and try to
* reacquire it and the vnode interlock in the right order.
*/
if (!VI_TRYLOCK(vp) &&
!mnt_vnode_next_active_relock(*mvp, mp, vp))
goto restart;
KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
("alien vnode on the active list %p %p", vp, mp));
if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
break;
nvp = TAILQ_NEXT(vp, v_actfreelist);
VI_UNLOCK(vp);
vp = nvp;
}
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
/* Check if we are done */
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
mtx_unlock(&mp->mnt_listmtx);
ASSERT_VI_LOCKED(vp, "active iter");
KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
return (vp);
}
struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
if (should_yield())
kern_yield(PRI_USER);
mtx_lock(&mp->mnt_listmtx);
return (mnt_vnode_next_active(mvp, mp));
}
struct vnode *
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
(*mvp)->v_type = VMARKER;
(*mvp)->v_mount = mp;
mtx_lock(&mp->mnt_listmtx);
vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
return (NULL);
}
TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
return (mnt_vnode_next_active(mvp, mp));
}
void
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL)
return;
mtx_lock(&mp->mnt_listmtx);
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
}
Index: head/sys/net/if_ethersubr.c
===================================================================
--- head/sys/net/if_ethersubr.c (revision 327172)
+++ head/sys/net/if_ethersubr.c (revision 327173)
@@ -1,1255 +1,1253 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_netgraph.h"
#include "opt_mbuf_profiling.h"
#include "opt_rss.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/uuid.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/if_llc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_vlan_var.h>
#include <net/if_llatbl.h>
#include <net/pfil.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <netpfil/pf/pf_mtag.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip_carp.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#include <netinet6/nd6.h>
#endif
#include <security/mac/mac_framework.h>
#ifdef CTASSERT
CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
#endif
VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */
/* netgraph node hooks for ng_ether(4) */
void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_attach_p)(struct ifnet *ifp);
void (*ng_ether_detach_p)(struct ifnet *ifp);
void (*vlan_input_p)(struct ifnet *, struct mbuf *);
/* if_bridge(4) support */
struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *);
int (*bridge_output_p)(struct ifnet *, struct mbuf *,
struct sockaddr *, struct rtentry *);
void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
/* if_lagg(4) support */
struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *);
static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
static int ether_resolvemulti(struct ifnet *, struct sockaddr **,
struct sockaddr *);
#ifdef VIMAGE
static void ether_reassign(struct ifnet *, struct vnet *, char *);
#endif
static int ether_requestencap(struct ifnet *, struct if_encap_req *);
#define senderr(e) do { error = (e); goto bad;} while (0)
static void
update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
{
int csum_flags = 0;
if (src->m_pkthdr.csum_flags & CSUM_IP)
csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
if (src->m_pkthdr.csum_flags & CSUM_SCTP)
csum_flags |= CSUM_SCTP_VALID;
dst->m_pkthdr.csum_flags |= csum_flags;
if (csum_flags & CSUM_DATA_VALID)
dst->m_pkthdr.csum_data = 0xffff;
}
/*
* Handle link-layer encapsulation requests.
*/
static int
ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
{
struct ether_header *eh;
struct arphdr *ah;
uint16_t etype;
const u_char *lladdr;
if (req->rtype != IFENCAP_LL)
return (EOPNOTSUPP);
if (req->bufsize < ETHER_HDR_LEN)
return (ENOMEM);
eh = (struct ether_header *)req->buf;
lladdr = req->lladdr;
req->lladdr_off = 0;
switch (req->family) {
case AF_INET:
etype = htons(ETHERTYPE_IP);
break;
case AF_INET6:
etype = htons(ETHERTYPE_IPV6);
break;
case AF_ARP:
ah = (struct arphdr *)req->hdata;
ah->ar_hrd = htons(ARPHRD_ETHER);
switch(ntohs(ah->ar_op)) {
case ARPOP_REVREQUEST:
case ARPOP_REVREPLY:
etype = htons(ETHERTYPE_REVARP);
break;
case ARPOP_REQUEST:
case ARPOP_REPLY:
default:
etype = htons(ETHERTYPE_ARP);
break;
}
if (req->flags & IFENCAP_FLAG_BROADCAST)
lladdr = ifp->if_broadcastaddr;
break;
default:
return (EAFNOSUPPORT);
}
memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
req->bufsize = sizeof(struct ether_header);
return (0);
}
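/*
 * Illustrative sketch (not part of the original change): callers such as
 * the ARP/ND code obtain a prepend header by filling an if_encap_req and
 * invoking the interface method that resolves to ether_requestencap()
 * above.  "buf" and "lladdr" are assumed to be supplied by the caller.
 */
#if 0
	struct if_encap_req ereq;

	bzero(&ereq, sizeof(ereq));
	ereq.buf = buf;
	ereq.bufsize = ETHER_HDR_LEN;
	ereq.rtype = IFENCAP_LL;
	ereq.family = AF_INET;
	ereq.lladdr = lladdr;
	error = ifp->if_requestencap(ifp, &ereq);
#endif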
static int
ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro, u_char *phdr,
uint32_t *pflags, struct llentry **plle)
{
struct ether_header *eh;
uint32_t lleflags = 0;
int error = 0;
#if defined(INET) || defined(INET6)
uint16_t etype;
#endif
if (plle)
*plle = NULL;
eh = (struct ether_header *)phdr;
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
error = arpresolve(ifp, 0, m, dst, phdr, &lleflags,
plle);
else {
if (m->m_flags & M_BCAST)
memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
ETHER_ADDR_LEN);
else {
const struct in_addr *a;
a = &(((const struct sockaddr_in *)dst)->sin_addr);
ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
}
etype = htons(ETHERTYPE_IP);
memcpy(&eh->ether_type, &etype, sizeof(etype));
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
}
break;
#endif
#ifdef INET6
case AF_INET6:
if ((m->m_flags & M_MCAST) == 0)
error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags,
plle);
else {
const struct in6_addr *a6;
a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
etype = htons(ETHERTYPE_IPV6);
memcpy(&eh->ether_type, &etype, sizeof(etype));
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
}
break;
#endif
default:
if_printf(ifp, "can't handle af%d\n", dst->sa_family);
if (m != NULL)
m_freem(m);
return (EAFNOSUPPORT);
}
if (error == EHOSTDOWN) {
if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
error = EHOSTUNREACH;
}
if (error != 0)
return (error);
*pflags = RT_MAY_LOOP;
if (lleflags & LLE_IFADDR)
*pflags |= RT_L2_ME;
return (0);
}
/*
* Ethernet output routine.
* Encapsulate a packet of type family for the local net.
* Use trailer local net encapsulation if enough data in first
* packet leaves a multiple of 512 bytes of data in remainder.
*/
int
ether_output(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro)
{
int error = 0;
char linkhdr[ETHER_HDR_LEN], *phdr;
struct ether_header *eh;
struct pf_mtag *t;
int loop_copy = 1;
int hlen; /* link layer header length */
uint32_t pflags;
struct llentry *lle = NULL;
- struct rtentry *rt0 = NULL;
int addref = 0;
phdr = NULL;
pflags = 0;
if (ro != NULL) {
/* XXX BPF uses ro_prepend */
if (ro->ro_prepend != NULL) {
phdr = ro->ro_prepend;
hlen = ro->ro_plen;
} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
lle = ro->ro_lle;
if (lle != NULL &&
(lle->la_flags & LLE_VALID) == 0) {
LLE_FREE(lle);
lle = NULL; /* redundant */
ro->ro_lle = NULL;
}
if (lle == NULL) {
/* if we lookup, keep cache */
addref = 1;
}
}
if (lle != NULL) {
phdr = lle->r_linkdata;
hlen = lle->r_hdrlen;
pflags = lle->r_flags;
}
}
- rt0 = ro->ro_rt;
}
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error)
senderr(error);
#endif
M_PROFILE(m);
if (ifp->if_flags & IFF_MONITOR)
senderr(ENETDOWN);
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
senderr(ENETDOWN);
if (phdr == NULL) {
/* No prepend data supplied. Try to calculate ourselves. */
phdr = linkhdr;
hlen = ETHER_HDR_LEN;
error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
addref ? &lle : NULL);
if (addref && lle != NULL)
ro->ro_lle = lle;
if (error != 0)
return (error == EWOULDBLOCK ? 0 : error);
}
if ((pflags & RT_L2_ME) != 0) {
update_mbuf_csumflags(m, m);
return (if_simloop(ifp, m, dst->sa_family, 0));
}
loop_copy = pflags & RT_MAY_LOOP;
/*
* Add local net header. If no space in first mbuf,
* allocate another.
*
* Note that we do prepend regardless of RT_HAS_HEADER flag.
* This is done because BPF code shifts m_data pointer
* to the end of ethernet header prior to calling if_output().
*/
M_PREPEND(m, hlen, M_NOWAIT);
if (m == NULL)
senderr(ENOBUFS);
if ((pflags & RT_HAS_HEADER) == 0) {
eh = mtod(m, struct ether_header *);
memcpy(eh, phdr, hlen);
}
/*
* If a simplex interface, and the packet is being sent to our
* Ethernet address or a broadcast address, loopback a copy.
* XXX To make a simplex device behave exactly like a duplex
* device, we should copy in the case of sending to our own
* ethernet address (thus letting the original actually appear
* on the wire). However, we don't do that here for security
* reasons and compatibility with the original behavior.
*/
if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
((t = pf_find_mtag(m)) == NULL || !t->routed)) {
struct mbuf *n;
/*
* Because if_simloop() modifies the packet, we need a
 * writable copy through m_dup() instead of a read-only
 * one as m_copy[m] would give us.  The alternative would
 * be to modify if_simloop() to handle the read-only mbuf,
 * but performance-wise it is mostly equivalent (trading
 * extra data copying vs. extra locking).
*
* XXX This is a local workaround. A number of less
* often used kernel parts suffer from the same bug.
* See PR kern/105943 for a proposed general solution.
*/
if ((n = m_dup(m, M_NOWAIT)) != NULL) {
update_mbuf_csumflags(m, n);
(void)if_simloop(ifp, n, dst->sa_family, hlen);
} else
if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
}
/*
* Bridges require special output handling.
*/
if (ifp->if_bridge) {
BRIDGE_OUTPUT(ifp, m, error);
return (error);
}
#if defined(INET) || defined(INET6)
if (ifp->if_carp &&
(error = (*carp_output_p)(ifp, m, dst)))
goto bad;
#endif
/* Handle ng_ether(4) processing, if any */
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_output_p != NULL,
("ng_ether_output_p is NULL"));
if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
bad: if (m != NULL)
m_freem(m);
return (error);
}
if (m == NULL)
return (0);
}
/* Continue with link-layer output */
return ether_output_frame(ifp, m);
}
/*
* Ethernet link layer output routine to send a raw frame to the device.
*
* This assumes that the 14 byte Ethernet header is present and contiguous
* in the first mbuf (if BRIDGE'ing).
*/
int
ether_output_frame(struct ifnet *ifp, struct mbuf *m)
{
int i;
if (PFIL_HOOKED(&V_link_pfil_hook)) {
i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL);
if (i != 0)
return (EACCES);
if (m == NULL)
return (0);
}
/*
* Queue message on interface, update output statistics if
* successful, and start output if interface not yet active.
*/
return ((ifp->if_transmit)(ifp, m));
}
/*
* Process a received Ethernet packet; the packet is in the
* mbuf chain m with the ethernet header at the front.
*/
static void
ether_input_internal(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
u_short etype;
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
return;
}
#ifdef DIAGNOSTIC
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
m_freem(m);
return;
}
#endif
if (m->m_len < ETHER_HDR_LEN) {
/* XXX maybe should pullup? */
if_printf(ifp, "discard frame w/o leading ethernet "
"header (len %u pkt len %u)\n",
m->m_len, m->m_pkthdr.len);
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
m_freem(m);
return;
}
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER);
CURVNET_SET_QUIET(ifp->if_vnet);
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
}
#ifdef MAC
/*
* Tag the mbuf with an appropriate MAC label before any other
* consumers can get to it.
*/
mac_ifnet_create_mbuf(ifp, m);
#endif
/*
* Give bpf a chance at the packet.
*/
ETHER_BPF_MTAP(ifp, m);
/*
* If the CRC is still on the packet, trim it off. We do this once
* and once only in case we are re-entered. Nothing else on the
* Ethernet receive path expects to see the FCS.
*/
if (m->m_flags & M_HASFCS) {
m_adj(m, -ETHER_CRC_LEN);
m->m_flags &= ~M_HASFCS;
}
if (!(ifp->if_capenable & IFCAP_HWSTATS))
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
/* Allow monitor mode to claim this frame, after stats are updated. */
if (ifp->if_flags & IFF_MONITOR) {
m_freem(m);
CURVNET_RESTORE();
return;
}
/* Handle input from a lagg(4) port */
if (ifp->if_type == IFT_IEEE8023ADLAG) {
KASSERT(lagg_input_p != NULL,
("%s: if_lagg not loaded!", __func__));
m = (*lagg_input_p)(ifp, m);
if (m != NULL)
ifp = m->m_pkthdr.rcvif;
else {
CURVNET_RESTORE();
return;
}
}
/*
* If the hardware did not process an 802.1Q tag, do this now,
* to allow 802.1P priority frames to be passed to the main input
* path correctly.
* TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
*/
if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
struct ether_vlan_header *evl;
if (m->m_len < sizeof(*evl) &&
(m = m_pullup(m, sizeof(*evl))) == NULL) {
#ifdef DIAGNOSTIC
if_printf(ifp, "cannot pullup VLAN header\n");
#endif
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
CURVNET_RESTORE();
return;
}
evl = mtod(m, struct ether_vlan_header *);
m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
m->m_flags |= M_VLANTAG;
bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
ETHER_HDR_LEN - ETHER_TYPE_LEN);
m_adj(m, ETHER_VLAN_ENCAP_LEN);
eh = mtod(m, struct ether_header *);
}
M_SETFIB(m, ifp->if_fib);
/* Allow ng_ether(4) to claim this frame. */
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_input_p != NULL,
("%s: ng_ether_input_p is NULL", __func__));
m->m_flags &= ~M_PROMISC;
(*ng_ether_input_p)(ifp, &m);
if (m == NULL) {
CURVNET_RESTORE();
return;
}
eh = mtod(m, struct ether_header *);
}
/*
* Allow if_bridge(4) to claim this frame.
* The BRIDGE_INPUT() macro will update ifp if the bridge changed it
* and the frame should be delivered locally.
*/
if (ifp->if_bridge != NULL) {
m->m_flags &= ~M_PROMISC;
BRIDGE_INPUT(ifp, m);
if (m == NULL) {
CURVNET_RESTORE();
return;
}
eh = mtod(m, struct ether_header *);
}
#if defined(INET) || defined(INET6)
/*
* Clear M_PROMISC on frame so that carp(4) will see it when the
* mbuf flows up to Layer 3.
* FreeBSD's implementation of carp(4) uses the inprotosw
* to dispatch IPPROTO_CARP. carp(4) also allocates its own
* Ethernet addresses of the form 00:00:5e:00:01:xx, which
* is outside the scope of the M_PROMISC test below.
* TODO: Maintain a hash table of ethernet addresses other than
* ether_dhost which may be active on this ifp.
*/
if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
m->m_flags &= ~M_PROMISC;
} else
#endif
{
/*
* If the frame received was not for our MAC address, set the
* M_PROMISC flag on the mbuf chain. The frame may need to
* be seen by the rest of the Ethernet input path in case of
* re-entry (e.g. bridge, vlan, netgraph) but should not be
* seen by upper protocol layers.
*/
if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
m->m_flags |= M_PROMISC;
}
ether_demux(ifp, m);
CURVNET_RESTORE();
}
/*
* Ethernet input dispatch; by default, direct dispatch here regardless of
* global configuration. However, if RSS is enabled, hook up RSS affinity
* so that when deferred or hybrid dispatch is enabled, we can redistribute
* load based on RSS.
*
* XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
* not it had already done work distribution via multi-queue. Then we could
* direct dispatch in the event load balancing was already complete and
* handle the case of interfaces with different capabilities better.
*
* XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
* at multiple layers?
*
* XXXRW: For now, enable all this only if RSS is compiled in, although it
* works fine without RSS. Need to characterise the performance overhead
* of the detour through the netisr code in the event the result is always
* direct dispatch.
*/
static void
ether_nh_input(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
KASSERT(m->m_pkthdr.rcvif != NULL,
("%s: NULL interface pointer", __func__));
ether_input_internal(m->m_pkthdr.rcvif, m);
}
static struct netisr_handler ether_nh = {
.nh_name = "ether",
.nh_handler = ether_nh_input,
.nh_proto = NETISR_ETHER,
#ifdef RSS
.nh_policy = NETISR_POLICY_CPU,
.nh_dispatch = NETISR_DISPATCH_DIRECT,
.nh_m2cpuid = rss_m2cpuid,
#else
.nh_policy = NETISR_POLICY_SOURCE,
.nh_dispatch = NETISR_DISPATCH_DIRECT,
#endif
};
static void
ether_init(__unused void *arg)
{
netisr_register(&ether_nh);
}
SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
static void
vnet_ether_init(__unused void *arg)
{
int i;
/* Initialize packet filter hooks. */
V_link_pfil_hook.ph_type = PFIL_TYPE_AF;
V_link_pfil_hook.ph_af = AF_LINK;
if ((i = pfil_head_register(&V_link_pfil_hook)) != 0)
printf("%s: WARNING: unable to register pfil link hook, "
"error %d\n", __func__, i);
#ifdef VIMAGE
netisr_register_vnet(&ether_nh);
#endif
}
VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_ether_init, NULL);
#ifdef VIMAGE
static void
vnet_ether_pfil_destroy(__unused void *arg)
{
int i;
if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0)
printf("%s: WARNING: unable to unregister pfil link hook, "
"error %d\n", __func__, i);
}
VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
vnet_ether_pfil_destroy, NULL);
static void
vnet_ether_destroy(__unused void *arg)
{
netisr_unregister_vnet(&ether_nh);
}
VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_ether_destroy, NULL);
#endif
static void
ether_input(struct ifnet *ifp, struct mbuf *m)
{
struct mbuf *mn;
/*
* The drivers are allowed to pass in a chain of packets linked with
* m_nextpkt. We split them up into separate packets here and pass
* them up. This allows the drivers to amortize the receive lock.
*/
while (m) {
mn = m->m_nextpkt;
m->m_nextpkt = NULL;
/*
* We will rely on rcvif being set properly in the deferred context,
* so assert it is correct here.
*/
KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
"rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
CURVNET_SET_QUIET(ifp->if_vnet);
netisr_dispatch(NETISR_ETHER, m);
CURVNET_RESTORE();
m = mn;
}
}
/*
* Upper layer processing for a received Ethernet packet.
*/
void
ether_demux(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
int i, isr;
u_short ether_type;
KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
/* Do not grab PROMISC frames in case we are re-entered. */
if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) {
i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL);
if (i != 0 || m == NULL)
return;
}
eh = mtod(m, struct ether_header *);
ether_type = ntohs(eh->ether_type);
/*
* If this frame has a VLAN tag other than 0, call vlan_input()
* if its module is loaded. Otherwise, drop.
*/
if ((m->m_flags & M_VLANTAG) &&
EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
if (ifp->if_vlantrunk == NULL) {
if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
m_freem(m);
return;
}
KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
__func__));
/* Clear before possibly re-entering ether_input(). */
m->m_flags &= ~M_PROMISC;
(*vlan_input_p)(ifp, m);
return;
}
/*
* Pass promiscuously received frames to the upper layer if the user
* requested this by setting IFF_PPROMISC. Otherwise, drop them.
*/
if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
m_freem(m);
return;
}
/*
* Reset layer specific mbuf flags to avoid confusing upper layers.
* Strip off Ethernet header.
*/
m->m_flags &= ~M_VLANTAG;
m_clrprotoflags(m);
m_adj(m, ETHER_HDR_LEN);
/*
* Dispatch frame to upper layer.
*/
switch (ether_type) {
#ifdef INET
case ETHERTYPE_IP:
isr = NETISR_IP;
break;
case ETHERTYPE_ARP:
if (ifp->if_flags & IFF_NOARP) {
/* Discard packet if ARP is disabled on interface */
m_freem(m);
return;
}
isr = NETISR_ARP;
break;
#endif
#ifdef INET6
case ETHERTYPE_IPV6:
isr = NETISR_IPV6;
break;
#endif
default:
goto discard;
}
netisr_dispatch(isr, m);
return;
discard:
/*
* Packet is to be discarded. If netgraph is present,
* hand the packet to it for last chance processing;
* otherwise dispose of it.
*/
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_input_orphan_p != NULL,
("ng_ether_input_orphan_p is NULL"));
/*
* Put back the ethernet header so netgraph has a
* consistent view of inbound packets.
*/
M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
(*ng_ether_input_orphan_p)(ifp, m);
return;
}
m_freem(m);
}
/*
* Convert Ethernet address to printable (loggable) representation.
* This routine is for compatibility; it's better to just use
*
* printf("%6D", <pointer to address>, ":");
*
* since there's no static buffer involved.
*/
char *
ether_sprintf(const u_char *ap)
{
static char etherbuf[18];
snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
return (etherbuf);
}
/*
* Perform common duties while attaching to interface list
*/
void
ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
{
int i;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = ETHER_ADDR_LEN;
ifp->if_hdrlen = ETHER_HDR_LEN;
if_attach(ifp);
ifp->if_mtu = ETHERMTU;
ifp->if_output = ether_output;
ifp->if_input = ether_input;
ifp->if_resolvemulti = ether_resolvemulti;
ifp->if_requestencap = ether_requestencap;
#ifdef VIMAGE
ifp->if_reassign = ether_reassign;
#endif
if (ifp->if_baudrate == 0)
ifp->if_baudrate = IF_Mbps(10); /* just a default */
ifp->if_broadcastaddr = etherbroadcastaddr;
ifa = ifp->if_addr;
KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_type = IFT_ETHER;
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
if (ifp->if_hw_addr != NULL)
bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen);
bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
if (ng_ether_attach_p != NULL)
(*ng_ether_attach_p)(ifp);
/* Announce Ethernet MAC address if non-zero. */
for (i = 0; i < ifp->if_addrlen; i++)
if (lla[i] != 0)
break;
if (i != ifp->if_addrlen)
if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
uuid_ether_add(LLADDR(sdl));
/* All necessary bits are set up; announce it now. */
EVENTHANDLER_INVOKE(ether_ifattach_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL);
}
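/*
 * Illustrative sketch (not part of the original change): a typical NIC
 * driver attach routine sets up its ifnet and then hands the link-level
 * address to ether_ifattach().  The "foo_*" callbacks and the "sc" softc
 * layout are hypothetical.
 */
#if 0
	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = foo_init;
	ifp->if_ioctl = foo_ioctl;
	ifp->if_transmit = foo_transmit;
	ifp->if_qflush = foo_qflush;
	ether_ifattach(ifp, sc->macaddr);
#endif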
/*
* Perform common duties while detaching an Ethernet interface
*/
void
ether_ifdetach(struct ifnet *ifp)
{
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
uuid_ether_del(LLADDR(sdl));
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_detach_p != NULL,
("ng_ether_detach_p is NULL"));
(*ng_ether_detach_p)(ifp);
}
bpfdetach(ifp);
if_detach(ifp);
}
#ifdef VIMAGE
void
ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
{
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_detach_p != NULL,
("ng_ether_detach_p is NULL"));
(*ng_ether_detach_p)(ifp);
}
if (ng_ether_attach_p != NULL) {
CURVNET_SET_QUIET(new_vnet);
(*ng_ether_attach_p)(ifp);
CURVNET_RESTORE();
}
}
#endif
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
#if 0
/*
* This is for reference. We have a table-driven version
* of the little-endian crc32 generator, which is faster
* than the double-loop.
*/
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = (crc ^ data) & 1;
crc >>= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_LE);
}
}
return (crc);
}
#else
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
static const uint32_t crctab[] = {
0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
};
size_t i;
uint32_t crc;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
crc ^= buf[i];
crc = (crc >> 4) ^ crctab[crc & 0xf];
crc = (crc >> 4) ^ crctab[crc & 0xf];
}
return (crc);
}
#endif
uint32_t
ether_crc32_be(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
crc <<= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
}
}
return (crc);
}
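/*
 * Illustrative sketch (not part of the original change): NIC drivers
 * commonly use the big-endian CRC to pick a bit in their multicast hash
 * filter.  The 6-bit index and the "hash" array layout shown here are
 * typical but hardware specific; "maddr" is a 6-byte MAC address.
 */
#if 0
	uint32_t hash[2] = { 0, 0 };	/* 64-bit multicast filter */
	uint32_t crc;
	int bit;

	crc = ether_crc32_be(maddr, ETHER_ADDR_LEN);
	bit = crc >> 26;		/* top 6 bits: 0..63 */
	hash[bit >> 5] |= 1 << (bit & 0x1f);
#endif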
int
ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct ifaddr *ifa = (struct ifaddr *) data;
struct ifreq *ifr = (struct ifreq *) data;
int error = 0;
switch (command) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
ifp->if_init(ifp->if_softc); /* before arpwhohas */
arp_ifinit(ifp, ifa);
break;
#endif
default:
ifp->if_init(ifp->if_softc);
break;
}
break;
case SIOCGIFADDR:
{
struct sockaddr *sa;
sa = (struct sockaddr *) & ifr->ifr_data;
bcopy(IF_LLADDR(ifp),
(caddr_t) sa->sa_data, ETHER_ADDR_LEN);
}
break;
case SIOCSIFMTU:
/*
* Set the interface MTU.
*/
if (ifr->ifr_mtu > ETHERMTU) {
error = EINVAL;
} else {
ifp->if_mtu = ifr->ifr_mtu;
}
break;
default:
error = EINVAL; /* XXX netbsd has ENOTTY??? */
break;
}
return (error);
}
static int
ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
struct sockaddr *sa)
{
struct sockaddr_dl *sdl;
#ifdef INET
struct sockaddr_in *sin;
#endif
#ifdef INET6
struct sockaddr_in6 *sin6;
#endif
u_char *e_addr;
switch(sa->sa_family) {
case AF_LINK:
/*
* No mapping needed. Just check that it's a valid MC address.
*/
sdl = (struct sockaddr_dl *)sa;
e_addr = LLADDR(sdl);
if (!ETHER_IS_MULTICAST(e_addr))
return EADDRNOTAVAIL;
*llsa = NULL;
return 0;
#ifdef INET
case AF_INET:
sin = (struct sockaddr_in *)sa;
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
return EADDRNOTAVAIL;
sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
#ifdef INET6
case AF_INET6:
sin6 = (struct sockaddr_in6 *)sa;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* An IP6 address of 0 means listen to all
 * of the Ethernet multicast addresses used for IP6.
* (This is used for multicast routers.)
*/
ifp->if_flags |= IFF_ALLMULTI;
*llsa = NULL;
return 0;
}
if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
return EADDRNOTAVAIL;
sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
default:
/*
* Well, the text isn't quite right, but it's the name
* that counts...
*/
return EAFNOSUPPORT;
}
}
static moduledata_t ether_mod = {
.name = "ether",
};
void
ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
{
struct ether_vlan_header vlan;
struct mbuf mv, mb;
KASSERT((m->m_flags & M_VLANTAG) != 0,
("%s: vlan information not present", __func__));
KASSERT(m->m_len >= sizeof(struct ether_header),
("%s: mbuf not large enough for header", __func__));
bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
vlan.evl_proto = vlan.evl_encap_proto;
vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
m->m_len -= sizeof(struct ether_header);
m->m_data += sizeof(struct ether_header);
/*
* If a data link has been supplied by the caller, then we will need to
* re-create a stack allocated mbuf chain with the following structure:
*
* (1) mbuf #1 will contain the supplied data link
* (2) mbuf #2 will contain the vlan header
* (3) mbuf #3 will contain the original mbuf's packet data
*
* Otherwise, submit the packet and vlan header via bpf_mtap2().
*/
if (data != NULL) {
mv.m_next = m;
mv.m_data = (caddr_t)&vlan;
mv.m_len = sizeof(vlan);
mb.m_next = &mv;
mb.m_data = data;
mb.m_len = dlen;
bpf_mtap(bp, &mb);
} else
bpf_mtap2(bp, &vlan, sizeof(vlan), m);
m->m_len += sizeof(struct ether_header);
m->m_data -= sizeof(struct ether_header);
}
struct mbuf *
ether_vlanencap(struct mbuf *m, uint16_t tag)
{
struct ether_vlan_header *evl;
M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
if (m == NULL)
return (NULL);
/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
if (m->m_len < sizeof(*evl)) {
m = m_pullup(m, sizeof(*evl));
if (m == NULL)
return (NULL);
}
/*
* Transform the Ethernet header into an Ethernet header
* with 802.1Q encapsulation.
*/
evl = mtod(m, struct ether_vlan_header *);
bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
(char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
evl->evl_tag = htons(tag);
return (m);
}
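/*
 * Illustrative sketch (not part of the original change): a driver whose
 * hardware cannot insert 802.1Q tags typically falls back to
 * ether_vlanencap() in its transmit path.
 */
#if 0
	if ((m->m_flags & M_VLANTAG) != 0 &&
	    (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
		if (m == NULL)
			return (ENOBUFS);
		m->m_flags &= ~M_VLANTAG;
	}
#endif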
DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(ether, 1);
Index: head/sys/net/if_gif.c
===================================================================
--- head/sys/net/if_gif.c (revision 327172)
+++ head/sys/net/if_gif.c (revision 327173)
@@ -1,1066 +1,1064 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/conf.h>
#include <machine/cpu.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#endif /* INET */
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_ecn.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <netinet/ip_encap.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_gif.h>
#include <security/mac/mac_framework.h>
static const char gifname[] = "gif";
/*
* gif_mtx protects a per-vnet gif_softc_list.
*/
static VNET_DEFINE(struct mtx, gif_mtx);
#define V_gif_mtx VNET(gif_mtx)
static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface");
static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list);
#define V_gif_softc_list VNET(gif_softc_list)
static struct sx gif_ioctl_sx;
SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl");
#define GIF_LIST_LOCK_INIT(x) mtx_init(&V_gif_mtx, "gif_mtx", \
NULL, MTX_DEF)
#define GIF_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gif_mtx)
#define GIF_LIST_LOCK(x) mtx_lock(&V_gif_mtx)
#define GIF_LIST_UNLOCK(x) mtx_unlock(&V_gif_mtx)
void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af);
void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af);
void (*ng_gif_attach_p)(struct ifnet *ifp);
void (*ng_gif_detach_p)(struct ifnet *ifp);
static int gif_check_nesting(struct ifnet *, struct mbuf *);
static int gif_set_tunnel(struct ifnet *, struct sockaddr *,
struct sockaddr *);
static void gif_delete_tunnel(struct ifnet *);
static int gif_ioctl(struct ifnet *, u_long, caddr_t);
static int gif_transmit(struct ifnet *, struct mbuf *);
static void gif_qflush(struct ifnet *);
static int gif_clone_create(struct if_clone *, int, caddr_t);
static void gif_clone_destroy(struct ifnet *);
static VNET_DEFINE(struct if_clone *, gif_cloner);
#define V_gif_cloner VNET(gif_cloner)
static int gifmodevent(module_t, int, void *);
SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0,
"Generic Tunnel Interface");
#ifndef MAX_GIF_NEST
/*
* This macro controls the default upper limit on nesting of gif tunnels.
* Since setting a large value here with a careless configuration may
* crash the system, we don't allow any nesting by default.
* If you need to configure nested gif tunnels, you can define this macro
* in your kernel configuration file. However, if you do so, please be
* careful to configure the tunnels so that they don't form a loop.
*/
#define MAX_GIF_NEST 1
#endif
static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST;
#define V_max_gif_nesting VNET(max_gif_nesting)
SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(max_gif_nesting), 0, "Max nested tunnels");
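/*
* For example, because the limit above is a CTLFLAG_RW sysctl (exposed as
* net.link.gif.max_nesting), nesting can be raised at run time:
*	# sysctl net.link.gif.max_nesting=2
* which permits one gif tunnel to be carried inside another.
*/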
/*
* By default, we disallow creation of multiple tunnels between the same
* pair of addresses. Some applications require this functionality, so
* we allow control over this check here.
*/
#ifdef XBONEHACK
static VNET_DEFINE(int, parallel_tunnels) = 1;
#else
static VNET_DEFINE(int, parallel_tunnels) = 0;
#endif
#define V_parallel_tunnels VNET(parallel_tunnels)
SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0,
"Allow parallel tunnels?");
static int
gif_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct gif_softc *sc;
sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
sc->gif_fibnum = curthread->td_proc->p_fibnum;
GIF2IFP(sc) = if_alloc(IFT_GIF);
GIF_LOCK_INIT(sc);
GIF2IFP(sc)->if_softc = sc;
if_initname(GIF2IFP(sc), gifname, unit);
GIF2IFP(sc)->if_addrlen = 0;
GIF2IFP(sc)->if_mtu = GIF_MTU;
GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
#if 0
/* turn off ingress filter */
GIF2IFP(sc)->if_flags |= IFF_LINK2;
#endif
GIF2IFP(sc)->if_ioctl = gif_ioctl;
GIF2IFP(sc)->if_transmit = gif_transmit;
GIF2IFP(sc)->if_qflush = gif_qflush;
GIF2IFP(sc)->if_output = gif_output;
GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE;
GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE;
if_attach(GIF2IFP(sc));
bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t));
if (ng_gif_attach_p != NULL)
(*ng_gif_attach_p)(GIF2IFP(sc));
GIF_LIST_LOCK();
LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
GIF_LIST_UNLOCK();
return (0);
}
static void
gif_clone_destroy(struct ifnet *ifp)
{
struct gif_softc *sc;
sx_xlock(&gif_ioctl_sx);
sc = ifp->if_softc;
gif_delete_tunnel(ifp);
GIF_LIST_LOCK();
LIST_REMOVE(sc, gif_list);
GIF_LIST_UNLOCK();
if (ng_gif_detach_p != NULL)
(*ng_gif_detach_p)(ifp);
bpfdetach(ifp);
if_detach(ifp);
ifp->if_softc = NULL;
sx_xunlock(&gif_ioctl_sx);
if_free(ifp);
GIF_LOCK_DESTROY(sc);
free(sc, M_GIF);
}
static void
vnet_gif_init(const void *unused __unused)
{
LIST_INIT(&V_gif_softc_list);
GIF_LIST_LOCK_INIT();
V_gif_cloner = if_clone_simple(gifname, gif_clone_create,
gif_clone_destroy, 0);
}
VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
vnet_gif_init, NULL);
static void
vnet_gif_uninit(const void *unused __unused)
{
if_clone_detach(V_gif_cloner);
GIF_LIST_LOCK_DESTROY();
}
VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
vnet_gif_uninit, NULL);
static int
gifmodevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
case MOD_UNLOAD:
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t gif_mod = {
"if_gif",
gifmodevent,
0
};
DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_gif, 1);
int
gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
GIF_RLOCK_TRACKER;
const struct ip *ip;
struct gif_softc *sc;
int ret;
sc = (struct gif_softc *)arg;
if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0)
return (0);
ret = 0;
GIF_RLOCK(sc);
/* no physical address */
if (sc->gif_family == 0)
goto done;
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
#endif
#ifdef INET6
case IPPROTO_IPV6:
#endif
case IPPROTO_ETHERIP:
break;
default:
goto done;
}
/* Bail on short packets */
M_ASSERTPKTHDR(m);
if (m->m_pkthdr.len < sizeof(struct ip))
goto done;
ip = mtod(m, const struct ip *);
switch (ip->ip_v) {
#ifdef INET
case 4:
if (sc->gif_family != AF_INET)
goto done;
ret = in_gif_encapcheck(m, off, proto, arg);
break;
#endif
#ifdef INET6
case 6:
if (m->m_pkthdr.len < sizeof(struct ip6_hdr))
goto done;
if (sc->gif_family != AF_INET6)
goto done;
ret = in6_gif_encapcheck(m, off, proto, arg);
break;
#endif
}
done:
GIF_RUNLOCK(sc);
return (ret);
}
static int
gif_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct gif_softc *sc;
struct etherip_header *eth;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
uint32_t t;
#endif
uint32_t af;
uint8_t proto, ecn;
int error;
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error) {
m_freem(m);
goto err;
}
#endif
error = ENETDOWN;
sc = ifp->if_softc;
if ((ifp->if_flags & IFF_MONITOR) != 0 ||
(ifp->if_flags & IFF_UP) == 0 ||
sc->gif_family == 0 ||
(error = gif_check_nesting(ifp, m)) != 0) {
m_freem(m);
goto err;
}
/* Now pull back the af that we stashed in the csum_data. */
if (ifp->if_bridge)
af = AF_LINK;
else
af = m->m_pkthdr.csum_data;
m->m_flags &= ~(M_BCAST|M_MCAST);
M_SETFIB(m, sc->gif_fibnum);
BPF_MTAP2(ifp, &af, sizeof(af), m);
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
/* inner AF-specific encapsulation */
ecn = 0;
switch (af) {
#ifdef INET
case AF_INET:
proto = IPPROTO_IPV4;
if (m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL) {
error = ENOBUFS;
goto err;
}
ip = mtod(m, struct ip *);
ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &ecn, &ip->ip_tos);
break;
#endif
#ifdef INET6
case AF_INET6:
proto = IPPROTO_IPV6;
if (m->m_len < sizeof(struct ip6_hdr))
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL) {
error = ENOBUFS;
goto err;
}
t = 0;
ip6 = mtod(m, struct ip6_hdr *);
ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &t, &ip6->ip6_flow);
ecn = (ntohl(t) >> 20) & 0xff;
break;
#endif
case AF_LINK:
proto = IPPROTO_ETHERIP;
M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto err;
}
eth = mtod(m, struct etherip_header *);
eth->eip_resvh = 0;
eth->eip_ver = ETHERIP_VERSION;
eth->eip_resvl = 0;
break;
default:
error = EAFNOSUPPORT;
m_freem(m);
goto err;
}
/* XXX should we check if our outer source is legal? */
/* dispatch to output logic based on outer AF */
switch (sc->gif_family) {
#ifdef INET
case AF_INET:
error = in_gif_output(ifp, m, proto, ecn);
break;
#endif
#ifdef INET6
case AF_INET6:
error = in6_gif_output(ifp, m, proto, ecn);
break;
#endif
default:
m_freem(m);
}
err:
if (error)
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (error);
}
static void
gif_qflush(struct ifnet *ifp __unused)
{
}
#define MTAG_GIF 1080679712
static int
gif_check_nesting(struct ifnet *ifp, struct mbuf *m)
{
struct m_tag *mtag;
int count;
/*
* gif may cause infinite recursion when misconfigured.
* We'll prevent this by detecting loops.
*
* A high nesting level may cause stack exhaustion.
* We'll prevent this by enforcing an upper limit.
*/
count = 1;
mtag = NULL;
while ((mtag = m_tag_locate(m, MTAG_GIF, 0, mtag)) != NULL) {
if (*(struct ifnet **)(mtag + 1) == ifp) {
log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
return (EIO);
}
count++;
}
if (count > V_max_gif_nesting) {
log(LOG_NOTICE,
"%s: if_output recursively called too many times(%d)\n",
if_name(ifp), count);
return (EIO);
}
mtag = m_tag_alloc(MTAG_GIF, 0, sizeof(struct ifnet *), M_NOWAIT);
if (mtag == NULL)
return (ENOMEM);
*(struct ifnet **)(mtag + 1) = ifp;
m_tag_prepend(m, mtag);
return (0);
}
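/*
* Note on the tag layout used above: the payload of each MTAG_GIF tag is a
* single struct ifnet pointer stored immediately after the m_tag header,
* hence the *(struct ifnet **)(mtag + 1) accesses. Every gif interface a
* packet traverses prepends one tag, so the number of tags found equals
* the nesting depth seen so far.
*/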
int
gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
struct route *ro)
{
uint32_t af;
if (dst->sa_family == AF_UNSPEC)
bcopy(dst->sa_data, &af, sizeof(af));
else
af = dst->sa_family;
/*
* Now save the af in the inbound pkt csum data, this is a cheat since
* we are using the inbound csum_data field to carry the af over to
* the gif_transmit() routine, avoiding yet another mtag.
*/
m->m_pkthdr.csum_data = af;
return (ifp->if_transmit(ifp, m));
}
void
gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
{
struct etherip_header *eip;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
uint32_t t;
#endif
- struct gif_softc *sc;
struct ether_header *eh;
struct ifnet *oldifp;
int isr, n, af;
if (ifp == NULL) {
/* just in case */
m_freem(m);
return;
}
- sc = ifp->if_softc;
m->m_pkthdr.rcvif = ifp;
m_clrprotoflags(m);
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
af = AF_INET;
if (m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL)
goto drop;
ip = mtod(m, struct ip *);
if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &ecn, &ip->ip_tos) == 0) {
m_freem(m);
goto drop;
}
break;
#endif
#ifdef INET6
case IPPROTO_IPV6:
af = AF_INET6;
if (m->m_len < sizeof(struct ip6_hdr))
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL)
goto drop;
t = htonl((uint32_t)ecn << 20);
ip6 = mtod(m, struct ip6_hdr *);
if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &t, &ip6->ip6_flow) == 0) {
m_freem(m);
goto drop;
}
break;
#endif
case IPPROTO_ETHERIP:
af = AF_LINK;
break;
default:
m_freem(m);
goto drop;
}
#ifdef MAC
mac_ifnet_create_mbuf(ifp, m);
#endif
if (bpf_peers_present(ifp->if_bpf)) {
uint32_t af1 = af;
bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
}
if ((ifp->if_flags & IFF_MONITOR) != 0) {
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
m_freem(m);
return;
}
if (ng_gif_input_p != NULL) {
(*ng_gif_input_p)(ifp, &m, af);
if (m == NULL)
goto drop;
}
/*
* Put the packet to the network layer input queue according to the
* specified address family.
* Note: older versions of gif_input directly called network layer
* input functions, e.g. ip6_input, here. We changed the policy to
* prevent deeply recursive calls of such input functions, which
* might cause a kernel panic. The change may introduce another
* problem: if the input queue is full, packets are discarded.
* Kernel stack overflows did happen in practice, while we believe
* a full queue rarely occurs, so we changed the policy.
*/
switch (af) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
case AF_LINK:
n = sizeof(struct etherip_header) + sizeof(struct ether_header);
if (n > m->m_len)
m = m_pullup(m, n);
if (m == NULL)
goto drop;
eip = mtod(m, struct etherip_header *);
if (eip->eip_ver != ETHERIP_VERSION) {
/* discard unknown versions */
m_freem(m);
goto drop;
}
m_adj(m, sizeof(struct etherip_header));
m->m_flags &= ~(M_BCAST|M_MCAST);
m->m_pkthdr.rcvif = ifp;
if (ifp->if_bridge) {
oldifp = ifp;
eh = mtod(m, struct ether_header *);
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
}
BRIDGE_INPUT(ifp, m);
if (m != NULL && ifp != oldifp) {
/*
* The bridge gave us back itself or one of the
* members for which the frame is addressed.
*/
ether_demux(ifp, m);
return;
}
}
if (m != NULL)
m_freem(m);
return;
default:
if (ng_gif_input_orphan_p != NULL)
(*ng_gif_input_orphan_p)(ifp, m, af);
else
m_freem(m);
return;
}
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
M_SETFIB(m, ifp->if_fib);
netisr_dispatch(isr, m);
return;
drop:
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
}
/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
int
gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
GIF_RLOCK_TRACKER;
struct ifreq *ifr = (struct ifreq *)data;
struct sockaddr *dst, *src;
struct gif_softc *sc;
#ifdef INET
struct sockaddr_in *sin = NULL;
#endif
#ifdef INET6
struct sockaddr_in6 *sin6 = NULL;
#endif
u_int options;
int error;
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCGIFMTU:
case SIOCSIFFLAGS:
return (0);
case SIOCSIFMTU:
if (ifr->ifr_mtu < GIF_MTU_MIN ||
ifr->ifr_mtu > GIF_MTU_MAX)
return (EINVAL);
else
ifp->if_mtu = ifr->ifr_mtu;
return (0);
}
sx_xlock(&gif_ioctl_sx);
sc = ifp->if_softc;
if (sc == NULL) {
error = ENXIO;
goto bad;
}
error = 0;
switch (cmd) {
case SIOCSIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
error = EINVAL;
switch (cmd) {
#ifdef INET
case SIOCSIFPHYADDR:
src = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_dstaddr);
break;
#endif
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
src = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_dstaddr);
break;
#endif
default:
goto bad;
}
/* sa_family must be equal */
if (src->sa_family != dst->sa_family ||
src->sa_len != dst->sa_len)
goto bad;
/* validate sa_len */
/* check sa_family looks sane for the cmd */
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
goto bad;
if (cmd != SIOCSIFPHYADDR) {
error = EAFNOSUPPORT;
goto bad;
}
if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
satosin(dst)->sin_addr.s_addr == INADDR_ANY) {
error = EADDRNOTAVAIL;
goto bad;
}
break;
#endif
#ifdef INET6
case AF_INET6:
if (src->sa_len != sizeof(struct sockaddr_in6))
goto bad;
if (cmd != SIOCSIFPHYADDR_IN6) {
error = EAFNOSUPPORT;
goto bad;
}
error = EADDRNOTAVAIL;
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)
||
IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr))
goto bad;
/*
* Check validity of the scope zone ID of the
* addresses, and convert it into the kernel
* internal form if necessary.
*/
error = sa6_embedscope(satosin6(src), 0);
if (error != 0)
goto bad;
error = sa6_embedscope(satosin6(dst), 0);
if (error != 0)
goto bad;
break;
#endif
default:
error = EAFNOSUPPORT;
goto bad;
}
error = gif_set_tunnel(ifp, src, dst);
break;
case SIOCDIFPHYADDR:
gif_delete_tunnel(ifp);
break;
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
#endif
if (sc->gif_family == 0) {
error = EADDRNOTAVAIL;
break;
}
GIF_RLOCK(sc);
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
if (sc->gif_family != AF_INET) {
error = EADDRNOTAVAIL;
break;
}
sin = (struct sockaddr_in *)&ifr->ifr_addr;
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
if (sc->gif_family != AF_INET6) {
error = EADDRNOTAVAIL;
break;
}
sin6 = (struct sockaddr_in6 *)
&(((struct in6_ifreq *)data)->ifr_addr);
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
break;
#endif
default:
error = EAFNOSUPPORT;
}
if (error == 0) {
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
sin->sin_addr = sc->gif_iphdr->ip_src;
break;
case SIOCGIFPDSTADDR:
sin->sin_addr = sc->gif_iphdr->ip_dst;
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
sin6->sin6_addr = sc->gif_ip6hdr->ip6_src;
break;
case SIOCGIFPDSTADDR_IN6:
sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst;
break;
#endif
}
}
GIF_RUNLOCK(sc);
if (error != 0)
break;
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
error = prison_if(curthread->td_ucred,
(struct sockaddr *)sin);
if (error != 0)
memset(sin, 0, sizeof(*sin));
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
error = prison_if(curthread->td_ucred,
(struct sockaddr *)sin6);
if (error == 0)
error = sa6_recoverscope(sin6);
if (error != 0)
memset(sin6, 0, sizeof(*sin6));
#endif
}
break;
case SIOCGTUNFIB:
ifr->ifr_fib = sc->gif_fibnum;
break;
case SIOCSTUNFIB:
if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
break;
if (ifr->ifr_fib >= rt_numfibs)
error = EINVAL;
else
sc->gif_fibnum = ifr->ifr_fib;
break;
case GIFGOPTS:
options = sc->gif_options;
error = copyout(&options, ifr->ifr_data, sizeof(options));
break;
case GIFSOPTS:
if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
break;
error = copyin(ifr->ifr_data, &options, sizeof(options));
if (error)
break;
if (options & ~GIF_OPTMASK)
error = EINVAL;
else
sc->gif_options = options;
break;
default:
error = EINVAL;
break;
}
bad:
sx_xunlock(&gif_ioctl_sx);
return (error);
}
static void
gif_detach(struct gif_softc *sc)
{
sx_assert(&gif_ioctl_sx, SA_XLOCKED);
if (sc->gif_ecookie != NULL)
encap_detach(sc->gif_ecookie);
sc->gif_ecookie = NULL;
}
static int
gif_attach(struct gif_softc *sc, int af)
{
sx_assert(&gif_ioctl_sx, SA_XLOCKED);
switch (af) {
#ifdef INET
case AF_INET:
return (in_gif_attach(sc));
#endif
#ifdef INET6
case AF_INET6:
return (in6_gif_attach(sc));
#endif
}
return (EAFNOSUPPORT);
}
static int
gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
{
struct gif_softc *sc = ifp->if_softc;
struct gif_softc *tsc;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
#endif
void *hdr;
int error = 0;
if (sc == NULL)
return (ENXIO);
/* Disallow parallel tunnels unless instructed otherwise. */
if (V_parallel_tunnels == 0) {
GIF_LIST_LOCK();
LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) {
if (tsc == sc || tsc->gif_family != src->sa_family)
continue;
#ifdef INET
if (tsc->gif_family == AF_INET &&
tsc->gif_iphdr->ip_src.s_addr ==
satosin(src)->sin_addr.s_addr &&
tsc->gif_iphdr->ip_dst.s_addr ==
satosin(dst)->sin_addr.s_addr) {
error = EADDRNOTAVAIL;
GIF_LIST_UNLOCK();
goto bad;
}
#endif
#ifdef INET6
if (tsc->gif_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src,
&satosin6(src)->sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst,
&satosin6(dst)->sin6_addr)) {
error = EADDRNOTAVAIL;
GIF_LIST_UNLOCK();
goto bad;
}
#endif
}
GIF_LIST_UNLOCK();
}
switch (src->sa_family) {
#ifdef INET
case AF_INET:
hdr = ip = malloc(sizeof(struct ip), M_GIF,
M_WAITOK | M_ZERO);
ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr;
ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr;
break;
#endif
#ifdef INET6
case AF_INET6:
hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF,
M_WAITOK | M_ZERO);
ip6->ip6_src = satosin6(src)->sin6_addr;
ip6->ip6_dst = satosin6(dst)->sin6_addr;
ip6->ip6_vfc = IPV6_VERSION;
break;
#endif
default:
return (EAFNOSUPPORT);
}
if (sc->gif_family != src->sa_family)
gif_detach(sc);
if (sc->gif_family == 0 ||
sc->gif_family != src->sa_family)
error = gif_attach(sc, src->sa_family);
GIF_WLOCK(sc);
if (sc->gif_family != 0)
free(sc->gif_hdr, M_GIF);
sc->gif_family = src->sa_family;
sc->gif_hdr = hdr;
GIF_WUNLOCK(sc);
#if defined(INET) || defined(INET6)
bad:
#endif
if (error == 0 && sc->gif_family != 0) {
ifp->if_drv_flags |= IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_UP);
} else {
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_DOWN);
}
return (error);
}
static void
gif_delete_tunnel(struct ifnet *ifp)
{
struct gif_softc *sc = ifp->if_softc;
int family;
if (sc == NULL)
return;
GIF_WLOCK(sc);
family = sc->gif_family;
sc->gif_family = 0;
GIF_WUNLOCK(sc);
if (family != 0) {
gif_detach(sc);
free(sc->gif_hdr, M_GIF);
}
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_DOWN);
}
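/*
* Example configuration (illustrative; the interface and addresses are
* hypothetical). The ifconfig "tunnel" keyword issues the SIOCSIFPHYADDR
* ioctl handled above:
*	# ifconfig gif0 create
*	# ifconfig gif0 tunnel 192.0.2.1 198.51.100.1
*	# ifconfig gif0 inet 10.1.1.1 10.1.1.2
*/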
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c (revision 327172)
+++ head/sys/netinet/tcp_output.c (revision 327173)
@@ -1,1868 +1,1872 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#ifdef TCP_RFC7413
#include <netinet/tcp_fastopen.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#ifdef TCPPCAP
#include <netinet/tcp_pcap.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netipsec/ipsec_support.h>
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
VNET_DEFINE(int, path_mtu_discovery) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(path_mtu_discovery), 1,
"Enable Path MTU Discovery");
VNET_DEFINE(int, tcp_do_tso) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_tso), 0,
"Enable TCP Segmentation Offload");
VNET_DEFINE(int, tcp_sendspace) = 1024*32;
#define V_tcp_sendspace VNET(tcp_sendspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_autosndbuf), 0,
"Enable automatic send buffer sizing");
VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_inc), 0,
"Incrementor step size of automatic send buffer");
VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_max), 0,
"Max size of automatic send buffer");
VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0;
#define V_tcp_sendbuf_auto_lowat VNET(tcp_sendbuf_auto_lowat)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sendbuf_auto_lowat), 0,
"Modify threshold for auto send buffer growth to account for SO_SNDLOWAT");
/*
* Make sure that either retransmit or persist timer is set for SYN, FIN and
* non-ACK.
*/
#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \
KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\
tcp_timer_active((tp), TT_REXMT) || \
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))
#ifdef TCP_HHOOK
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
static void inline cc_after_idle(struct tcpcb *tp);
#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
static void inline
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
struct tcp_hhook_data hhook_data;
if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
hhook_data.tp = tp;
hhook_data.th = th;
hhook_data.to = to;
hhook_data.len = len;
hhook_data.tso = tso;
hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
tp->osd);
}
}
#endif
/*
* CC wrapper hook functions
*/
static void inline
cc_after_idle(struct tcpcb *tp)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp->ccv);
}
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
int32_t len;
uint32_t recwin, sendwin;
int off, flags, error = 0; /* Keep compiler happy */
struct mbuf *m;
struct ip *ip = NULL;
+#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
+#endif
struct tcphdr *th;
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
unsigned ipsec_optlen = 0;
#endif
int idle, sendalot;
int sack_rxmit, sack_bytes_rxmt;
struct sackhole *p;
int tso, mtu;
struct tcpopt to;
#if 0
int maxburst = TCP_MAXBURST;
#endif
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
#endif
#ifdef TCP_RFC7413
/*
* For TFO connections in SYN_RECEIVED, only allow the initial
* SYN|ACK and those sent by the retransmit timer.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
(tp->snd_nxt != tp->snd_una)) /* not a retransmit */
return (0);
#endif
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
cc_after_idle(tp);
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
if (tp->t_flags & TF_MORETOCOME) {
tp->t_flags |= TF_LASTIDLE;
idle = 0;
}
}
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
tso = 0;
mtu = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in SACK recovery, reset the rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
uint32_t cwin;
cwin =
imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
goto after_sack_rexmit;
} else
/* Can rexmit part of the current hole */
len = ((int32_t)ulmin(cwin,
tp->snd_recover - p->rxmit));
} else
len = ((int32_t)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
KASSERT(off >= 0,("%s: sack block to the left of una : %d",
__func__, off));
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
TCPSTAT_INC(tcps_sack_rexmits);
TCPSTAT_ADD(tcps_sack_rexmit_bytes,
min(len, tp->t_maxseg));
}
}
after_sack_rexmit:
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
*/
if (tp->t_flags & TF_NEEDFIN)
flags |= TH_FIN;
if (tp->t_flags & TF_NEEDSYN)
flags |= TH_SYN;
SOCKBUF_LOCK(&so->so_snd);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_flags & TF_FORCEDATA) {
if (sendwin == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < sbused(&so->so_snd))
flags &= ~TH_FIN;
sendwin = 1;
} else {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
}
/*
* If snd_nxt == snd_max and we have transmitted a FIN, the
* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
* a negative length. This can also occur when TCP opens up
* its congestion window while receiving additional duplicate
* acks after fast-retransmit because TCP will reset snd_nxt
* to snd_max after the fast-retransmit.
*
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
*
* If sack_rxmit is true we are retransmitting from the scoreboard
* in which case len is already set.
*/
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
off);
else {
int32_t cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
off);
/*
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = imin(len, cwin);
}
}
}
/*
* Lop off SYN bit if it has already been sent. However, if this
* is SYN-SENT state and if segment contains data and if we don't
* know that foreign host supports TAO, suppress sending segment.
*/
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
if (tp->t_state != TCPS_SYN_RECEIVED)
flags &= ~TH_SYN;
#ifdef TCP_RFC7413
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED))
flags &= ~TH_SYN;
#endif
off--, len++;
}
/*
* Be careful not to send data and/or FIN on SYN segments.
* This measure is needed to prevent interoperability problems
* with not fully conformant TCP implementations.
*/
if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
len = 0;
flags &= ~TH_FIN;
}
#ifdef TCP_RFC7413
/*
* When retransmitting SYN|ACK on a passively-created TFO socket,
* don't include data, as the presence of data may have caused the
* original SYN|ACK to have been dropped by a middlebox.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
(flags & TH_RST)))
len = 0;
#endif
if (len <= 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be < 0. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*
* We also do a general check here to ensure that
* we will set the persist timer when we have data
* to send, but a 0-byte window. This makes sure
* the persist timer is set even if the packet
* hits one of the "goto send" lines below.
*/
len = 0;
if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
(off < (int) sbavail(&so->so_snd))) {
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (!tcp_timer_active(tp, TT_PERSIST))
tcp_setpersist(tp);
}
}
/* len will be >= 0 after this point. */
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
tcp_sndbuf_autoscale(tp, so, sendwin);
/*
* Decide if we can use TCP Segmentation Offloading (if supported by
* hardware).
*
* TSO may only be used if we are in a pure bulk sending state. The
* presence of TCP-MD5, SACK retransmits, SACK advertisements and
* IP options prevent using TSO. With TSO the TCP header is the same
* (except for the sequence number) for all generated packets. This
* makes it impossible to transmit any options which vary per generated
* segment or packet.
*
* IPv4 handling has a clear separation of ip options and ip header
* flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
* the right thing below to provide length of just ip options and thus
* checking for ipoptlen is enough to decide if ip options are present.
*/
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Pre-calculate here as we save another lookup into the darknesses
* of IPsec that way and can actually decide if TSO is ok.
*/
#ifdef INET6
if (isipv6 && IPSEC_ENABLED(ipv6))
ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
#ifdef INET
else
#endif
#endif /* INET6 */
#ifdef INET
if (IPSEC_ENABLED(ipv4))
ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
#endif /* INET */
#endif /* IPSEC */
#ifdef INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
else
#endif
if (tp->t_inpcb->inp_options)
ipoptlen = tp->t_inpcb->inp_options->m_len -
offsetof(struct ipoption, ipopt_list);
else
ipoptlen = 0;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
ipoptlen += ipsec_optlen;
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0)
tso = 1;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
flags &= ~TH_FIN;
} else {
if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
sbused(&so->so_snd)))
flags &= ~TH_FIN;
}
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
(long)TCP_MAXWIN << tp->rcv_scale);
/*
* Sender silly window avoidance. We transmit under the following
* conditions when len is non-zero:
*
* - We have a full segment (or more with TSO)
* - This is the last buffer in a write()/send() and we are
* either idle or running NODELAY
* - we've timed out (e.g. persist timer)
* - we have more than 1/2 the maximum send window's worth of
* data (the receiver may be limited by the window size)
* - we need to retransmit
*/
if (len) {
if (len >= tp->t_maxseg)
goto send;
/*
* NOTE! on localhost connections an 'ack' from the remote
* end may occur synchronously with the output and cause
* us to flush a buffer queued with moretocome. XXX
*
* note: the len + off check is almost certainly unnecessary.
*/
if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
(idle || (tp->t_flags & TF_NODELAY)) &&
(uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) &&
(tp->t_flags & TF_NOPUSH) == 0) {
goto send;
}
if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
goto send;
if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
if (sack_rxmit)
goto send;
}
/*
* Sending of standalone window updates.
*
* Window updates are important when we close our window due to a
* full socket buffer and are opening it again after the application
* reads data from it. Once the window has opened again and the
* remote end starts to send again the ACK clock takes over and
* provides the most current window information.
*
* We must avoid the silly window syndrome, whereby every read
* from the receive buffer, no matter how small, causes a window
* update to be sent. We should also avoid sending a flurry of
* window updates when the socket buffer has queued a lot of data
* and the application is doing small reads.
*
* Prevent a flurry of pointless window updates by only sending
* an update when we can increase the advertised window by more
* than 1/4th of the socket buffer capacity. When the buffer is
* getting full or is very small be more aggressive and send an
* update whenever we can increase by two mss sized segments.
* In all other situations the ACK's to new incoming data will
* carry further window increases.
*
* Don't send an independent window update if a delayed
* ACK is pending (it will get piggy-backed on it) or the
* remote side already has done a half-close and won't send
* more data. Skip this if the connection is in T/TCP
* half-open state.
*/
if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
!(tp->t_flags & TF_DELACK) &&
!TCPS_HAVERCVDFIN(tp->t_state)) {
/*
* "adv" is the amount we could increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
int32_t adv;
int oldwin;
adv = recwin;
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
oldwin = (tp->rcv_adv - tp->rcv_nxt);
adv -= oldwin;
} else
oldwin = 0;
/*
* If the new window size ends up being the same as or less
* than the old size when it is scaled, then don't force
* a window update.
*/
if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
goto dontupdate;
if (adv >= (int32_t)(2 * tp->t_maxseg) &&
(adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
recwin <= (so->so_rcv.sb_hiwat / 8) ||
so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
goto send;
if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat)
goto send;
}
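/*
* Worked example (illustrative numbers): with so_rcv.sb_hiwat = 65536 and
* t_maxseg = 1460, the test above sends an update once adv >= 2 * 1460 and
* either adv >= 65536 / 4 = 16384 or recwin <= 65536 / 8 = 8192;
* independently, 2 * adv >= 65536 also forces an update. Smaller increases
* ride on the ACKs for new incoming data.
*/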
dontupdate:
/*
* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
* is also a catch-all for the retransmit timer timeout case.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if ((flags & TH_RST) ||
((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, then we need to send.
*/
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_GT(tp->snd_max, tp->snd_una) &&
!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable; rather, a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tcp_timer_active(tp, TT_PERSIST)
* is true when we are in persist state.
* (tp->t_flags & TF_FORCEDATA)
* is set when we are called to send a persist packet.
* tcp_timer_active(tp, TT_REXMT)
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
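/*
* Concrete scenario (illustrative): the peer advertises a zero window while
* sbavail() is non-zero and no retransmit is pending, so tcp_setpersist()
* above arms TT_PERSIST. When the persist timer fires, tcp_output() is
* re-entered with TF_FORCEDATA set and the code near the top forces
* sendwin = 1, producing the 1-byte window probe.
*/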
/*
* No reason to send a segment, just return.
*/
just_return:
SOCKBUF_UNLOCK(&so->so_snd);
return (0);
send:
SOCKBUF_LOCK_ASSERT(&so->so_snd);
if (len > 0) {
if (len >= tp->t_maxseg)
tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
else
tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
}
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set not to do any options.
* NOTE: we assume that the IP/TCP header plus TCP options
* always fit in a single mbuf, leaving room for a maximum
* link header, i.e.
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
*/
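/*
* Size sketch (illustrative, assuming a 16-byte max_linkhdr): even a fully
* optioned IPv4 segment header fits comfortably in one mbuf cluster, since
* 16 + sizeof(struct tcpiphdr) (40) + TCP_MAXOLEN (40) = 96 <= MCLBYTES.
*/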
optlen = 0;
#ifdef INET6
if (isipv6)
hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
else
#endif
hdrlen = sizeof (struct tcpiphdr);
/*
* Compute options for segment.
* We only have to care about SYN and established connection
* segments. Options for SYN-ACK segments are handled in TCP
* syncache.
*/
to.to_flags = 0;
if ((tp->t_flags & TF_NOOPT) == 0) {
/* Maximum segment size. */
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
to.to_flags |= TOF_MSS;
#ifdef TCP_RFC7413
/*
* Only include the TFO option on the first
* transmission of the SYN|ACK on a
* passively-created TFO socket, as the presence of
* the TFO option may have caused the original
* SYN|ACK to have been dropped by a middlebox.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_rxtshift == 0)) {
to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
to.to_flags |= TOF_FASTOPEN;
}
#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
to.to_wscale = tp->request_r_scale;
to.to_flags |= TOF_SCALE;
}
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = tcp_ts_getticks();
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
to.to_flags |= TOF_SACKPERM;
else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) &&
tp->rcv_numsacks > 0) {
to.to_flags |= TOF_SACK;
to.to_nsacks = tp->rcv_numsacks;
to.to_sacks = (u_char *)tp->sackblks;
}
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
/* TCP-MD5 (RFC2385). */
/*
* Check that TCP_MD5SIG is enabled in tcpcb to
* account the size needed to set this TCP option.
*/
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
}
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxseg length.
* Clear the FIN bit because we cut off the tail of
* the segment.
*/
if (len + optlen + ipoptlen > tp->t_maxseg) {
flags &= ~TH_FIN;
if (tso) {
u_int if_hw_tsomax;
u_int if_hw_tsomaxsegcount;
u_int if_hw_tsomaxsegsize;
struct mbuf *mb;
u_int moff;
int max_len;
/* extract TSO information */
if_hw_tsomax = tp->t_tsomax;
if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
/*
* Limit a TSO burst to prevent it from
* overflowing or exceeding the maximum length
* allowed by the network interface:
*/
KASSERT(ipoptlen == 0,
("%s: TSO can't do IP options", __func__));
/*
* Check if we should limit by maximum payload
* length:
*/
if (if_hw_tsomax != 0) {
/* compute maximum TSO length */
max_len = (if_hw_tsomax - hdrlen -
max_linkhdr);
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
sendalot = 1;
len = max_len;
}
}
/*
* Check if we should limit by maximum segment
* size and count:
*/
if (if_hw_tsomaxsegcount != 0 &&
if_hw_tsomaxsegsize != 0) {
/*
* Subtract one segment for the LINK
* and TCP/IP headers mbuf that will
* be prepended to this mbuf chain
* after the code in this section
* limits the number of mbufs in the
* chain to if_hw_tsomaxsegcount.
*/
if_hw_tsomaxsegcount -= 1;
max_len = 0;
mb = sbsndmbuf(&so->so_snd, off, &moff);
while (mb != NULL && max_len < len) {
u_int mlen;
u_int frags;
/*
* Get length of mbuf fragment
* and how many hardware frags,
* rounded up, it would use:
*/
mlen = (mb->m_len - moff);
frags = howmany(mlen,
if_hw_tsomaxsegsize);
/* Handle special case: Zero Length Mbuf */
if (frags == 0)
frags = 1;
/*
* Check if the fragment limit
* will be reached or exceeded:
*/
if (frags >= if_hw_tsomaxsegcount) {
max_len += min(mlen,
if_hw_tsomaxsegcount *
if_hw_tsomaxsegsize);
break;
}
max_len += mlen;
if_hw_tsomaxsegcount -= frags;
moff = 0;
mb = mb->m_next;
}
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
sendalot = 1;
len = max_len;
}
}
/*
* Prevent the last segment from being
* fractional unless the send sockbuf can be
* emptied:
*/
max_len = (tp->t_maxseg - optlen);
if (((uint32_t)off + (uint32_t)len) <
sbavail(&so->so_snd)) {
moff = len % max_len;
if (moff != 0) {
len -= moff;
sendalot = 1;
}
}
/*
* In case there are too many small fragments
* don't use TSO:
*/
if (len <= max_len) {
len = max_len;
sendalot = 1;
tso = 0;
}
/*
* Send the FIN in a separate segment
* after the bulk sending is done.
* We don't trust the TSO implementations
* to clear the FIN flag on all but the
* last segment.
*/
if (tp->t_flags & TF_NEEDFIN)
sendalot = 1;
} else {
len = tp->t_maxseg - optlen - ipoptlen;
sendalot = 1;
}
} else
tso = 0;
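/*
* Worked example (illustrative): with t_maxseg = 1460 and optlen = 12
* (timestamps), max_len = 1448 above; a 100000-byte TSO burst that does not
* drain the socket buffer is trimmed by 100000 % 1448 = 88 bytes so that
* only the final pass may send a fractional segment.
*/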
KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
("%s: len > IP_MAXPACKET", __func__));
/*#ifdef DIAGNOSTIC*/
#ifdef INET6
if (max_linkhdr + hdrlen > MCLBYTES)
#else
if (max_linkhdr + hdrlen > MHLEN)
#endif
panic("tcphdr too big");
/*#endif*/
/*
* This KASSERT is here to catch edge cases at a well defined place.
* Before, those had triggered (random) panic conditions further down.
*/
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
struct mbuf *mb;
u_int moff;
if ((tp->t_flags & TF_FORCEDATA) && len == 1)
TCPSTAT_INC(tcps_sndprobe);
else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
tp->t_sndrexmitpack++;
TCPSTAT_INC(tcps_sndrexmitpack);
TCPSTAT_ADD(tcps_sndrexmitbyte, len);
} else {
TCPSTAT_INC(tcps_sndpack);
TCPSTAT_ADD(tcps_sndbyte, len);
}
#ifdef INET6
if (MHLEN < hdrlen + max_linkhdr)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
#endif
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* Start the m_copy functions from the closest mbuf
* to the offset in the socket buffer chain.
*/
mb = sbsndptr(&so->so_snd, off, len, &moff);
if (len <= MHLEN - hdrlen - max_linkhdr) {
m_copydata(mb, moff, len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
} else {
m->m_next = m_copym(mb, moff, len, M_NOWAIT);
if (m->m_next == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
(void) m_free(m);
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
}
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) &&
!(flags & TH_SYN))
flags |= TH_PUSH;
SOCKBUF_UNLOCK(&so->so_snd);
} else {
SOCKBUF_UNLOCK(&so->so_snd);
if (tp->t_flags & TF_ACKNOW)
TCPSTAT_INC(tcps_sndacks);
else if (flags & (TH_SYN|TH_FIN|TH_RST))
TCPSTAT_INC(tcps_sndctrl);
else if (SEQ_GT(tp->snd_up, tp->snd_una))
TCPSTAT_INC(tcps_sndurg);
else
TCPSTAT_INC(tcps_sndwinup);
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
#ifdef INET6
if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
MHLEN >= hdrlen) {
M_ALIGN(m, hdrlen);
} else
#endif
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_inpcb_create_mbuf(tp->t_inpcb, m);
#endif
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(tp->t_inpcb, ip6, th);
} else
#endif /* INET6 */
{
ip = mtod(m, struct ip *);
+#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
+#endif
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(tp->t_inpcb, ip, th);
}
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
* If resending a FIN, be sure not to use a new sequence number.
*/
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
flags |= TH_ECE|TH_CWR;
} else
flags |= TH_ECE|TH_CWR;
}
if (tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & TF_ECN_PERMIT)) {
/*
* If the peer has ECN, mark data packets with
* ECN capable transmission (ECT).
* Ignore pure ack packets, retransmissions and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!((tp->t_flags & TF_FORCEDATA) && len == 1)) {
#ifdef INET6
if (isipv6)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
else
#endif
ip->ip_tos |= IPTOS_ECN_ECT0;
TCPSTAT_INC(tcps_ecn_ect0);
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE)
flags |= TH_ECE;
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN)) ||
tcp_timer_active(tp, TT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tp->sackhint.sack_bytes_rexmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (recwin < (so->so_rcv.sb_hiwat / 4) &&
recwin < tp->t_maxseg)
recwin = 0;
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
recwin < (tp->rcv_adv - tp->rcv_nxt))
recwin = (tp->rcv_adv - tp->rcv_nxt);
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
* case is handled in syncache.
*/
if (flags & TH_SYN)
th->th_win = htons((u_short)
(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
else
th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
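/*
* Worked example (illustrative): with rcv_scale = 6, a receive window of
* recwin = 262144 bytes is advertised as 262144 >> 6 = 4096; a SYN, by
* contrast, always carries the unscaled value capped at TCP_MAXWIN (65535),
* as required by RFC 1323.
*/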
/*
* Adjust the RXWIN0SENT flag - indicate that we have advertised
* a 0 window. This may cause the remote transmitter to stall. This
* flag tells soreceive() to disable delayed acknowledgements when
* draining the buffer. This can occur if the receiver is attempting
* to read more data than can be buffered prior to transmitting on
* the connection.
*/
if (th->th_win == 0) {
tp->t_sndzerowin++;
tp->t_flags |= TF_RXWIN0SENT;
} else
tp->t_flags &= ~TF_RXWIN0SENT;
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
/*
* Put TCP length in extended header, and then
* checksum extended header and data.
*/
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (to.to_flags & TOF_SIGNATURE) {
/*
* Calculate MD5 signature and put it into the place
* determined before.
* NOTE: since TCP options buffer doesn't point into
* mbuf's data, calculate offset and use it.
*/
if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th,
(u_char *)(th + 1) + (to.to_signature - opt))) != 0) {
/*
* Do not send segment if the calculation of MD5
* digest has failed.
*/
m_freem(m);
goto out;
}
}
#endif
#ifdef INET6
if (isipv6) {
/*
* There is no need to fill in ip6_plen right now.
* It will be filled later by ip6_output.
*/
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
optlen + len, IPPROTO_TCP, 0);
}
#endif
#if defined(INET6) && defined(INET)
else
#endif
#ifdef INET
{
m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
}
#endif
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
*/
if (tso) {
KASSERT(len > tp->t_maxseg - optlen,
("%s: len <= tso_segsz", __func__));
m->m_pkthdr.csum_flags |= CSUM_TSO;
m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
}
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
__func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
#else
KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
__func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
#endif
#ifdef TCP_HHOOK
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
#ifdef TCPDEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG) {
u_short save = 0;
#ifdef INET6
if (!isipv6)
#endif
{
save = ipov->ih_len;
ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
}
tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
#ifdef INET6
if (!isipv6)
#endif
ipov->ih_len = save;
}
#endif /* TCPDEBUG */
TCP_PROBE3(debug__output, tp, th, m);
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
/*
* m->m_pkthdr.len should have been set before checksum calculation,
* because in6_cksum() needs it.
*/
#ifdef INET6
if (isipv6) {
/*
* we separately set hoplimit for every segment, since the
* user might want to change the value via setsockopt.
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/*
* Set the packet size here for the benefit of DTrace probes.
* ip6_output() will set it properly; it's supposed to include
* the option header lengths as well.
*/
ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
tp->t_flags2 |= TF2_PLPMTU_PMTUD;
else
tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
if (tp->t_state == TCPS_SYN_SENT)
TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
TCP_PROBE5(send, NULL, tp, ip6, tp, th);
#ifdef TCPPCAP
/* Save packet, if requested. */
tcp_pcap_add(th, m, &(tp->t_outpkts));
#endif
/* TODO: IPv6 IP6TOS_ECT bit on */
error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
&tp->t_inpcb->inp_route6,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, NULL, tp->t_inpcb);
if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL)
mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu;
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
{
ip->ip_len = htons(m->m_pkthdr.len);
#ifdef INET6
if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
* If we do path MTU discovery, then we set DF on every packet.
* This might not be the best thing to do according to RFC3390
* Section 2. However, the tcp hostcache mitigates the problem
* so it affects only the first tcp connection with a host.
*
* NB: Don't set DF on small MTU/MSS to have a safe fallback.
*/
if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
ip->ip_off |= htons(IP_DF);
tp->t_flags2 |= TF2_PLPMTU_PMTUD;
} else {
tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
}
if (tp->t_state == TCPS_SYN_SENT)
TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
TCP_PROBE5(send, NULL, tp, ip, tp, th);
#ifdef TCPPCAP
/* Save packet, if requested. */
tcp_pcap_add(th, m, &(tp->t_outpkts));
#endif
error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL)
mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu;
}
#endif /* INET */
out:
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
*/
if (flags & (TH_SYN|TH_FIN)) {
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
}
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = ticks;
tp->t_rtseq = startseq;
TCPSTAT_INC(tcps_segstimed);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing a pure ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (!tcp_timer_active(tp, TT_REXMT) &&
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
(tp->snd_nxt != tp->snd_una))) {
if (tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
} else if (len == 0 && sbavail(&so->so_snd) &&
!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
/*
* Avoid a situation where we do not set persist timer
* after a zero window condition. For example:
* 1) A -> B: packet with enough data to fill the window
* 2) B -> A: ACK for #1 + new data (0 window
* advertisement)
* 3) A -> B: ACK for #2, 0 len packet
*
* In this case, A will not activate the persist timer,
* because it chose to send a packet. Unless tcp_output
* is called for some other reason (delayed ack timer,
* another input packet from B, socket syscall), A will
* not send zero window probes.
*
* So, if you send a 0-length packet, but there is data
* in the socket buffer, and neither the rexmt nor the
* persist timer is already set, then activate the
* persist timer.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
} else {
/*
* Persist case, update snd_max but since we are in
* persist mode (no window) we do not update snd_nxt.
*/
int xlen = len;
if (flags & TH_SYN)
++xlen;
if (flags & TH_FIN) {
++xlen;
tp->t_flags |= TF_SENTFIN;
}
if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
tp->snd_max = tp->snd_nxt + xlen;
}
if (error) {
/*
* We know that the packet was lost, so back out the
* sequence number advance, if any.
*
* If the error is EPERM the packet got blocked by the
* local firewall. Normally we should terminate the
* connection but the blocking may have been spurious
* due to a firewall reconfiguration cycle. So we treat
* it like a packet loss and let the retransmit timer and
* timeouts do their work over time.
* XXX: It is a POLA question whether calling tcp_drop right
* away would be the really correct behavior instead.
*/
if (((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) &&
((flags & TH_SYN) == 0) &&
(error != EPERM)) {
if (sack_rxmit) {
p->rxmit -= len;
tp->sackhint.sack_bytes_rexmit -= len;
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
} else
tp->snd_nxt -= len;
}
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
switch (error) {
case EACCES:
tp->t_softerror = error;
return (0);
case EPERM:
tp->t_softerror = error;
return (error);
case ENOBUFS:
TCP_XMIT_TIMER_ASSERT(tp, len, flags);
tp->snd_cwnd = tp->t_maxseg;
return (0);
case EMSGSIZE:
/*
* For some reason the interface we used initially
* to send segments changed to another or lowered
* its MTU.
* If TSO was active we either got an interface
* without TSO capabilities or TSO was turned off.
* If we obtained mtu from ip_output() then update
* it and try again.
*/
if (tso)
tp->t_flags &= ~TF_TSO;
if (mtu != 0) {
tcp_mss_update(tp, -1, mtu, NULL, NULL);
goto again;
}
return (error);
case EHOSTDOWN:
case EHOSTUNREACH:
case ENETDOWN:
case ENETUNREACH:
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
return (0);
}
/* FALLTHROUGH */
default:
return (error);
}
}
TCPSTAT_INC(tcps_sndtotal);
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
if (tcp_timer_active(tp, TT_DELACK))
tcp_timer_activate(tp, TT_DELACK, 0);
#if 0
/*
* This completely breaks TCP if newreno is turned on. What happens
* is that if delayed-acks are enabled on the receiver, this code
* on the transmitter effectively destroys the TCP window, forcing
* it to four packets (1.5Kx4 = 6K window).
*/
if (sendalot && --maxburst)
goto again;
#endif
if (sendalot)
goto again;
return (0);
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
int tt;
tp->t_flags &= ~TF_PREVVALID;
if (tcp_timer_active(tp, TT_REXMT))
panic("tcp_setpersist: retransmit pending");
/*
* Start/restart persistence timer.
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
tcp_persmin, tcp_persmax);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
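A quick, non-authoritative illustration of the persist-timer arithmetic in tcp_setpersist() above: the base interval is derived from the smoothed RTT state, scaled by an exponential backoff table indexed by the shift counter, and clamped in the spirit of TCPT_RANGESET(). The clamp values and backoff table below are assumed example values, not the kernel's tunables.
#include <stdio.h>

#define PERSMIN	5			/* assumed lower clamp, in ticks */
#define PERSMAX	60			/* assumed upper clamp, in ticks */

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64 };

/* Mirror of the interval computation: base * backoff, clamped to a range. */
static int
persist_interval(int srtt, int rttvar, int rxtshift)
{
	int t = ((srtt >> 2) + rttvar) >> 1;	/* base interval from RTT state */
	int tt = t * backoff[rxtshift];		/* exponential backoff */

	if (tt < PERSMIN)			/* clamp, like TCPT_RANGESET() */
		tt = PERSMIN;
	else if (tt > PERSMAX)
		tt = PERSMAX;
	return (tt);
}

int
main(void)
{
	/* Show how the interval grows and then saturates as rxtshift rises. */
	for (int shift = 0; shift < 7; shift++)
		printf("rxtshift %d -> %d ticks\n", shift,
		    persist_interval(8, 2, shift));
	return (0);
}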
/*
* Insert TCP options according to the supplied parameters at the location
* optp in a consistent way. Can handle unaligned destinations.
*
* The order of the option processing is crucial for optimal packing and
* alignment for the scarce option space.
*
* The optimal order for a SYN/SYN-ACK segment is:
* MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
* Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
*
* The SACK options should be last. SACK blocks consume 8*n+2 bytes.
* So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
* At minimum we need 10 bytes (to generate 1 SACK block). If both
* TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
* we only have 10 bytes for SACK options (40 - (12 + 18)).
*/
int
tcp_addoptions(struct tcpopt *to, u_char *optp)
{
u_int32_t mask, optlen = 0;
for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
if ((to->to_flags & mask) != mask)
continue;
if (optlen == TCP_MAXOLEN)
break;
switch (to->to_flags & mask) {
case TOF_MSS:
while (optlen % 4) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
continue;
optlen += TCPOLEN_MAXSEG;
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
to->to_mss = htons(to->to_mss);
bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
optp += sizeof(to->to_mss);
break;
case TOF_SCALE:
while (!optlen || optlen % 2 != 1) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
continue;
optlen += TCPOLEN_WINDOW;
*optp++ = TCPOPT_WINDOW;
*optp++ = TCPOLEN_WINDOW;
*optp++ = to->to_wscale;
break;
case TOF_SACKPERM:
while (optlen % 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
continue;
optlen += TCPOLEN_SACK_PERMITTED;
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
break;
case TOF_TS:
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
continue;
optlen += TCPOLEN_TIMESTAMP;
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
to->to_tsval = htonl(to->to_tsval);
to->to_tsecr = htonl(to->to_tsecr);
bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
optp += sizeof(to->to_tsval);
bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
optp += sizeof(to->to_tsecr);
break;
case TOF_SIGNATURE:
{
int siglen = TCPOLEN_SIGNATURE - 2;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) {
to->to_flags &= ~TOF_SIGNATURE;
continue;
}
optlen += TCPOLEN_SIGNATURE;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
to->to_signature = optp;
while (siglen--)
*optp++ = 0;
break;
}
case TOF_SACK:
{
int sackblks = 0;
struct sackblk *sack = (struct sackblk *)to->to_sacks;
tcp_seq sack_seq;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
continue;
optlen += TCPOLEN_SACKHDR;
*optp++ = TCPOPT_SACK;
sackblks = min(to->to_nsacks,
(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
while (sackblks--) {
sack_seq = htonl(sack->start);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
sack_seq = htonl(sack->end);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
optlen += TCPOLEN_SACK;
sack++;
}
TCPSTAT_INC(tcps_sack_send_blocks);
break;
}
#ifdef TCP_RFC7413
case TOF_FASTOPEN:
{
int total_len;
/* XXX is there any point to aligning this option? */
total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
if (TCP_MAXOLEN - optlen < total_len)
continue;
*optp++ = TCPOPT_FAST_OPEN;
*optp++ = total_len;
if (to->to_tfo_len > 0) {
bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
optp += to->to_tfo_len;
}
optlen += total_len;
break;
}
#endif
default:
panic("%s: unknown TCP option type", __func__);
break;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
return (optlen);
}
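The option-space accounting in the comment above tcp_addoptions() can be checked with a few lines of arithmetic: with timestamps (padded to 12 bytes) and an 18-byte signature present, 10 of the 40 option bytes remain, which is enough for exactly one SACK block. A minimal sketch; the constants below are restated from that comment, not pulled from the headers.
#include <stdio.h>

#define OPT_MAX		40	/* maximum TCP option space */
#define OPT_TS_PADDED	12	/* timestamp option padded to a 4-byte boundary */
#define OPT_SIG		18	/* signature option, per the comment above */
#define SACK_HDR	2	/* SACK option kind + length */
#define SACK_BLOCK	8	/* one SACK block: two 32-bit sequence edges */

int
main(void)
{
	int used = OPT_TS_PADDED + OPT_SIG;
	int left = OPT_MAX - used;

	printf("%d bytes used, %d left, room for %d SACK block(s)\n",
	    used, left, (left - SACK_HDR) / SACK_BLOCK);
	return (0);
}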
void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{
/*
* Automatic sizing of send socket buffer. Often the send buffer
* size is not optimally adjusted to the actual network conditions
* at hand (delay bandwidth product). Setting the buffer size too
* small limits throughput on links with high bandwidth and high
* delay (e.g., trans-continental/oceanic links). Setting the
* buffer size too big consumes too much real kernel memory,
* especially with many connections on busy servers.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However, testing has shown this not
* to be much of a problem. At worst we are trading unused
* available bandwidth for some wasted socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer. Has to
* wait for upcoming tcp timer rewrite.
*
* XXXGL: should there be used sbused() or sbavail()?
*/
if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
int lowat;
lowat = V_tcp_sendbuf_auto_lowat ? so->so_snd.sb_lowat : 0;
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat &&
sbused(&so->so_snd) >=
(so->so_snd.sb_hiwat / 8 * 7) - lowat &&
sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
sendwin >= (sbused(&so->so_snd) -
(tp->snd_nxt - tp->snd_una))) {
if (!sbreserve_locked(&so->so_snd,
min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
V_tcp_autosndbuf_max), so, curthread))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
}
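As a rough restatement of the four step-up criteria listed in the comment above, the decision reduces to a single predicate. The function and values below are illustrative stand-ins for the socket-buffer fields (the low-water adjustment is omitted for brevity), not the kernel structures.
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical restatement of the step-up test; parameter names are made up. */
static bool
should_grow_sndbuf(unsigned peer_rwnd, unsigned hiwat, unsigned used,
    unsigned automax, unsigned sendwin, unsigned unacked)
{
	return (peer_rwnd / 4 * 5 >= hiwat &&	/* 1: peer window beats buffer (5/4 fudge) */
	    used >= hiwat / 8 * 7 &&		/* 2: buffer is at least 7/8 full */
	    used < automax &&			/* 3: below the automatic maximum */
	    sendwin >= used - unacked);		/* 4: send window covers queued data */
}

int
main(void)
{
	printf("grow: %d\n", should_grow_sndbuf(128 * 1024, 64 * 1024,
	    60 * 1024, 2 * 1024 * 1024, 80 * 1024, 10 * 1024));
	return (0);
}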
Index: head/sys/netinet6/in6_mcast.c
===================================================================
--- head/sys/netinet6/in6_mcast.c (revision 327172)
+++ head/sys/netinet6/in6_mcast.c (revision 327173)
@@ -1,2836 +1,2833 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009 Bruce Simpson.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* IPv6 multicast socket, group, and socket option processing module.
* Normative references: RFC 2292, RFC 3493, RFC 3542, RFC 3678, RFC 3810.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/priv.h>
#include <sys/ktr.h>
#include <sys/tree.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/scope6_var.h>
#ifndef KTR_MLD
#define KTR_MLD KTR_INET6
#endif
#ifndef __SOCKUNION_DECLARED
union sockunion {
struct sockaddr_storage ss;
struct sockaddr sa;
struct sockaddr_dl sdl;
struct sockaddr_in6 sin6;
};
typedef union sockunion sockunion_t;
#define __SOCKUNION_DECLARED
#endif /* __SOCKUNION_DECLARED */
static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
"IPv6 multicast PCB-layer source filter");
static MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
"IPv6 multicast MLD-layer source filter");
RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
/*
* Locking:
* - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
* - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
* it can be taken by code in net/if.c also.
* - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
*
* struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
* any need for in6_multi itself to be virtualized -- it is bound to an ifp
* anyway no matter what happens.
*/
struct mtx in6_multi_mtx;
MTX_SYSINIT(in6_multi_mtx, &in6_multi_mtx, "in6_multi_mtx", MTX_DEF);
static void im6f_commit(struct in6_mfilter *);
static int im6f_get_source(struct in6_mfilter *imf,
const struct sockaddr_in6 *psin,
struct in6_msource **);
static struct in6_msource *
im6f_graft(struct in6_mfilter *, const uint8_t,
const struct sockaddr_in6 *);
static void im6f_leave(struct in6_mfilter *);
static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
static void im6f_purge(struct in6_mfilter *);
static void im6f_rollback(struct in6_mfilter *);
static void im6f_reap(struct in6_mfilter *);
static int im6o_grow(struct ip6_moptions *);
static size_t im6o_match_group(const struct ip6_moptions *,
const struct ifnet *, const struct sockaddr *);
static struct in6_msource *
im6o_match_source(const struct ip6_moptions *, const size_t,
const struct sockaddr *);
static void im6s_merge(struct ip6_msource *ims,
const struct in6_msource *lims, const int rollback);
static int in6_mc_get(struct ifnet *, const struct in6_addr *,
struct in6_multi **);
static int in6m_get_source(struct in6_multi *inm,
const struct in6_addr *addr, const int noalloc,
struct ip6_msource **pims);
#ifdef KTR
static int in6m_is_ifp_detached(const struct in6_multi *);
#endif
static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
static void in6m_purge(struct in6_multi *);
static void in6m_reap(struct in6_multi *);
static struct ip6_moptions *
in6p_findmoptions(struct inpcb *);
static int in6p_get_source_filters(struct inpcb *, struct sockopt *);
static int in6p_join_group(struct inpcb *, struct sockopt *);
static int in6p_leave_group(struct inpcb *, struct sockopt *);
static struct ifnet *
in6p_lookup_mcast_ifp(const struct inpcb *,
const struct sockaddr_in6 *);
static int in6p_block_unblock_source(struct inpcb *, struct sockopt *);
static int in6p_set_multicast_if(struct inpcb *, struct sockopt *);
static int in6p_set_source_filters(struct inpcb *, struct sockopt *);
static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */
static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0,
"IPv6 multicast");
static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
"Max source filters per group");
static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
"Max source filters per socket");
/* TODO Virtualize this switch. */
int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
&in6_mcast_loop, 0, "Loopback multicast datagrams by default");
static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
"Per-interface stack-wide source filters");
#ifdef KTR
/*
* Inline function which wraps assertions for a valid ifp.
* The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
* is detached.
*/
static int __inline
in6m_is_ifp_detached(const struct in6_multi *inm)
{
struct ifnet *ifp;
KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
ifp = inm->in6m_ifma->ifma_ifp;
if (ifp != NULL) {
/*
* Sanity check that network-layer notion of ifp is the
* same as that of link-layer.
*/
KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
}
return (ifp == NULL);
}
#endif
/*
* Initialize an in6_mfilter structure to a known state at t0, t1
* with an empty source filter list.
*/
static __inline void
im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
{
memset(imf, 0, sizeof(struct in6_mfilter));
RB_INIT(&imf->im6f_sources);
imf->im6f_st[0] = st0;
imf->im6f_st[1] = st1;
}
/*
* Resize the ip6_moptions vector to the next power-of-two minus 1.
* May be called with locks held; do not sleep.
*/
static int
im6o_grow(struct ip6_moptions *imo)
{
struct in6_multi **nmships;
struct in6_multi **omships;
struct in6_mfilter *nmfilters;
struct in6_mfilter *omfilters;
size_t idx;
size_t newmax;
size_t oldmax;
nmships = NULL;
nmfilters = NULL;
omships = imo->im6o_membership;
omfilters = imo->im6o_mfilters;
oldmax = imo->im6o_max_memberships;
newmax = ((oldmax + 1) * 2) - 1;
if (newmax <= IPV6_MAX_MEMBERSHIPS) {
nmships = (struct in6_multi **)realloc(omships,
sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT);
nmfilters = (struct in6_mfilter *)realloc(omfilters,
sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER,
M_NOWAIT);
if (nmships != NULL && nmfilters != NULL) {
/* Initialize newly allocated source filter heads. */
for (idx = oldmax; idx < newmax; idx++) {
im6f_init(&nmfilters[idx], MCAST_UNDEFINED,
MCAST_EXCLUDE);
}
imo->im6o_max_memberships = newmax;
imo->im6o_membership = nmships;
imo->im6o_mfilters = nmfilters;
}
}
if (nmships == NULL || nmfilters == NULL) {
if (nmships != NULL)
free(nmships, M_IP6MOPTS);
if (nmfilters != NULL)
free(nmfilters, M_IN6MFILTER);
return (ETOOMANYREFS);
}
return (0);
}
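A small sketch of the growth rule used by im6o_grow() above, newmax = ((oldmax + 1) * 2) - 1, which walks through sizes of the form 2^n - 1. The starting size and upper bound below are assumed example values rather than the kernel constants.
#include <stdio.h>

#define START_MEMBERSHIPS	31	/* assumed initial vector size */
#define LIMIT_MEMBERSHIPS	4095	/* assumed upper bound */

int
main(void)
{
	size_t max = START_MEMBERSHIPS;

	/* Same growth rule as im6o_grow(). */
	while (((max + 1) * 2) - 1 <= LIMIT_MEMBERSHIPS) {
		max = ((max + 1) * 2) - 1;
		printf("grew to %zu memberships\n", max);
	}
	return (0);
}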
/*
* Find an IPv6 multicast group entry for this ip6_moptions instance
* which matches the specified group, and optionally an interface.
* Return its index into the array, or -1 if not found.
*/
static size_t
im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
const struct sockaddr *group)
{
const struct sockaddr_in6 *gsin6;
struct in6_multi **pinm;
int idx;
int nmships;
gsin6 = (const struct sockaddr_in6 *)group;
/* The im6o_membership array may be lazy allocated. */
if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0)
return (-1);
nmships = imo->im6o_num_memberships;
pinm = &imo->im6o_membership[0];
for (idx = 0; idx < nmships; idx++, pinm++) {
if (*pinm == NULL)
continue;
if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) &&
IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr,
&gsin6->sin6_addr)) {
break;
}
}
if (idx >= nmships)
idx = -1;
return (idx);
}
/*
* Find an IPv6 multicast source entry for this imo which matches
* the given group index for this socket, and source address.
*
* XXX TODO: The scope ID, if present in src, is stripped before
* any comparison. We SHOULD enforce scope/zone checks where the source
* filter entry has a link scope.
*
* NOTE: This does not check if the entry is in-mode, merely if
* it exists, which may not be the desired behaviour.
*/
static struct in6_msource *
im6o_match_source(const struct ip6_moptions *imo, const size_t gidx,
const struct sockaddr *src)
{
struct ip6_msource find;
struct in6_mfilter *imf;
struct ip6_msource *ims;
const sockunion_t *psa;
KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships,
("%s: invalid index %d\n", __func__, (int)gidx));
/* The im6o_mfilters array may be lazy allocated. */
if (imo->im6o_mfilters == NULL)
return (NULL);
imf = &imo->im6o_mfilters[gidx];
psa = (const sockunion_t *)src;
find.im6s_addr = psa->sin6.sin6_addr;
in6_clearscope(&find.im6s_addr); /* XXX */
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
return ((struct in6_msource *)ims);
}
/*
* Perform filtering for multicast datagrams on a socket by group and source.
*
* Returns 0 if a datagram should be allowed through, or various error codes
* if the socket was not a member of the group, or the source was muted, etc.
*/
int
im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
const struct sockaddr *group, const struct sockaddr *src)
{
size_t gidx;
struct in6_msource *ims;
int mode;
KASSERT(ifp != NULL, ("%s: null ifp", __func__));
gidx = im6o_match_group(imo, ifp, group);
if (gidx == -1)
return (MCAST_NOTGMEMBER);
/*
* Check if the source was included in an (S,G) join.
* Allow reception on exclusive memberships by default,
* reject reception on inclusive memberships by default.
* Exclude source only if an in-mode exclude filter exists.
* Include source only if an in-mode include filter exists.
* NOTE: We are comparing group state here at MLD t1 (now)
* with socket-layer t0 (since last downcall).
*/
mode = imo->im6o_mfilters[gidx].im6f_st[1];
ims = im6o_match_source(imo, gidx, src);
if ((ims == NULL && mode == MCAST_INCLUDE) ||
(ims != NULL && ims->im6sl_st[0] != mode))
return (MCAST_NOTSMEMBER);
return (MCAST_PASS);
}
/*
* Find and return a reference to an in6_multi record for (ifp, group),
* and bump its reference count.
* If one does not exist, try to allocate it, and update link-layer multicast
* filters on ifp to listen for group.
* Assumes the IN6_MULTI lock is held across the call.
* Return 0 if successful, otherwise return an appropriate error code.
*/
static int
in6_mc_get(struct ifnet *ifp, const struct in6_addr *group,
struct in6_multi **pinm)
{
struct sockaddr_in6 gsin6;
struct ifmultiaddr *ifma;
struct in6_multi *inm;
int error;
error = 0;
/*
* XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
* if_addmulti() takes this mutex itself, so we must drop and
* re-acquire around the call.
*/
IN6_MULTI_LOCK_ASSERT();
IF_ADDR_WLOCK(ifp);
inm = in6m_lookup_locked(ifp, group);
if (inm != NULL) {
/*
* If we already joined this group, just bump the
* refcount and return it.
*/
KASSERT(inm->in6m_refcount >= 1,
("%s: bad refcount %d", __func__, inm->in6m_refcount));
++inm->in6m_refcount;
*pinm = inm;
goto out_locked;
}
memset(&gsin6, 0, sizeof(gsin6));
gsin6.sin6_family = AF_INET6;
gsin6.sin6_len = sizeof(struct sockaddr_in6);
gsin6.sin6_addr = *group;
/*
* Check if a link-layer group is already associated
* with this network-layer group on the given ifnet.
*/
IF_ADDR_WUNLOCK(ifp);
error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
if (error != 0)
return (error);
IF_ADDR_WLOCK(ifp);
/*
* If something other than netinet6 is occupying the link-layer
* group, print a meaningful error message and back out of
* the allocation.
* Otherwise, bump the refcount on the existing network-layer
* group association and return it.
*/
if (ifma->ifma_protospec != NULL) {
inm = (struct in6_multi *)ifma->ifma_protospec;
#ifdef INVARIANTS
KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
__func__));
KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
("%s: ifma not AF_INET6", __func__));
KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
panic("%s: ifma %p is inconsistent with %p (%p)",
__func__, ifma, inm, group);
#endif
++inm->in6m_refcount;
*pinm = inm;
goto out_locked;
}
IF_ADDR_WLOCK_ASSERT(ifp);
/*
* A new in6_multi record is needed; allocate and initialize it.
* We DO NOT perform an MLD join as the in6_ layer may need to
* push an initial source list down to MLD to support SSM.
*
* The initial source filter state is INCLUDE, {} as per the RFC.
* Pending state-changes per group are subject to a bounds check.
*/
inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
if (inm == NULL) {
IF_ADDR_WUNLOCK(ifp);
if_delmulti_ifma(ifma);
return (ENOMEM);
}
inm->in6m_addr = *group;
inm->in6m_ifp = ifp;
inm->in6m_mli = MLD_IFINFO(ifp);
inm->in6m_ifma = ifma;
inm->in6m_refcount = 1;
inm->in6m_state = MLD_NOT_MEMBER;
mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
RB_INIT(&inm->in6m_srcs);
ifma->ifma_protospec = inm;
*pinm = inm;
out_locked:
IF_ADDR_WUNLOCK(ifp);
return (error);
}
/*
* Drop a reference to an in6_multi record.
*
* If the refcount drops to 0, free the in6_multi record and
* delete the underlying link-layer membership.
*/
void
in6m_release_locked(struct in6_multi *inm)
{
struct ifmultiaddr *ifma;
IN6_MULTI_LOCK_ASSERT();
CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
if (--inm->in6m_refcount > 0) {
CTR2(KTR_MLD, "%s: refcount is now %d", __func__,
inm->in6m_refcount);
return;
}
CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
ifma = inm->in6m_ifma;
/* XXX this access is not covered by IF_ADDR_LOCK */
CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
KASSERT(ifma->ifma_protospec == inm,
("%s: ifma_protospec != inm", __func__));
ifma->ifma_protospec = NULL;
in6m_purge(inm);
free(inm, M_IP6MADDR);
if_delmulti_ifma(ifma);
}
/*
* Clear recorded source entries for a group.
* Used by the MLD code. Caller must hold the IN6_MULTI lock.
* FIXME: Should reap.
*/
void
in6m_clear_recorded(struct in6_multi *inm)
{
struct ip6_msource *ims;
IN6_MULTI_LOCK_ASSERT();
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
if (ims->im6s_stp) {
ims->im6s_stp = 0;
--inm->in6m_st[1].iss_rec;
}
}
KASSERT(inm->in6m_st[1].iss_rec == 0,
("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
}
/*
* Record a source as pending for a Source-Group MLDv2 query.
* This lives here as it modifies the shared tree.
*
* inm is the group descriptor.
* naddr is the address of the source to record in network-byte order.
*
* If the net.inet6.mld.sgalloc sysctl is non-zero, we will
* lazy-allocate a source node in response to an SG query.
* Otherwise, no allocation is performed. This saves some memory
* with the trade-off that the source will not be reported to the
* router if joined in the window between the query response and
* the group actually being joined on the local host.
*
* VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
* This turns off the allocation of a recorded source entry if
* the group has not been joined.
*
* Return 0 if the source didn't exist or was already marked as recorded.
* Return 1 if the source was marked as recorded by this function.
* Return <0 if any error occurred (negated errno code).
*/
int
in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
IN6_MULTI_LOCK_ASSERT();
find.im6s_addr = *addr;
ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
if (ims && ims->im6s_stp)
return (0);
if (ims == NULL) {
if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
return (-ENOSPC);
nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (-ENOMEM);
nims->im6s_addr = find.im6s_addr;
RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
++inm->in6m_nsrc;
ims = nims;
}
/*
* Mark the source as recorded and update the recorded
* source count.
*/
++ims->im6s_stp;
++inm->in6m_st[1].iss_rec;
return (1);
}
/*
* Return a pointer to an in6_msource owned by an in6_mfilter,
* given its source address.
* Lazy-allocate if needed. If this is a new entry its filter state is
* undefined at t0.
*
* imf is the filter set being modified.
* addr is the source address.
*
* SMPng: May be called with locks held; malloc must not block.
*/
static int
im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
struct in6_msource **plims)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
struct in6_msource *lims;
int error;
error = 0;
ims = NULL;
lims = NULL;
find.im6s_addr = psin->sin6_addr;
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
lims = (struct in6_msource *)ims;
if (lims == NULL) {
if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
return (ENOSPC);
nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (ENOMEM);
lims = (struct in6_msource *)nims;
lims->im6s_addr = find.im6s_addr;
lims->im6sl_st[0] = MCAST_UNDEFINED;
RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
++imf->im6f_nsrc;
}
*plims = lims;
return (error);
}
/*
* Graft a source entry into an existing socket-layer filter set,
* maintaining any required invariants and checking allocations.
*
* The source is marked as being in the new filter mode at t1.
*
* Return the pointer to the new node, otherwise return NULL.
*/
static struct in6_msource *
im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
const struct sockaddr_in6 *psin)
{
struct ip6_msource *nims;
struct in6_msource *lims;
nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (NULL);
lims = (struct in6_msource *)nims;
lims->im6s_addr = psin->sin6_addr;
lims->im6sl_st[0] = MCAST_UNDEFINED;
lims->im6sl_st[1] = st1;
RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
++imf->im6f_nsrc;
return (lims);
}
/*
* Prune a source entry from an existing socket-layer filter set,
* maintaining any required invariants and checking allocations.
*
* The source is marked as being left at t1, it is not freed.
*
* Return 0 if no error occurred, otherwise return an errno value.
*/
static int
im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
{
struct ip6_msource find;
struct ip6_msource *ims;
struct in6_msource *lims;
find.im6s_addr = psin->sin6_addr;
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
if (ims == NULL)
return (ENOENT);
lims = (struct in6_msource *)ims;
lims->im6sl_st[1] = MCAST_UNDEFINED;
return (0);
}
/*
* Revert socket-layer filter set deltas at t1 to t0 state.
*/
static void
im6f_rollback(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
struct in6_msource *lims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
/* no change at t1 */
continue;
} else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
/* revert change to existing source at t1 */
lims->im6sl_st[1] = lims->im6sl_st[0];
} else {
/* revert source added t1 */
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
}
imf->im6f_st[1] = imf->im6f_st[0];
}
/*
* Mark socket-layer filter set as INCLUDE {} at t1.
*/
static void
im6f_leave(struct in6_mfilter *imf)
{
struct ip6_msource *ims;
struct in6_msource *lims;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
lims->im6sl_st[1] = MCAST_UNDEFINED;
}
imf->im6f_st[1] = MCAST_INCLUDE;
}
/*
* Mark socket-layer filter set deltas as committed.
*/
static void
im6f_commit(struct in6_mfilter *imf)
{
struct ip6_msource *ims;
struct in6_msource *lims;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
lims->im6sl_st[0] = lims->im6sl_st[1];
}
imf->im6f_st[0] = imf->im6f_st[1];
}
/*
* Reap unreferenced sources from socket-layer filter set.
*/
static void
im6f_reap(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
struct in6_msource *lims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
lims = (struct in6_msource *)ims;
if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
(lims->im6sl_st[1] == MCAST_UNDEFINED)) {
CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
}
}
/*
* Purge socket-layer filter set.
*/
static void
im6f_purge(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
KASSERT(RB_EMPTY(&imf->im6f_sources),
("%s: im6f_sources not empty", __func__));
}
/*
* Look up a source filter entry for a multicast group.
*
* inm is the group descriptor to work with.
* addr is the IPv6 address to look up.
* noalloc may be non-zero to suppress allocation of sources.
* *pims will be set to the address of the retrieved or allocated source.
*
* SMPng: NOTE: may be called with locks held.
* Return 0 if successful, otherwise return a non-zero error code.
*/
static int
in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
const int noalloc, struct ip6_msource **pims)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
find.im6s_addr = *addr;
ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
if (ims == NULL && !noalloc) {
if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
return (ENOSPC);
nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (ENOMEM);
nims->im6s_addr = *addr;
RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
++inm->in6m_nsrc;
ims = nims;
CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
ip6_sprintf(ip6tbuf, addr), ims);
}
*pims = ims;
return (0);
}
/*
* Merge socket-layer source into MLD-layer source.
* If rollback is non-zero, perform the inverse of the merge.
*/
static void
im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
const int rollback)
{
int n = rollback ? -1 : 1;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
ip6_sprintf(ip6tbuf, &lims->im6s_addr);
#endif
if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].ex -= n;
} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].in -= n;
}
if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].ex += n;
} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].in += n;
}
}
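The "n = rollback ? -1 : 1" trick in im6s_merge() above means a rollback is literally the inverse of the original merge. The standalone sketch below demonstrates that applying the delta and then its inverse restores the per-source counters; the struct and enum are illustrations, not the kernel's ip6_msource or filter-mode constants.
#include <stdio.h>

enum mode { UNDEFINED, INCLUDE, EXCLUDE };

struct counters { int in, ex; };

static void
merge(struct counters *c, enum mode st0, enum mode st1, int rollback)
{
	int n = rollback ? -1 : 1;

	if (st0 == EXCLUDE)		/* drop the old (t0) contribution */
		c->ex -= n;
	else if (st0 == INCLUDE)
		c->in -= n;
	if (st1 == EXCLUDE)		/* add the new (t1) contribution */
		c->ex += n;
	else if (st1 == INCLUDE)
		c->in += n;
}

int
main(void)
{
	struct counters c = { .in = 3, .ex = 1 };

	merge(&c, INCLUDE, EXCLUDE, 0);	/* a source moves include -> exclude */
	printf("after merge:    in=%d ex=%d\n", c.in, c.ex);
	merge(&c, INCLUDE, EXCLUDE, 1);	/* inverse merge undoes it */
	printf("after rollback: in=%d ex=%d\n", c.in, c.ex);
	return (0);
}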
/*
* Atomically update the global in6_multi state, when a membership's
* filter list is being updated in any way.
*
* imf is the per-inpcb-membership group filter pointer.
* A fake imf may be passed for in-kernel consumers.
*
* XXX This is a candidate for a set-symmetric-difference style loop
* which would eliminate the repeated lookup from root of ims nodes,
* as they share the same key space.
*
* If any error occurred this function will back out of refcounts
* and return a non-zero value.
*/
static int
in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
struct ip6_msource *ims, *nims;
struct in6_msource *lims;
int schanged, error;
int nsrc0, nsrc1;
schanged = 0;
error = 0;
nsrc1 = nsrc0 = 0;
/*
* Update the source filters first, as this may fail.
* Maintain count of in-mode filters at t0, t1. These are
* used to work out if we transition into ASM mode or not.
* Maintain a count of source filters whose state was
* actually modified by this operation.
*/
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
++schanged;
if (error)
break;
im6s_merge(nims, lims, 0);
}
if (error) {
struct ip6_msource *bims;
RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == lims->im6sl_st[1])
continue;
(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
if (bims == NULL)
continue;
im6s_merge(bims, lims, 1);
}
goto out_reap;
}
CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
__func__, nsrc0, nsrc1);
/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
if (imf->im6f_st[0] == imf->im6f_st[1] &&
imf->im6f_st[1] == MCAST_INCLUDE) {
if (nsrc1 == 0) {
CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
--inm->in6m_st[1].iss_in;
}
}
/* Handle filter mode transition on socket. */
if (imf->im6f_st[0] != imf->im6f_st[1]) {
CTR3(KTR_MLD, "%s: imf transition %d to %d",
__func__, imf->im6f_st[0], imf->im6f_st[1]);
if (imf->im6f_st[0] == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
--inm->in6m_st[1].iss_ex;
} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
--inm->in6m_st[1].iss_in;
}
if (imf->im6f_st[1] == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
inm->in6m_st[1].iss_ex++;
} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
inm->in6m_st[1].iss_in++;
}
}
/*
* Track inm filter state in terms of listener counts.
* If there are any exclusive listeners, stack-wide
* membership is exclusive.
* Otherwise, if only inclusive listeners, stack-wide is inclusive.
* If no listeners remain, state is undefined at t1,
* and the MLD lifecycle for this group should finish.
*/
if (inm->in6m_st[1].iss_ex > 0) {
CTR1(KTR_MLD, "%s: transition to EX", __func__);
inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
} else if (inm->in6m_st[1].iss_in > 0) {
CTR1(KTR_MLD, "%s: transition to IN", __func__);
inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
} else {
CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
}
/* Decrement ASM listener count on transition out of ASM mode. */
if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
(imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
--inm->in6m_st[1].iss_asm;
}
}
/* Increment ASM listener count on transition to ASM mode. */
if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
inm->in6m_st[1].iss_asm++;
}
CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
in6m_print(inm);
out_reap:
if (schanged > 0) {
CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
in6m_reap(inm);
}
return (error);
}
/*
* Mark an in6_multi's filter set deltas as committed.
* Called by MLD after a state change has been enqueued.
*/
void
in6m_commit(struct in6_multi *inm)
{
struct ip6_msource *ims;
CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
CTR1(KTR_MLD, "%s: pre commit:", __func__);
in6m_print(inm);
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
ims->im6s_st[0] = ims->im6s_st[1];
}
inm->in6m_st[0] = inm->in6m_st[1];
}
/*
* Reap unreferenced nodes from an in6_multi's filter set.
*/
static void
in6m_reap(struct in6_multi *inm)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
ims->im6s_stp != 0)
continue;
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
free(ims, M_IP6MSOURCE);
inm->in6m_nsrc--;
}
}
/*
* Purge all source nodes from an in6_multi's filter set.
*/
static void
in6m_purge(struct in6_multi *inm)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
free(ims, M_IP6MSOURCE);
inm->in6m_nsrc--;
}
/* Free state-change requests that might be queued. */
mbufq_drain(&inm->in6m_scq);
}
/*
* Join a multicast address w/o sources.
* KAME compatibility entry point.
*
* SMPng: Assume no mc locks held by caller.
*/
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr,
int *errorp, int delay)
{
struct in6_multi_mship *imm;
int error;
imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT);
if (imm == NULL) {
*errorp = ENOBUFS;
return (NULL);
}
delay = (delay * PR_FASTHZ) / hz;
error = in6_mc_join(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay);
if (error) {
*errorp = error;
free(imm, M_IP6MADDR);
return (NULL);
}
return (imm);
}
/*
* Leave a multicast address w/o sources.
* KAME compatibility entry point.
*
* SMPng: Assume no mc locks held by caller.
*/
int
in6_leavegroup(struct in6_multi_mship *imm)
{
if (imm->i6mm_maddr != NULL)
in6_mc_leave(imm->i6mm_maddr, NULL);
free(imm, M_IP6MADDR);
return 0;
}
/*
* Join a multicast group; unlocked entry point.
*
* SMPng: XXX: in6_mc_join() is called from in6_control() when upper
* locks are not held. Fortunately, ifp is unlikely to have been detached
* at this point, so we assume it's OK to recurse.
*/
int
in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr,
/*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
const int delay)
{
int error;
IN6_MULTI_LOCK();
error = in6_mc_join_locked(ifp, mcaddr, imf, pinm, delay);
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Join a multicast group; real entry point.
*
* Only preserves atomicity at inm level.
* NOTE: imf argument cannot be const due to sys/tree.h limitations.
*
* If the MLD downcall fails, the group is not joined, and an error
* code is returned.
*/
int
in6_mc_join_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
/*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
const int delay)
{
struct in6_mfilter timf;
struct in6_multi *inm;
int error;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
#ifdef INVARIANTS
/*
* Sanity: Check scope zone ID was set for ifp, if and
* only if group is scoped to an interface.
*/
KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
("%s: not a multicast address", __func__));
if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
KASSERT(mcaddr->s6_addr16[1] != 0,
("%s: scope zone ID not set", __func__));
}
#endif
IN6_MULTI_LOCK_ASSERT();
CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
error = 0;
inm = NULL;
/*
* If no imf was specified (i.e. kernel consumer),
* fake one up and assume it is an ASM join.
*/
if (imf == NULL) {
im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
imf = &timf;
}
error = in6_mc_get(ifp, mcaddr, &inm);
if (error) {
CTR1(KTR_MLD, "%s: in6_mc_get() failure", __func__);
return (error);
}
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error) {
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
goto out_in6m_release;
}
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, delay);
if (error) {
CTR1(KTR_MLD, "%s: failed to update source", __func__);
goto out_in6m_release;
}
out_in6m_release:
if (error) {
CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
in6m_release_locked(inm);
} else {
*pinm = inm;
}
return (error);
}
/*
* Leave a multicast group; unlocked entry point.
*/
int
in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
- struct ifnet *ifp;
int error;
-
- ifp = inm->in6m_ifp;
IN6_MULTI_LOCK();
error = in6_mc_leave_locked(inm, imf);
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Leave a multicast group; real entry point.
* All source filters will be expunged.
*
* Only preserves atomicity at inm level.
*
* Holding the write lock for the INP which contains imf
* is highly advisable. We can't assert for it as imf does not
* contain a back-pointer to the owning inp.
*
* Note: This is not the same as in6m_release(*) as this function also
* makes a state change downcall into MLD.
*/
int
in6_mc_leave_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
struct in6_mfilter timf;
int error;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
error = 0;
IN6_MULTI_LOCK_ASSERT();
CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
(in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
imf);
/*
* If no imf was specified (i.e. kernel consumer),
* fake one up and assume it is an ASM join.
*/
if (imf == NULL) {
im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
imf = &timf;
}
/*
* Begin state merge transaction at MLD layer.
*
* As this particular invocation should not cause any memory
* to be allocated, and there is no opportunity to roll back
* the transaction, it MUST NOT fail.
*/
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
in6m_release_locked(inm);
return (error);
}
/*
* Block or unblock an ASM multicast source on an inpcb.
* This implements the delta-based API described in RFC 3678.
*
* The delta-based API applies only to exclusive-mode memberships.
* An MLD downcall will be performed.
*
* SMPng: NOTE: Must take Giant as a join may create a new ifma.
*
* Return 0 if successful, otherwise return an appropriate error code.
*/
static int
in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
{
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_msource *ims;
struct in6_multi *inm;
size_t idx;
uint16_t fmode;
int error, doblock;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
ifp = NULL;
error = 0;
doblock = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
ssa = (sockunion_t *)&gsr.gsr_source;
switch (sopt->sopt_name) {
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
doblock = 1;
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
/*
* Check if we are actually a member of this group.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
KASSERT(imo->im6o_mfilters != NULL,
("%s: im6o_mfilters not allocated", __func__));
imf = &imo->im6o_mfilters[idx];
inm = imo->im6o_membership[idx];
/*
* Attempting to use the delta-based API on an
* non exclusive-mode membership is an error.
*/
fmode = imf->im6f_st[0];
if (fmode != MCAST_EXCLUDE) {
error = EINVAL;
goto out_in6p_locked;
}
/*
* Deal with error cases up-front:
* Asked to block, but already blocked; or
* Asked to unblock, but nothing to unblock.
* If adding a new block entry, allocate it.
*/
ims = im6o_match_source(imo, idx, &ssa->sa);
if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
doblock ? "" : "not ");
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
INP_WLOCK_ASSERT(inp);
/*
* Begin state merge transaction at socket layer.
*/
if (doblock) {
CTR2(KTR_MLD, "%s: %s source", __func__, "block");
ims = im6f_graft(imf, fmode, &ssa->sin6);
if (ims == NULL)
error = ENOMEM;
} else {
CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
error = im6f_prune(imf, &ssa->sin6);
}
if (error) {
CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
goto out_im6f_rollback;
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
}
IN6_MULTI_UNLOCK();
out_im6f_rollback:
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
/*
* Given an inpcb, return its multicast options structure pointer. Accepts
* an unlocked inpcb pointer, but will return it locked. May sleep.
*
* SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
* SMPng: NOTE: Returns with the INP write lock held.
*/
static struct ip6_moptions *
in6p_findmoptions(struct inpcb *inp)
{
struct ip6_moptions *imo;
struct in6_multi **immp;
struct in6_mfilter *imfp;
size_t idx;
INP_WLOCK(inp);
if (inp->in6p_moptions != NULL)
return (inp->in6p_moptions);
INP_WUNLOCK(inp);
imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS,
M_WAITOK | M_ZERO);
imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS,
M_IN6MFILTER, M_WAITOK);
imo->im6o_multicast_ifp = NULL;
imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
imo->im6o_multicast_loop = in6_mcast_loop;
imo->im6o_num_memberships = 0;
imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
imo->im6o_membership = immp;
/* Initialize per-group source filters. */
for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++)
im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
imo->im6o_mfilters = imfp;
INP_WLOCK(inp);
if (inp->in6p_moptions != NULL) {
free(imfp, M_IN6MFILTER);
free(immp, M_IP6MOPTS);
free(imo, M_IP6MOPTS);
return (inp->in6p_moptions);
}
inp->in6p_moptions = imo;
return (imo);
}
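in6p_findmoptions() above follows a common "unlock, allocate, relock, recheck" pattern because the allocation may sleep while the inpcb lock cannot be held across it. The pthread-based sketch below is a generic userland illustration of that pattern under assumed types; it is not the kernel's locking or allocation primitives.
#include <pthread.h>
#include <stdlib.h>

struct pcb {
	pthread_mutex_t	lock;
	void		*moptions;
};

/* Returns with the lock held, mirroring the SMPng note above. */
static void *
find_moptions(struct pcb *p)
{
	void *fresh;

	pthread_mutex_lock(&p->lock);
	if (p->moptions != NULL)
		return (p->moptions);
	pthread_mutex_unlock(&p->lock);

	/* Stand-in for a sleeping M_WAITOK allocation; NULL check omitted. */
	fresh = calloc(1, 64);

	pthread_mutex_lock(&p->lock);
	if (p->moptions != NULL) {	/* lost the race: discard ours */
		free(fresh);
		return (p->moptions);
	}
	p->moptions = fresh;
	return (p->moptions);
}

int
main(void)
{
	struct pcb p = { .lock = PTHREAD_MUTEX_INITIALIZER, .moptions = NULL };

	find_moptions(&p);
	pthread_mutex_unlock(&p.lock);	/* caller eventually drops the lock */
	return (0);
}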
/*
* Discard the IPv6 multicast options (and source filters).
*
* SMPng: NOTE: assumes INP write lock is held.
*/
void
ip6_freemoptions(struct ip6_moptions *imo)
{
struct in6_mfilter *imf;
size_t idx, nmships;
KASSERT(imo != NULL, ("%s: ip6_moptions is NULL", __func__));
nmships = imo->im6o_num_memberships;
for (idx = 0; idx < nmships; ++idx) {
imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL;
if (imf)
im6f_leave(imf);
/* XXX this will thrash the lock(s) */
(void)in6_mc_leave(imo->im6o_membership[idx], imf);
if (imf)
im6f_purge(imf);
}
if (imo->im6o_mfilters)
free(imo->im6o_mfilters, M_IN6MFILTER);
free(imo->im6o_membership, M_IP6MOPTS);
free(imo, M_IP6MOPTS);
}
/*
* Atomically get source filters on a socket for an IPv6 multicast group.
* Called with INP lock held; returns with lock released.
*/
static int
in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct ip6_moptions *imo;
struct in6_mfilter *imf;
struct ip6_msource *ims;
struct in6_msource *lims;
struct sockaddr_in6 *psin;
struct sockaddr_storage *ptss;
struct sockaddr_storage *tss;
int error;
size_t idx, nsrcs, ncsrcs;
INP_WLOCK_ASSERT(inp);
imo = inp->in6p_moptions;
KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_group.ss_family != AF_INET6 ||
msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
return (EINVAL);
gsa = (sockunion_t *)&msfr.msfr_group;
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
INP_WLOCK(inp);
/*
* Lookup group on the socket.
*/
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
INP_WUNLOCK(inp);
return (EADDRNOTAVAIL);
}
imf = &imo->im6o_mfilters[idx];
/*
* Ignore memberships which are in limbo.
*/
if (imf->im6f_st[1] == MCAST_UNDEFINED) {
INP_WUNLOCK(inp);
return (EAGAIN);
}
msfr.msfr_fmode = imf->im6f_st[1];
/*
* If the user specified a buffer, copy out the source filter
* entries to userland gracefully.
* We only copy out the number of entries which userland
* has asked for, but we always tell userland how big the
* buffer really needs to be.
*/
if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
tss = NULL;
if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_NOWAIT | M_ZERO);
if (tss == NULL) {
INP_WUNLOCK(inp);
return (ENOBUFS);
}
}
/*
* Count number of sources in-mode at t0.
* If buffer space exists and remains, copy out source entries.
*/
nsrcs = msfr.msfr_nsrcs;
ncsrcs = 0;
ptss = tss;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
lims->im6sl_st[0] != imf->im6f_st[0])
continue;
++ncsrcs;
if (tss != NULL && nsrcs > 0) {
psin = (struct sockaddr_in6 *)ptss;
psin->sin6_family = AF_INET6;
psin->sin6_len = sizeof(struct sockaddr_in6);
psin->sin6_addr = lims->im6s_addr;
psin->sin6_port = 0;
--nsrcs;
++ptss;
}
}
INP_WUNLOCK(inp);
if (tss != NULL) {
error = copyout(tss, msfr.msfr_srcs,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
free(tss, M_TEMP);
if (error)
return (error);
}
msfr.msfr_nsrcs = ncsrcs;
error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
return (error);
}
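For reference, a minimal userland sketch of exercising this get path. It assumes the RFC 3678 getsourcefilter(3) wrapper (the counterpart of the setsourcefilter() API mentioned further below), which is expected to issue the __msfilterreq request this handler services; the helper name, the 16-entry buffer, and the omitted error checks are illustrative only, not part of this change.

#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
dump_filter(int s, const char *ifname, const char *group)
{
	struct sockaddr_in6 grp;
	struct sockaddr_storage slist[16];
	uint32_t fmode, nsrcs;
	u_int i;
	char buf[INET6_ADDRSTRLEN];

	memset(&grp, 0, sizeof(grp));
	grp.sin6_family = AF_INET6;
	grp.sin6_len = sizeof(grp);
	(void)inet_pton(AF_INET6, group, &grp.sin6_addr);

	nsrcs = 16;	/* in: slist capacity; out: total number of sources */
	if (getsourcefilter(s, if_nametoindex(ifname),
	    (struct sockaddr *)&grp, sizeof(grp), &fmode, &nsrcs,
	    slist) != 0) {
		perror("getsourcefilter");
		return;
	}
	printf("mode: %s, %u source(s)\n",
	    fmode == MCAST_INCLUDE ? "include" : "exclude", nsrcs);
	for (i = 0; i < nsrcs && i < 16; i++) {
		struct sockaddr_in6 *src = (struct sockaddr_in6 *)&slist[i];

		printf("  %s\n",
		    inet_ntop(AF_INET6, &src->sin6_addr, buf, sizeof(buf)));
	}
}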
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
{
struct ip6_moptions *im6o;
int error;
u_int optval;
INP_WLOCK(inp);
im6o = inp->in6p_moptions;
/*
* If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
INP_WUNLOCK(inp);
return (EOPNOTSUPP);
}
error = 0;
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
optval = 0;
} else {
optval = im6o->im6o_multicast_ifp->if_index;
}
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MULTICAST_HOPS:
if (im6o == NULL)
optval = V_ip6_defmcasthlim;
else
optval = im6o->im6o_multicast_hlim;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MULTICAST_LOOP:
if (im6o == NULL)
optval = in6_mcast_loop; /* XXX VIMAGE */
else
optval = im6o->im6o_multicast_loop;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MSFILTER:
if (im6o == NULL) {
error = EADDRNOTAVAIL;
INP_WUNLOCK(inp);
} else {
error = in6p_get_source_filters(inp, sopt);
}
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
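A small illustrative sketch of reading these options back from userland; the u_int sizes mirror what the switch above copies out, and the helper name is hypothetical.

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

static void
show_mcast_defaults(int s)
{
	u_int ifindex, hops, loop;
	socklen_t len;

	len = sizeof(ifindex);	/* 0 means "no interface selected" */
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, &len) == 0)
		printf("IPV6_MULTICAST_IF:   %u\n", ifindex);
	len = sizeof(hops);
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, &len) == 0)
		printf("IPV6_MULTICAST_HOPS: %u\n", hops);
	len = sizeof(loop);
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, &len) == 0)
		printf("IPV6_MULTICAST_LOOP: %u\n", loop);
}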
/*
* Look up the ifnet to use for a multicast group membership,
* given the address of an IPv6 group.
*
* This routine exists to support legacy IPv6 multicast applications.
*
* If inp is non-NULL, use this socket's current FIB number for any
* required FIB lookup. Look up the group address in the unicast FIB,
* and use its ifp; usually, this points to the default next-hop.
* If the FIB lookup fails, return NULL.
*
* FUTURE: Support multiple forwarding tables for IPv6.
*
* Returns NULL if no ifp could be found.
*/
static struct ifnet *
in6p_lookup_mcast_ifp(const struct inpcb *in6p,
const struct sockaddr_in6 *gsin6)
{
struct nhop6_basic nh6;
struct in6_addr dst;
uint32_t scopeid;
uint32_t fibnum;
KASSERT(in6p->inp_vflag & INP_IPV6,
("%s: not INP_IPV6 inpcb", __func__));
KASSERT(gsin6->sin6_family == AF_INET6,
("%s: not AF_INET6 group", __func__));
in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
return (NULL);
return (nh6.nh_ifp);
}
/*
* Join an IPv6 multicast group, possibly with a source.
*
* FIXME: The KAME use of the unspecified address (::)
* to join *all* multicast groups is currently unsupported.
*/
static int
in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
{
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_multi *inm;
struct in6_msource *lims;
size_t idx;
int error, is_new;
ifp = NULL;
imf = NULL;
lims = NULL;
error = 0;
is_new = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
/*
* Chew everything into struct group_source_req.
* Overwrite the port field if present, as the sockaddr
* being copied in may be matched with a binary comparison.
* Ignore passed-in scope ID.
*/
switch (sopt->sopt_name) {
case IPV6_JOIN_GROUP: {
struct ipv6_mreq mreq;
error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
sizeof(struct ipv6_mreq));
if (error)
return (error);
gsa->sin6.sin6_family = AF_INET6;
gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
if (mreq.ipv6mr_interface == 0) {
ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
} else {
if (V_if_index < mreq.ipv6mr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(mreq.ipv6mr_interface);
}
CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
__func__, mreq.ipv6mr_interface, ifp);
} break;
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_JOIN_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
return (EINVAL);
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&ssa->sin6.sin6_addr);
ssa->sin6.sin6_port = 0;
ssa->sin6.sin6_scope_id = 0;
}
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
return (EADDRNOTAVAIL);
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
/*
* Always set the scope zone ID on memberships created from userland.
* Use the passed-in ifp to do this.
* XXX The in6_setscope() return value is meaningless.
* XXX SCOPE6_LOCK() is taken by in6_setscope().
*/
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1) {
is_new = 1;
} else {
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
if (ssa->ss.ss_family != AF_UNSPEC) {
/*
* MCAST_JOIN_SOURCE_GROUP on an exclusive membership
* is an error. On an existing inclusive membership,
* it just adds the source to the filter list.
*/
if (imf->im6f_st[1] != MCAST_INCLUDE) {
error = EINVAL;
goto out_in6p_locked;
}
/*
* Throw out duplicates.
*
* XXX FIXME: This makes a naive assumption that
* even if entries exist for *ssa in this imf,
* they will be rejected as dupes, even if they
* are not valid in the current mode (in-mode).
*
* in6_msource is transactioned just as for anything
* else in SSM -- but note naive use of in6m_graft()
* below for allocating new filter entries.
*
* This is only an issue if someone mixes the
* full-state SSM API with the delta-based API,
* which is discouraged in the relevant RFCs.
*/
lims = im6o_match_source(imo, idx, &ssa->sa);
if (lims != NULL /*&&
lims->im6sl_st[1] == MCAST_INCLUDE*/) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
} else {
/*
* MCAST_JOIN_GROUP alone, on any existing membership,
* is rejected, to stop the same inpcb tying up
* multiple refs to the in_multi.
* On an existing inclusive membership, this is also
* an error; if you want to change filter mode,
* you must use the userland API setsourcefilter().
* XXX We don't reject this for imf in UNDEFINED
* state at t1, because allocation of a filter
* is atomic with allocation of a membership.
*/
error = EINVAL;
goto out_in6p_locked;
}
}
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
if (is_new) {
if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
error = im6o_grow(imo);
if (error)
goto out_in6p_locked;
}
/*
* Allocate the new slot upfront so we can deal with
* grafting the new source filter in same code path
* as for join-source on existing membership.
*/
idx = imo->im6o_num_memberships;
imo->im6o_membership[idx] = NULL;
imo->im6o_num_memberships++;
KASSERT(imo->im6o_mfilters != NULL,
("%s: im6f_mfilters vector was not allocated", __func__));
imf = &imo->im6o_mfilters[idx];
KASSERT(RB_EMPTY(&imf->im6f_sources),
("%s: im6f_sources not empty", __func__));
}
/*
* Graft new source into filter list for this inpcb's
* membership of the group. The in6_multi may not have
* been allocated yet if this is a new membership, however,
* the in_mfilter slot will be allocated and must be initialized.
*
* Note: Grafting of exclusive mode filters doesn't happen
* in this path.
* XXX: Should check for non-NULL lims (node exists but may
* not be in-mode) for interop with full-state API.
*/
if (ssa->ss.ss_family != AF_UNSPEC) {
/* Membership starts in IN mode */
if (is_new) {
CTR1(KTR_MLD, "%s: new join w/source", __func__);
im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
} else {
CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
}
lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
if (lims == NULL) {
CTR1(KTR_MLD, "%s: merge imf state failed",
__func__);
error = ENOMEM;
goto out_im6o_free;
}
} else {
/* No address specified; Membership starts in EX mode */
if (is_new) {
CTR1(KTR_MLD, "%s: new join w/o source", __func__);
im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
}
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
if (is_new) {
error = in6_mc_join_locked(ifp, &gsa->sin6.sin6_addr, imf,
&inm, 0);
if (error) {
IN6_MULTI_UNLOCK();
goto out_im6o_free;
}
imo->im6o_membership[idx] = inm;
} else {
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state",
__func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall",
__func__);
}
}
IN6_MULTI_UNLOCK();
INP_WLOCK_ASSERT(inp);
if (error) {
im6f_rollback(imf);
if (is_new)
im6f_purge(imf);
else
im6f_reap(imf);
} else {
im6f_commit(imf);
}
out_im6o_free:
if (error && is_new) {
imo->im6o_membership[idx] = NULL;
--imo->im6o_num_memberships;
}
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
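The two join flavours handled above can be driven from userland roughly as follows. This is a sketch only: the helper names and use of if_nametoindex() are illustrative, and error checking of inet_pton() is omitted.

#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
join_any_source(int s, const char *group, const char *ifname)
{
	struct ipv6_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	(void)inet_pton(AF_INET6, group, &mreq.ipv6mr_multiaddr);
	/* ipv6mr_interface == 0 lets the kernel pick the ifp itself. */
	mreq.ipv6mr_interface = if_nametoindex(ifname);
	return (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
	    &mreq, sizeof(mreq)));
}

static int
join_source(int s, const char *group, const char *source, const char *ifname)
{
	struct group_source_req gsr;
	struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gsr.gsr_group;
	struct sockaddr_in6 *src = (struct sockaddr_in6 *)&gsr.gsr_source;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = if_nametoindex(ifname);
	grp->sin6_family = AF_INET6;
	grp->sin6_len = sizeof(*grp);
	(void)inet_pton(AF_INET6, group, &grp->sin6_addr);
	src->sin6_family = AF_INET6;
	src->sin6_len = sizeof(*src);
	(void)inet_pton(AF_INET6, source, &src->sin6_addr);
	return (setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP,
	    &gsr, sizeof(gsr)));
}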
/*
* Leave an IPv6 multicast group on an inpcb, possibly with a source.
*/
static int
in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
{
struct ipv6_mreq mreq;
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_msource *ims;
struct in6_multi *inm;
uint32_t ifindex;
size_t idx;
int error, is_final;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
ifp = NULL;
ifindex = 0;
error = 0;
is_final = 1;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
/*
* Chew everything passed in up into a struct group_source_req
* as that is easier to process.
* Note: Any embedded scope ID in the multicast group passed
* in by userland is ignored; the interface index is the recommended
* mechanism to specify an interface (see below).
*/
switch (sopt->sopt_name) {
case IPV6_LEAVE_GROUP:
error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
sizeof(struct ipv6_mreq));
if (error)
return (error);
gsa->sin6.sin6_family = AF_INET6;
gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
ifindex = mreq.ipv6mr_interface;
break;
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
return (EINVAL);
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&ssa->sin6.sin6_addr);
}
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
ifindex = gsr.gsr_interface;
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
/*
* Validate interface index if provided. If no interface index
* was provided separately, attempt to look the membership up
* from the default scope as a last resort to disambiguate
* the membership we are being asked to leave.
* XXX SCOPE6 lock potentially taken here.
*/
if (ifindex != 0) {
if (V_if_index < ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
} else {
error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
if (error)
return (EADDRNOTAVAIL);
/*
* Some badly behaved applications don't pass an ifindex
* or a scope ID, which is an API violation. In this case,
* perform a lookup as per a v6 join.
*
* XXX For now, stomp on zone ID for the corner case.
* This is not the 'KAME way', but we need to see the ifp
* directly until such time as this implementation is
* refactored, assuming the scope IDs are the way to go.
*/
ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
if (ifindex == 0) {
CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
"ifp for group %s.", __func__,
ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
} else {
ifp = ifnet_byindex(ifindex);
}
if (ifp == NULL)
return (EADDRNOTAVAIL);
}
CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
/*
* Find the membership in the membership array.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
if (ssa->ss.ss_family != AF_UNSPEC)
is_final = 0;
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
/*
* If we were instructed only to leave a given source, do so.
* MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
*/
if (is_final) {
im6f_leave(imf);
} else {
if (imf->im6f_st[0] == MCAST_EXCLUDE) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
ims = im6o_match_source(imo, idx, &ssa->sa);
if (ims == NULL) {
CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
"not ");
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
CTR2(KTR_MLD, "%s: %s source", __func__, "block");
error = im6f_prune(imf, &ssa->sin6);
if (error) {
CTR1(KTR_MLD, "%s: merge imf state failed",
__func__);
goto out_in6p_locked;
}
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
if (is_final) {
/*
* Give up the multicast address record to which
* the membership points.
*/
(void)in6_mc_leave_locked(inm, imf);
} else {
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state",
__func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall",
__func__);
}
}
IN6_MULTI_UNLOCK();
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
if (is_final) {
/* Remove the gap in the membership array. */
for (++idx; idx < imo->im6o_num_memberships; ++idx) {
imo->im6o_membership[idx-1] = imo->im6o_membership[idx];
imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx];
}
imo->im6o_num_memberships--;
}
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
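A corresponding sketch for the delta-based leave of a single source (MCAST_LEAVE_SOURCE_GROUP), which the handler above only accepts for inclusive memberships; the helper name and parameters are illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static int
leave_source(int s, const struct sockaddr_in6 *group,
    const struct sockaddr_in6 *source, uint32_t ifindex)
{
	struct group_source_req gsr;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = ifindex;
	memcpy(&gsr.gsr_group, group, sizeof(*group));
	memcpy(&gsr.gsr_source, source, sizeof(*source));
	return (setsockopt(s, IPPROTO_IPV6, MCAST_LEAVE_SOURCE_GROUP,
	    &gsr, sizeof(gsr)));
}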
/*
* Select the interface for transmitting IPv6 multicast datagrams.
*
* An interface index (u_int) is passed to this socket option.
* An interface index of 0 is used to remove a previous selection.
* When no interface is selected, one is chosen for every send.
*/
static int
in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
{
struct ifnet *ifp;
struct ip6_moptions *imo;
u_int ifindex;
int error;
if (sopt->sopt_valsize != sizeof(u_int))
return (EINVAL);
error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
if (error)
return (error);
if (V_if_index < ifindex)
return (EINVAL);
if (ifindex == 0)
ifp = NULL;
else {
ifp = ifnet_byindex(ifindex);
if (ifp == NULL)
return (EINVAL);
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EADDRNOTAVAIL);
}
imo = in6p_findmoptions(inp);
imo->im6o_multicast_ifp = ifp;
INP_WUNLOCK(inp);
return (0);
}
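A minimal sketch of driving this option from userland; as the handler above shows, passing index 0 clears the selection. The helper name is hypothetical.

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>

static int
set_mcast_if(int s, const char *ifname)
{
	u_int ifindex;

	/* NULL/unknown name resolves to 0, which clears the selection. */
	ifindex = (ifname != NULL) ? if_nametoindex(ifname) : 0;
	return (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
	    &ifindex, sizeof(ifindex)));
}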
/*
* Atomically set source filters on a socket for an IPv6 multicast group.
*
* SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
*/
static int
in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_multi *inm;
size_t idx;
int error;
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
return (ENOBUFS);
if (msfr.msfr_fmode != MCAST_EXCLUDE &&
msfr.msfr_fmode != MCAST_INCLUDE)
return (EINVAL);
if (msfr.msfr_group.ss_family != AF_INET6 ||
msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
return (EINVAL);
gsa = (sockunion_t *)&msfr.msfr_group;
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
gsa->sin6.sin6_port = 0; /* ignore port */
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
/*
* Take the INP write lock.
* Check if this socket is a member of this group.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
imf->im6f_st[1] = msfr.msfr_fmode;
/*
* Apply any new source filters, if present.
* Make a copy of the user-space source vector so
* that we may copy them with a single copyin. This
* allows us to deal with page faults up-front.
*/
if (msfr.msfr_nsrcs > 0) {
struct in6_msource *lims;
struct sockaddr_in6 *psin;
struct sockaddr_storage *kss, *pkss;
int i;
INP_WUNLOCK(inp);
CTR2(KTR_MLD, "%s: loading %lu source list entries",
__func__, (unsigned long)msfr.msfr_nsrcs);
kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_WAITOK);
error = copyin(msfr.msfr_srcs, kss,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
if (error) {
free(kss, M_TEMP);
return (error);
}
INP_WLOCK(inp);
/*
* Mark all source filters as UNDEFINED at t1.
* Restore new group filter mode, as im6f_leave()
* will set it to INCLUDE.
*/
im6f_leave(imf);
imf->im6f_st[1] = msfr.msfr_fmode;
/*
* Update socket layer filters at t1, lazy-allocating
* new entries. This saves a bunch of memory at the
* cost of one RB_FIND() per source entry; duplicate
* entries in the msfr_nsrcs vector are ignored.
* If we encounter an error, rollback transaction.
*
* XXX This too could be replaced with a set-symmetric
* difference like loop to avoid walking from root
* every time, as the key space is common.
*/
for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
psin = (struct sockaddr_in6 *)pkss;
if (psin->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
break;
}
if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
break;
}
if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
error = EINVAL;
break;
}
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&psin->sin6_addr);
error = im6f_get_source(imf, psin, &lims);
if (error)
break;
lims->im6sl_st[1] = imf->im6f_st[1];
}
free(kss, M_TEMP);
}
if (error)
goto out_im6f_rollback;
INP_WLOCK_ASSERT(inp);
IN6_MULTI_LOCK();
/*
* Begin state merge transaction at MLD layer.
*/
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
}
IN6_MULTI_UNLOCK();
out_im6f_rollback:
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
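A hedged userland sketch of the full-state API referenced in the join path above (setsourcefilter()), which is expected to feed this IPV6_MSFILTER handler. It assumes the RFC 3678 signature; the helper name, the fixed two-source count, and the omitted error checks are illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
include_two_sources(int s, uint32_t ifindex, const char *group,
    const char *src1, const char *src2)
{
	struct sockaddr_in6 grp;
	struct sockaddr_storage srcs[2];
	const char *names[2] = { src1, src2 };
	int i;

	memset(&grp, 0, sizeof(grp));
	grp.sin6_family = AF_INET6;
	grp.sin6_len = sizeof(grp);
	(void)inet_pton(AF_INET6, group, &grp.sin6_addr);

	memset(srcs, 0, sizeof(srcs));
	for (i = 0; i < 2; i++) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&srcs[i];

		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		(void)inet_pton(AF_INET6, names[i], &sin6->sin6_addr);
	}
	/* Replace the whole filter atomically: INCLUDE {src1, src2}. */
	return (setsourcefilter(s, ifindex, (struct sockaddr *)&grp,
	    sizeof(grp), MCAST_INCLUDE, 2, srcs));
}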
/*
* Set the IP multicast options in response to user setsockopt().
*
* Many of the socket options handled in this function duplicate the
* functionality of socket options in the regular unicast API. However,
* it is not possible to merge the duplicate code, because the idempotence
* of the IPv6 multicast part of the BSD Sockets API must be preserved;
* the effects of these options must be treated as separate and distinct.
*
* SMPng: XXX: Unlocked read of inp_socket believed OK.
*/
int
ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
{
struct ip6_moptions *im6o;
int error;
error = 0;
/*
* If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
return (EOPNOTSUPP);
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
error = in6p_set_multicast_if(inp, sopt);
break;
case IPV6_MULTICAST_HOPS: {
int hlim;
if (sopt->sopt_valsize != sizeof(int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
if (error)
break;
if (hlim < -1 || hlim > 255) {
error = EINVAL;
break;
} else if (hlim == -1) {
hlim = V_ip6_defmcasthlim;
}
im6o = in6p_findmoptions(inp);
im6o->im6o_multicast_hlim = hlim;
INP_WUNLOCK(inp);
break;
}
case IPV6_MULTICAST_LOOP: {
u_int loop;
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
if (sopt->sopt_valsize != sizeof(u_int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
if (error)
break;
if (loop > 1) {
error = EINVAL;
break;
}
im6o = in6p_findmoptions(inp);
im6o->im6o_multicast_loop = loop;
INP_WUNLOCK(inp);
break;
}
case IPV6_JOIN_GROUP:
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
error = in6p_join_group(inp, sopt);
break;
case IPV6_LEAVE_GROUP:
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
error = in6p_leave_group(inp, sopt);
break;
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = in6p_block_unblock_source(inp, sopt);
break;
case IPV6_MSFILTER:
error = in6p_set_source_filters(inp, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
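A short illustrative sketch of setting the two scalar options handled above; the accepted value ranges follow the checks in the switch, and the helper name is hypothetical.

#include <sys/socket.h>
#include <netinet/in.h>

static int
tune_mcast(int s, int hlim, u_int loop)
{
	/* hlim: -1 restores the system default, otherwise 0..255. */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
	    &hlim, sizeof(hlim)) != 0)
		return (-1);
	/* loop: 0 or 1; anything else is rejected with EINVAL. */
	return (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
	    &loop, sizeof(loop)));
}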
/*
* Expose MLD's multicast filter mode and source list(s) to userland,
* keyed by (ifindex, group).
* The filter mode is written out as a uint32_t, followed by
* 0..n of struct in6_addr.
* For use by ifmcstat(8).
* SMPng: NOTE: unlocked read of ifindex space.
*/
static int
sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
{
struct in6_addr mcaddr;
struct in6_addr src;
struct ifnet *ifp;
struct ifmultiaddr *ifma;
struct in6_multi *inm;
struct ip6_msource *ims;
int *name;
int retval;
u_int namelen;
uint32_t fmode, ifindex;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
name = (int *)arg1;
namelen = arg2;
if (req->newptr != NULL)
return (EPERM);
/* int: ifindex + 4 * 32 bits of IPv6 address */
if (namelen != 5)
return (EINVAL);
ifindex = name[0];
if (ifindex <= 0 || ifindex > V_if_index) {
CTR2(KTR_MLD, "%s: ifindex %u out of range",
__func__, ifindex);
return (ENOENT);
}
memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
CTR2(KTR_MLD, "%s: group %s is not multicast",
__func__, ip6_sprintf(ip6tbuf, &mcaddr));
return (EINVAL);
}
ifp = ifnet_byindex(ifindex);
if (ifp == NULL) {
CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
__func__, ifindex);
return (ENOENT);
}
/*
* Internal MLD lookups require that scope/zone ID is set.
*/
(void)in6_setscope(&mcaddr, ifp, NULL);
retval = sysctl_wire_old_buffer(req,
sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
if (retval)
return (retval);
IN6_MULTI_LOCK();
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
continue;
fmode = inm->in6m_st[1].iss_fmode;
retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
if (retval != 0)
break;
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
/*
* Only copy-out sources which are in-mode.
*/
if (fmode != im6s_get_mode(inm, ims, 1)) {
CTR1(KTR_MLD, "%s: skip non-in-mode",
__func__);
continue;
}
src = ims->im6s_addr;
retval = SYSCTL_OUT(req, &src,
sizeof(struct in6_addr));
if (retval != 0)
break;
}
}
IF_ADDR_RUNLOCK(ifp);
IN6_MULTI_UNLOCK();
return (retval);
}
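A sketch of decoding the buffer layout this handler emits (a uint32_t filter mode followed by raw struct in6_addr entries). The sysctl OID itself is registered elsewhere and is not assumed here; the helper name is illustrative.

#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
decode_filter_buf(const void *buf, size_t len)
{
	const char *p = buf;
	uint32_t fmode;
	struct in6_addr src;
	char tbuf[INET6_ADDRSTRLEN];

	if (len < sizeof(fmode))
		return;
	memcpy(&fmode, p, sizeof(fmode));
	p += sizeof(fmode);
	len -= sizeof(fmode);
	printf("filter mode %u\n", fmode);
	/* The remainder is 0..n in-mode sources, each a bare in6_addr. */
	while (len >= sizeof(src)) {
		memcpy(&src, p, sizeof(src));
		printf("  %s\n",
		    inet_ntop(AF_INET6, &src, tbuf, sizeof(tbuf)));
		p += sizeof(src);
		len -= sizeof(src);
	}
}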
#ifdef KTR
static const char *in6m_modestrs[] = { "un", "in", "ex" };
static const char *
in6m_mode_str(const int mode)
{
if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
return (in6m_modestrs[mode]);
return ("??");
}
static const char *in6m_statestrs[] = {
"not-member",
"silent",
"idle",
"lazy",
"sleeping",
"awakening",
"query-pending",
"sg-query-pending",
"leaving"
};
static const char *
in6m_state_str(const int state)
{
if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
return (in6m_statestrs[state]);
return ("??");
}
/*
* Dump an in6_multi structure to the console.
*/
void
in6m_print(const struct in6_multi *inm)
{
int t;
char ip6tbuf[INET6_ADDRSTRLEN];
if ((ktr_mask & KTR_MLD) == 0)
return;
printf("%s: --- begin in6m %p ---\n", __func__, inm);
printf("addr %s ifp %p(%s) ifma %p\n",
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp,
if_name(inm->in6m_ifp),
inm->in6m_ifma);
printf("timer %u state %s refcount %u scq.len %u\n",
inm->in6m_timer,
in6m_state_str(inm->in6m_state),
inm->in6m_refcount,
mbufq_len(&inm->in6m_scq));
printf("mli %p nsrc %lu sctimer %u scrv %u\n",
inm->in6m_mli,
inm->in6m_nsrc,
inm->in6m_sctimer,
inm->in6m_scrv);
for (t = 0; t < 2; t++) {
printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
in6m_mode_str(inm->in6m_st[t].iss_fmode),
inm->in6m_st[t].iss_asm,
inm->in6m_st[t].iss_ex,
inm->in6m_st[t].iss_in,
inm->in6m_st[t].iss_rec);
}
printf("%s: --- end in6m %p ---\n", __func__, inm);
}
#else /* !KTR */
void
in6m_print(const struct in6_multi *inm)
{
}
#endif /* KTR */
Index: head/sys/netinet6/in6_src.c
===================================================================
--- head/sys/netinet6/in6_src.c (revision 327172)
+++ head/sys/netinet6/in6_src.c (revision 327173)
@@ -1,1246 +1,1243 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/rmlock.h>
#include <sys/sx.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/if_llatbl.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
static struct mtx addrsel_lock;
#define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
#define ADDRSEL_LOCK() mtx_lock(&addrsel_lock)
#define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock)
#define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED)
static struct sx addrsel_sxlock;
#define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock")
#define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock)
#define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock)
#define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock)
#define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock)
#define ADDR_LABEL_NOTAPP (-1)
static VNET_DEFINE(struct in6_addrpolicy, defaultaddrpolicy);
#define V_defaultaddrpolicy VNET(defaultaddrpolicy)
VNET_DEFINE(int, ip6_prefer_tempaddr) = 0;
static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
struct rtentry **, int, u_int);
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct ifnet **,
struct ifnet *, u_int);
static int in6_selectsrc(uint32_t, struct sockaddr_in6 *,
struct ip6_pktopts *, struct inpcb *, struct ucred *,
struct ifnet **, struct in6_addr *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
void *);
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#define REPLACE(r) do {\
IP6STAT_INC(ip6s_sources_rule[(r)]); \
- rule = (r); \
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto replace; \
} while(0)
#define NEXT(r) do {\
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto next; /* XXX: we can't use 'continue' here */ \
} while(0)
#define BREAK(r) do { \
IP6STAT_INC(ip6s_sources_rule[(r)]); \
- rule = (r); \
goto out; /* XXX: we can't use 'break' here */ \
} while(0)
static int
in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred,
struct ifnet **ifpp, struct in6_addr *srcp)
{
struct rm_priotracker in6_ifa_tracker;
struct in6_addr dst, tmp;
struct ifnet *ifp = NULL, *oifp = NULL;
struct in6_ifaddr *ia = NULL, *ia_best = NULL;
struct in6_pktinfo *pi = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
u_int32_t odstzone;
int prefer_tempaddr;
- int error, rule;
+ int error;
struct ip6_moptions *mopts;
KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp) {
/*
* Save a possibly passed in ifp for in6_selectsrc. Only
* neighbor discovery code should use this feature, where
* we may know the interface but not the FIB number holding
* the connected subnet in case someone deleted it from the
* default FIB and we need to check the interface.
*/
if (*ifpp != NULL)
oifp = *ifpp;
*ifpp = NULL;
}
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
mopts = inp->in6p_moptions;
} else {
mopts = NULL;
}
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
fibnum))
!= 0)
return (error);
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
tmp = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&tmp, ifp, &odstzone);
if (error)
return (error);
}
if (cred != NULL && (error = prison_local_ip6(cred,
&tmp, (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0)
return (error);
/*
* If IPV6_BINDANY socket option is set, we allow to specify
* non local addresses as source address in IPV6_PKTINFO
* ancillary data.
*/
if ((inp->inp_flags & INP_BINDANY) == 0) {
ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */);
if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST |
IN6_IFF_NOTREADY))) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (EADDRNOTAVAIL);
}
bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp));
ifa_free(&ia->ia_ifa);
} else
bcopy(&tmp, srcp, sizeof(*srcp));
pi->ipi6_addr = tmp; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
return (0);
}
/*
* Otherwise, if the socket has already bound the source, just use it.
*/
if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (cred != NULL &&
(error = prison_local_ip6(cred, &inp->in6p_laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
return (error);
bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp));
return (0);
}
/*
* Bypass source address selection and use the primary jail IP
* if requested.
*/
if (cred != NULL && !prison_saddrsel_ip6(cred, srcp))
return (0);
/*
* If the address is not specified, choose the best one based on
* the outgoing interface and the destination address.
*/
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
(inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0)
return (error);
#ifdef DIAGNOSTIC
if (ifp == NULL) /* this should not happen */
panic("in6_selectsrc: NULL ifp");
#endif
error = in6_setscope(&dst, ifp, &odstzone);
if (error)
return (error);
- rule = 0;
IN6_IFADDR_RLOCK(&in6_ifa_tracker);
TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(&dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags &
(IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
continue;
}
if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
/* If jailed only take addresses of the jail into account. */
if (cred != NULL &&
prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0)
continue;
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(0);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(&dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
*/
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
/*
* XXX: This is a TODO. We should probably merge the MIP6
* case above.
*/
/* Rule 5: Prefer outgoing interface */
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) {
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
}
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = V_ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer address with better virtual status.
*/
if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa))
REPLACE(9);
if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa))
NEXT(9);
/*
* Rule 10: prefer address with `prefer_source' flag.
*/
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0)
REPLACE(10);
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0)
NEXT(10);
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
&dst));
next:
continue;
out:
break;
}
if ((ia = ia_best) == NULL) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
/*
* At this point at least one of the addresses belonged to the jail
* but it could still be that we want to further restrict it, e.g.
* theoretically IN6_IS_ADDR_LOOPBACK.
* It must not be IN6_IS_ADDR_UNSPECIFIED anymore.
* prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should
* let all others previously selected pass.
* Use tmp to not change ::1 on lo0 to the primary jail address.
*/
tmp = ia->ia_addr.sin6_addr;
if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
if (ifpp)
*ifpp = ifp;
bcopy(&tmp, srcp, sizeof(*srcp));
if (ia->ia_ifp == ifp)
IP6STAT_INC(ip6s_sources_sameif[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherif[best_scope]);
if (dst_scope == best_scope)
IP6STAT_INC(ip6s_sources_samescope[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherscope[best_scope]);
if (IFA6_IS_DEPRECATED(ia))
IP6STAT_INC(ip6s_sources_deprecated[best_scope]);
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
return (0);
}
/*
* Select source address based on @inp, @dstsock and @opts.
* Stores selected address to @srcp. If @scope_ambiguous is set,
* embed scope from selected outgoing interface. If @hlim pointer
* is provided, stores calculated hop limit there.
* Returns 0 on success.
*/
int
in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct inpcb *inp, struct ucred *cred, int scope_ambiguous,
struct in6_addr *srcp, int *hlim)
{
struct ifnet *retifp;
uint32_t fibnum;
int error;
fibnum = inp->inp_inc.inc_fibnum;
retifp = NULL;
error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp);
if (error != 0)
return (error);
if (hlim != NULL)
*hlim = in6_selecthlim(inp, retifp);
if (retifp == NULL || scope_ambiguous == 0)
return (0);
/*
* Applications should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it's required), if we can determine the outgoing
* interface, determine the zone ID based on that interface.
*/
error = in6_setscope(&dstsock->sin6_addr, retifp, NULL);
return (error);
}
/*
* Select source address based on @fibnum, @dst and @scopeid.
* Stores selected address to @srcp.
* Returns 0 on success.
*
* Used by non-socket based consumers (ND code mostly)
*/
int
in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp,
int *hlim)
{
struct ifnet *retifp;
struct sockaddr_in6 dst_sa;
int error;
retifp = ifp;
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = *dst;
dst_sa.sin6_scope_id = scopeid;
sa6_embedscope(&dst_sa, 0);
error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp);
if (hlim != NULL)
*hlim = in6_selecthlim(NULL, retifp);
return (error);
}
/*
* clone - meaningful only for bsdi and freebsd
*/
static int
selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum)
{
int error = 0;
struct ifnet *ifp = NULL;
struct rtentry *rt = NULL;
struct sockaddr_in6 *sin6_next;
struct in6_pktinfo *pi = NULL;
struct in6_addr *dst = &dstsock->sin6_addr;
uint32_t zoneid;
#if 0
char ip6buf[INET6_ADDRSTRLEN];
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
printf("in6_selectroute: strange destination %s\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr));
} else {
printf("in6_selectroute: destination = %s%%%d\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
ifp = ifnet_byindex(pi->ipi6_ifindex);
if (ifp != NULL &&
(norouteok || retrt == NULL ||
IN6_IS_ADDR_MULTICAST(dst))) {
/*
* we do not have to check or get the route for
* multicast.
*/
goto done;
} else
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) &&
mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
goto done; /* we do not need a route for multicast. */
}
/*
* If the destination address is an LLA or a link- or node-local multicast
* address, use its embedded scope zone ID to determine the outgoing interface.
*/
if (IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
IN6_IS_ADDR_MC_NODELOCAL(dst)) {
zoneid = ntohs(in6_getscope(dst));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
}
}
getroute:
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route_in6 *ron;
sin6_next = satosin6(opts->ip6po_nexthop);
if (IN6_IS_ADDR_LINKLOCAL(&sin6_next->sin6_addr)) {
/*
* Next hop is LLA, thus it should be neighbor.
* Determine outgoing interface by zone index.
*/
zoneid = ntohs(in6_getscope(&sin6_next->sin6_addr));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
}
}
ron = &opts->ip6po_nextroute;
/* Use a cached route if it exists and is valid. */
if (ron->ro_rt != NULL && (
(ron->ro_rt->rt_flags & RTF_UP) == 0 ||
ron->ro_dst.sin6_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr,
&sin6_next->sin6_addr)))
RO_RTFREE(ron);
if (ron->ro_rt == NULL) {
ron->ro_dst = *sin6_next;
in6_rtalloc(ron, fibnum); /* multi path case? */
}
/*
* The node identified by that address must be a
* neighbor of the sending host.
*/
if (ron->ro_rt == NULL ||
(ron->ro_rt->rt_flags & RTF_GATEWAY) != 0)
error = EHOSTUNREACH;
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*/
if (ro) {
if (ro->ro_rt &&
(!(ro->ro_rt->rt_flags & RTF_UP) ||
((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
dst))) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)NULL;
}
if (ro->ro_rt == (struct rtentry *)NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
#ifdef RADIX_MPATH
rtalloc_mpath_fib((struct route *)ro,
ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum);
#else
ro->ro_rt = in6_rtalloc1((struct sockaddr *)
&ro->ro_dst, 0, 0UL, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
#endif
}
/*
* do not care about the result if we have the nexthop
* explicitly specified.
*/
if (opts && opts->ip6po_nexthop)
goto done;
if (ro->ro_rt) {
ifp = ro->ro_rt->rt_ifp;
if (ifp == NULL) { /* can this really happen? */
RTFREE(ro->ro_rt);
ro->ro_rt = NULL;
}
}
if (ro->ro_rt == NULL)
error = EHOSTUNREACH;
rt = ro->ro_rt;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (ifp && opts && opts->ip6po_pktinfo &&
opts->ip6po_pktinfo->ipi6_ifindex) {
if (!(ifp->if_flags & IFF_LOOPBACK) &&
ifp->if_index !=
opts->ip6po_pktinfo->ipi6_ifindex) {
error = EHOSTUNREACH;
goto done;
}
}
}
done:
if (ifp == NULL && rt == NULL) {
/*
* This can happen if the caller did not pass a cached route
* or any other hints. We treat this case as an error.
*/
error = EHOSTUNREACH;
}
if (error == EHOSTUNREACH)
IP6STAT_INC(ip6s_noroute);
if (retifp != NULL) {
*retifp = ifp;
/*
* Adjust the "outgoing" interface. If we're going to loop
* the packet back to ourselves, the ifp would be the loopback
* interface. However, we'd rather know the interface associated
* to the destination address (which should probably be one of
* our own addresses.)
*/
if (rt) {
if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
(rt->rt_gateway->sa_family == AF_LINK))
*retifp =
ifnet_byindex(((struct sockaddr_dl *)
rt->rt_gateway)->sdl_index);
}
}
if (retrt != NULL)
*retrt = rt; /* rt may be NULL */
return (error);
}
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct ifnet **retifp,
struct ifnet *oifp, u_int fibnum)
{
int error;
struct route_in6 sro;
struct rtentry *rt = NULL;
int rt_flags;
KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
bzero(&sro, sizeof(sro));
rt_flags = 0;
error = selectroute(dstsock, opts, mopts, &sro, retifp, &rt, 1, fibnum);
if (rt)
rt_flags = rt->rt_flags;
if (rt && rt == sro.ro_rt)
RTFREE(rt);
if (error != 0) {
/* Help ND. See oifp comment in in6_selectsrc(). */
if (oifp != NULL && fibnum == RT_DEFAULT_FIB) {
*retifp = oifp;
error = 0;
}
return (error);
}
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it would still be confusing.
* We thus reject the case here.
*/
if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) {
error = (rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
return (error);
}
return (0);
}
/*
* Public wrapper function to selectroute().
*
* XXX-BZ in6_selectroute() should and will grow the FIB argument. The
* in6_selectroute_fib() function is only there for backward compat on stable.
*/
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retrt, 0, RT_DEFAULT_FIB));
}
#ifndef BURN_BRIDGES
int
in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, u_int fibnum)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retrt, 0, fibnum));
}
#endif
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp)
{
if (in6p && in6p->in6p_hops >= 0)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
struct nhop6_basic nh6;
struct in6_addr dst;
uint32_t fibnum, scopeid;
int hlim;
fibnum = in6p->inp_inc.inc_fibnum;
in6_splitscope(&in6p->in6p_faddr, &dst, &scopeid);
if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){
hlim = ND_IFINFO(nh6.nh_ifp)->chlim;
return (hlim);
}
}
return (V_ip6_defhlim);
}
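For step 1 of the precedence above, a minimal userland sketch, assuming the per-socket hop limit is the one set with IPV6_UNICAST_HOPS (with -1 reverting to the defaults of steps 2 and 3); the helper name is illustrative.

#include <sys/socket.h>
#include <netinet/in.h>

static int
set_unicast_hops(int s, int hops)
{
	/* -1 falls back to the interface or system default hop limit. */
	return (setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
	    &hops, sizeof(hops)));
}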
/*
* XXX: this is borrowed from in6_pcbbind(). If possible, we should
* share this function by all *bsd*...
*/
int
in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
{
struct socket *so = inp->inp_socket;
u_int16_t lport = 0;
int error, lookupflags = 0;
#ifdef INVARIANTS
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#endif
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
error = prison_local_ip6(cred, laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0));
if (error)
return(error);
/* XXX: this is redundant when called from in6_pcbbind */
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
inp->inp_flags |= INP_ANONPORT;
error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags);
if (error != 0)
return (error);
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
return (0);
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
if (!IS_DEFAULT_VNET(curvnet))
return;
ADDRSEL_LOCK_INIT();
ADDRSEL_SXLOCK_INIT();
}
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
ADDRSEL_LOCK();
match = match_addrsel_policy(key);
if (match == NULL)
match = &V_defaultaddrpolicy;
else
match->use++;
ADDRSEL_UNLOCK();
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct walkarg {
struct sysctl_req *w_req;
};
static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6);
static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
CTLFLAG_RD, in6_src_sysctl, "");
static int
in6_src_sysctl(SYSCTL_HANDLER_ARGS)
{
struct walkarg w;
if (req->newptr)
return (EPERM);
bzero(&w, sizeof(w));
w.w_req = req;
return (walk_addrsel_policy(dump_addrsel_policyent, &w));
}
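/*
 * A minimal, illustrative userland sketch of consuming this node: the
 * handler dumps the table as an array of struct in6_addrpolicy, so a
 * reader (ip6addrctl(8) does roughly this) can fetch it by name,
 * assuming the node resolves to "net.inet6.ip6.addrctlpolicy":
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sysctl.h>
 *	#include <netinet/in.h>
 *	#include <netinet6/in6_var.h>
 *	#include <stdlib.h>
 *
 *	size_t len;
 *	struct in6_addrpolicy *tab;
 *
 *	sysctlbyname("net.inet6.ip6.addrctlpolicy", NULL, &len, NULL, 0);
 *	tab = malloc(len);
 *	sysctlbyname("net.inet6.ip6.addrctlpolicy", tab, &len, NULL, 0);
 *	// len / sizeof(*tab) entries, each with addr, addrmask, preced,
 *	// label and the kernel's use counter.
 */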
int
in6_src_ioctl(u_long cmd, caddr_t data)
{
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check that the prefix mask is contiguous. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear any trailing garbage in the prefix address. */
IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr);
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: keep compilers happy */
}
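/*
 * A minimal, illustrative userland sketch of driving this ioctl, in the
 * spirit of ip6addrctl(8).  The prefix, precedence and label values are
 * arbitrary examples; the handler above only insists that label is not
 * ADDR_LABEL_NOTAPP and that the mask is contiguous.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet6/in6_var.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	struct in6_addrpolicy ent;
 *	int s = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	memset(&ent, 0, sizeof(ent));
 *	ent.addr.sin6_family = ent.addrmask.sin6_family = AF_INET6;
 *	ent.addr.sin6_len = ent.addrmask.sin6_len = sizeof(ent.addr);
 *	inet_pton(AF_INET6, "2001:db8::", &ent.addr.sin6_addr);
 *	inet_pton(AF_INET6, "ffff:ffff::", &ent.addrmask.sin6_addr); // /32
 *	ent.preced = 45;
 *	ent.label = 14;
 *	if (ioctl(s, SIOCAADDRCTL_POLICY, &ent) == -1)
 *		err(1, "SIOCAADDRCTL_POLICY");
 *	// SIOCDADDRCTL_POLICY with the same addr/addrmask deletes it.
 */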
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX an implementation using a binary tree would be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
static VNET_DEFINE(struct addrsel_policyhead, addrsel_policytab);
#define V_addrsel_policytab VNET(addrsel_policytab)
static void
init_policy_queue(void)
{
TAILQ_INIT(&V_addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *new, *pol;
new = malloc(sizeof(*new), M_IFADDR,
M_WAITOK);
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* duplication check */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(new, M_IFADDR);
return (EEXIST); /* or override it? */
}
}
bzero(new, sizeof(*new));
/* XXX: should validate entry */
new->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* search for the entry in the table */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (ESRCH);
}
TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(pol, M_IFADDR);
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
ADDRSEL_SLOCK();
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0) {
ADDRSEL_SUNLOCK();
return (error);
}
}
ADDRSEL_SUNLOCK();
return (error);
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct walkarg *w = arg;
error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
return (error);
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* no match */
if (m == 0xff) /* shortcut for the typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
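/*
 * A worked example of the walk above, under the assumption that two
 * entries are installed: ::/0 (mask ::) and 2001:db8::/32 (mask
 * ffff:ffff::).  For a key of 2001:db8::1 the ::/0 entry stops at the
 * first zero mask byte with matchlen 0, while the /32 entry matches
 * four 0xff mask bytes for matchlen 32 and becomes bestpol.  The inner
 * shift loop only runs for a partial trailing mask byte; e.g. a /10
 * mask ends in 0xc0, which contributes two bits:
 *
 *	u_char m = 0xc0;
 *	int bits = 0;
 *
 *	while (m >= 0x80) {	// count leading one bits, as above
 *		bits++;
 *		m <<= 1;	// u_char arithmetic drops the shifted-out bit
 *	}
 *	// bits == 2
 */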
Index: head/sys/netinet6/mld6.c
===================================================================
--- head/sys/netinet6/mld6.c (revision 327172)
+++ head/sys/netinet6/mld6.c (revision 327173)
@@ -1,3318 +1,3317 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009 Bruce Simpson.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
*/
/*-
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/ktr.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6.h>
#include <netinet6/mld6_var.h>
#include <security/mac/mac_framework.h>
#ifndef KTR_MLD
#define KTR_MLD KTR_INET6
#endif
static struct mld_ifsoftc *
mli_alloc_locked(struct ifnet *);
static void mli_delete_locked(const struct ifnet *);
static void mld_dispatch_packet(struct mbuf *);
static void mld_dispatch_queue(struct mbufq *, int);
static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *);
static void mld_fasttimo_vnet(void);
static int mld_handle_state_change(struct in6_multi *,
struct mld_ifsoftc *);
static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *,
const int);
#ifdef KTR
static char * mld_rec_type_to_str(const int);
#endif
static void mld_set_version(struct mld_ifsoftc *, const int);
static void mld_slowtimo_vnet(void);
static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
/*const*/ struct mld_hdr *);
static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
/*const*/ struct mld_hdr *);
static void mld_v1_process_group_timer(struct mld_ifsoftc *,
struct in6_multi *);
static void mld_v1_process_querier_timers(struct mld_ifsoftc *);
static int mld_v1_transmit_report(struct in6_multi *, const int);
static void mld_v1_update_group(struct in6_multi *, const int);
static void mld_v2_cancel_link_timers(struct mld_ifsoftc *);
static void mld_v2_dispatch_general_query(struct mld_ifsoftc *);
static struct mbuf *
mld_v2_encap_report(struct ifnet *, struct mbuf *);
static int mld_v2_enqueue_filter_change(struct mbufq *,
struct in6_multi *);
static int mld_v2_enqueue_group_record(struct mbufq *,
struct in6_multi *, const int, const int, const int,
const int);
static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
struct mbuf *, const int, const int);
static int mld_v2_merge_state_changes(struct in6_multi *,
struct mbufq *);
static void mld_v2_process_group_timers(struct mld_ifsoftc *,
struct mbufq *, struct mbufq *,
struct in6_multi *, const int);
static int mld_v2_process_group_query(struct in6_multi *,
struct mld_ifsoftc *mli, int, struct mbuf *, const int);
static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS);
static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS);
/*
* Normative references: RFC 2710, RFC 3590, RFC 3810.
*
* Locking:
* * The MLD subsystem lock ends up being system-wide for the moment,
* but could be per-VIMAGE later on.
* * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
* Any may be taken independently; if any are held at the same
* time, the above lock order must be followed.
* * IN6_MULTI_LOCK covers in_multi.
* * MLD_LOCK covers per-link state and any global variables in this file.
* * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
* per-link state iterators.
*
* XXX LOR PREVENTION
* A special case for IPv6 is the in6_setscope() routine. ip6_output()
* will not accept an ifp; it wants an embedded scope ID, unlike
* ip_output(), which happily takes the ifp given to it. The embedded
* scope ID is only used by MLD to select the outgoing interface.
*
* During interface attach and detach, MLD will take MLD_LOCK *after*
* the IF_AFDATA_LOCK.
* As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call
* it with MLD_LOCK held without triggering an LOR. A netisr with indirect
* dispatch could work around this, but we'd rather not do that, as it
* can introduce other races.
*
* As such, we exploit the fact that the scope ID is just the interface
* index, and embed it in the IPv6 destination address accordingly.
* This is potentially NOT VALID for MLDv1 reports, as they
* are always sent to the multicast group itself; as MLDv2
* reports are always sent to ff02::16, this is not an issue
* when MLDv2 is in use.
*
* This does not however eliminate the LOR when ip6_output() itself
* calls in6_setscope() internally whilst MLD_LOCK is held. This will
* trigger a LOR warning in WITNESS when the ifnet is detached.
*
* The right answer is probably to make IF_AFDATA_LOCK an rwlock, given
* how it's used across the network stack. Here we're simply exploiting
* the fact that MLD runs at a similar layer in the stack to scope6.c.
*
* VIMAGE:
* * Each in6_multi corresponds to an ifp, and each ifp corresponds
* to a vnet in ifp->if_vnet.
*/
static struct mtx mld_mtx;
static MALLOC_DEFINE(M_MLD, "mld", "mld state");
#define MLD_EMBEDSCOPE(pin6, zoneid) \
if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \
IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \
(pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \
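/*
 * A worked example of the embedding above: for the link-scope group
 * ff02::16 on an interface whose ifindex (and thus zone ID) is 3,
 * MLD_EMBEDSCOPE() stores htons(3) in s6_addr16[1], so the address is
 * carried internally as ff02:3::16 until in6_clearscope() strips the
 * zone again before the address goes on the wire or to userland.
 *
 *	struct in6_addr grp;	// assume it holds ff02::16
 *
 *	MLD_EMBEDSCOPE(&grp, 3);
 *	// grp.s6_addr16[1] == htons(3), i.e. ff02:3::16
 */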
/*
* VIMAGE-wide globals.
*/
static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0};
static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head);
static VNET_DEFINE(int, interface_timers_running6);
static VNET_DEFINE(int, state_change_timers_running6);
static VNET_DEFINE(int, current_state_timers_running6);
#define V_mld_gsrdelay VNET(mld_gsrdelay)
#define V_mli_head VNET(mli_head)
#define V_interface_timers_running6 VNET(interface_timers_running6)
#define V_state_change_timers_running6 VNET(state_change_timers_running6)
#define V_current_state_timers_running6 VNET(current_state_timers_running6)
SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */
SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0,
"IPv6 Multicast Listener Discovery");
/*
* Virtualized sysctls.
*/
SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I",
"Rate limit for MLDv2 Group-and-Source queries in seconds");
/*
* Non-virtualized sysctls.
*/
static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
"Per-interface MLDv2 state");
static int mld_v1enable = 1;
SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
&mld_v1enable, 0, "Enable fallback to MLDv1");
static int mld_use_allow = 1;
SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
&mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
/*
* Packed Router Alert option structure declaration.
*/
struct mld_raopt {
struct ip6_hbh hbh;
struct ip6_opt pad;
struct ip6_opt_router ra;
} __packed;
/*
* Router Alert hop-by-hop option header.
*/
static struct mld_raopt mld_ra = {
.hbh = { 0, 0 },
.pad = { .ip6o_type = IP6OPT_PADN, 0 },
.ra = {
.ip6or_type = IP6OPT_ROUTER_ALERT,
.ip6or_len = IP6OPT_RTALERT_LEN - 2,
.ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF),
.ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF)
}
};
static struct ip6_pktopts mld_po;
static __inline void
mld_save_context(struct mbuf *m, struct ifnet *ifp)
{
#ifdef VIMAGE
m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
#endif /* VIMAGE */
m->m_pkthdr.flowid = ifp->if_index;
}
static __inline void
mld_scrub_context(struct mbuf *m)
{
m->m_pkthdr.PH_loc.ptr = NULL;
m->m_pkthdr.flowid = 0;
}
/*
* Restore context from a queued output chain.
* Return saved ifindex.
*
* VIMAGE: The assertion is there to make sure that we
* actually called CURVNET_SET() with what's in the mbuf chain.
*/
static __inline uint32_t
mld_restore_context(struct mbuf *m)
{
#if defined(VIMAGE) && defined(INVARIANTS)
KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr,
("%s: called when curvnet was not restored: cuvnet %p m ptr %p",
__func__, curvnet, m->m_pkthdr.PH_loc.ptr));
#endif
return (m->m_pkthdr.flowid);
}
/*
* Retrieve or set threshold between group-source queries in seconds.
*
* VIMAGE: Assume curvnet set by caller.
* SMPng: NOTE: Serialized by MLD lock.
*/
static int
sysctl_mld_gsr(SYSCTL_HANDLER_ARGS)
{
int error;
int i;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error)
return (error);
MLD_LOCK();
i = V_mld_gsrdelay.tv_sec;
error = sysctl_handle_int(oidp, &i, 0, req);
if (error || !req->newptr)
goto out_locked;
if (i < -1 || i >= 60) {
error = EINVAL;
goto out_locked;
}
CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d",
V_mld_gsrdelay.tv_sec, i);
V_mld_gsrdelay.tv_sec = i;
out_locked:
MLD_UNLOCK();
return (error);
}
/*
* Expose struct mld_ifsoftc to userland, keyed by ifindex.
* For use by ifmcstat(8).
*
* SMPng: NOTE: Does an unlocked ifindex space read.
* VIMAGE: Assume curvnet set by caller. The node handler itself
* is not directly virtualized.
*/
static int
sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS)
{
int *name;
int error;
u_int namelen;
struct ifnet *ifp;
struct mld_ifsoftc *mli;
name = (int *)arg1;
namelen = arg2;
if (req->newptr != NULL)
return (EPERM);
if (namelen != 1)
return (EINVAL);
error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo));
if (error)
return (error);
IN6_MULTI_LOCK();
MLD_LOCK();
if (name[0] <= 0 || name[0] > V_if_index) {
error = ENOENT;
goto out_locked;
}
error = ENOENT;
ifp = ifnet_byindex(name[0]);
if (ifp == NULL)
goto out_locked;
LIST_FOREACH(mli, &V_mli_head, mli_link) {
if (ifp == mli->mli_ifp) {
struct mld_ifinfo info;
info.mli_version = mli->mli_version;
info.mli_v1_timer = mli->mli_v1_timer;
info.mli_v2_timer = mli->mli_v2_timer;
info.mli_flags = mli->mli_flags;
info.mli_rv = mli->mli_rv;
info.mli_qi = mli->mli_qi;
info.mli_qri = mli->mli_qri;
info.mli_uri = mli->mli_uri;
error = SYSCTL_OUT(req, &info, sizeof(info));
break;
}
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Dispatch an entire queue of pending packet chains.
* VIMAGE: Assumes the vnet pointer has been set.
*/
static void
mld_dispatch_queue(struct mbufq *mq, int limit)
{
struct mbuf *m;
while ((m = mbufq_dequeue(mq)) != NULL) {
CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m);
mld_dispatch_packet(m);
if (--limit == 0)
break;
}
}
/*
* Filter outgoing MLD report state by group.
*
* Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1)
* and node-local addresses. However, kernel and socket consumers
* always embed the KAME scope ID in the address provided, so strip it
* when performing comparison.
* Note: This is not the same as the *multicast* scope.
*
* Return zero if the given group is one for which MLD reports
* should be suppressed, or non-zero if reports should be issued.
*/
static __inline int
mld_is_addr_reported(const struct in6_addr *addr)
{
KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__));
if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL)
return (0);
if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) {
struct in6_addr tmp = *addr;
in6_clearscope(&tmp);
if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes))
return (0);
}
return (1);
}
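/*
 * Worked examples of the rule above: ff01::1 has node-local multicast
 * scope, so it is never reported; ff02::1 (all-nodes) is suppressed
 * once any embedded zone is stripped; a solicited-node group such as
 * ff02::1:ff00:1 is link-local but not all-nodes and is therefore
 * reported, as is any wider-scope group such as ff0e::101.
 */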
/*
* Attach MLD when PF_INET6 is attached to an interface.
*
* SMPng: Normally called with IF_AFDATA_LOCK held.
*/
struct mld_ifsoftc *
mld_domifattach(struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK();
mli = mli_alloc_locked(ifp);
if (!(ifp->if_flags & IFF_MULTICAST))
mli->mli_flags |= MLIF_SILENT;
if (mld_use_allow)
mli->mli_flags |= MLIF_USEALLOW;
MLD_UNLOCK();
return (mli);
}
/*
* VIMAGE: assume curvnet set by caller.
*/
static struct mld_ifsoftc *
mli_alloc_locked(/*const*/ struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
MLD_LOCK_ASSERT();
mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO);
if (mli == NULL)
goto out;
mli->mli_ifp = ifp;
mli->mli_version = MLD_VERSION_2;
mli->mli_flags = 0;
mli->mli_rv = MLD_RV_INIT;
mli->mli_qi = MLD_QI_INIT;
mli->mli_qri = MLD_QRI_INIT;
mli->mli_uri = MLD_URI_INIT;
SLIST_INIT(&mli->mli_relinmhead);
mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
LIST_INSERT_HEAD(&V_mli_head, mli, mli_link);
CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)",
ifp, if_name(ifp));
out:
return (mli);
}
/*
* Hook for ifdetach.
*
* NOTE: Some finalization tasks need to run before the protocol domain
* is detached, but also before the link layer does its cleanup.
* Run before link-layer cleanup; clean up groups, but do not free MLD state.
*
* SMPng: Caller must hold IN6_MULTI_LOCK().
* Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator.
* XXX This routine is also bitten by unlocked ifma_protospec access.
*/
void
mld_ifdetach(struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
struct ifmultiaddr *ifma;
struct in6_multi *inm, *tinm;
CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp,
if_name(ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK();
mli = MLD_IFINFO(ifp);
if (mli->mli_version == MLD_VERSION_2) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
if (inm->in6m_state == MLD_LEAVING_MEMBER) {
SLIST_INSERT_HEAD(&mli->mli_relinmhead,
inm, in6m_nrele);
}
in6m_clear_recorded(inm);
}
IF_ADDR_RUNLOCK(ifp);
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele,
tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
in6m_release_locked(inm);
}
}
MLD_UNLOCK();
}
/*
* Hook for domifdetach.
* Runs after link-layer cleanup; free MLD state.
*
* SMPng: Normally called with IF_AFDATA_LOCK held.
*/
void
mld_domifdetach(struct ifnet *ifp)
{
CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK();
mli_delete_locked(ifp);
MLD_UNLOCK();
}
static void
mli_delete_locked(const struct ifnet *ifp)
{
struct mld_ifsoftc *mli, *tmli;
CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK_ASSERT();
LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) {
if (mli->mli_ifp == ifp) {
/*
* Free deferred General Query responses.
*/
mbufq_drain(&mli->mli_gq);
LIST_REMOVE(mli, mli_link);
KASSERT(SLIST_EMPTY(&mli->mli_relinmhead),
("%s: there are dangling in_multi references",
__func__));
free(mli, M_MLD);
return;
}
}
}
/*
* Process a received MLDv1 general or address-specific query.
* Assumes that the query header has been pulled up to sizeof(mld_hdr).
*
* NOTE: Can't be fully const correct as we temporarily embed scope ID in
* mld_addr. This is OK as we own the mbuf chain.
*/
static int
mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
/*const*/ struct mld_hdr *mld)
{
struct ifmultiaddr *ifma;
struct mld_ifsoftc *mli;
struct in6_multi *inm;
int is_general_query;
uint16_t timer;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
is_general_query = 0;
if (!mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
return (0);
}
/*
* RFC3810 Section 6.2: MLD queries must originate from
* a router's link-local address.
*/
if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (0);
}
/*
* Do address field validation upfront before we accept
* the query.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
/*
* MLDv1 General Query.
* If this was not sent to the all-nodes group, ignore it.
*/
struct in6_addr dst;
dst = ip6->ip6_dst;
in6_clearscope(&dst);
if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes))
return (EINVAL);
is_general_query = 1;
} else {
/*
* Embed scope ID of receiving interface in MLD query for
* lookup whilst we don't hold other locks.
*/
in6_setscope(&mld->mld_addr, ifp, NULL);
}
IN6_MULTI_LOCK();
MLD_LOCK();
/*
* Switch to MLDv1 host compatibility mode.
*/
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
mld_set_version(mli, MLD_VERSION_1);
timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE;
if (timer == 0)
timer = 1;
IF_ADDR_RLOCK(ifp);
if (is_general_query) {
/*
* For each reporting group joined on this
* interface, kick the report timer.
*/
CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)",
ifp, if_name(ifp));
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
mld_v1_update_group(inm, timer);
}
} else {
/*
* MLDv1 Group-Specific Query.
* If this is a group-specific MLDv1 query, we need only
* look up the single group to process it.
*/
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm != NULL) {
CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
mld_v1_update_group(inm, timer);
}
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
}
IF_ADDR_RUNLOCK(ifp);
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (0);
}
/*
* Update the report timer on a group in response to an MLDv1 query.
*
* If we are becoming the reporting member for this group, start the timer.
* If we already are the reporting member for this group, and timer is
* below the threshold, reset it.
*
* We may be updating the group for the first time since we switched
* to MLDv2. If we are, then we must clear any recorded source lists,
* and transition to REPORTING state; the group timer is overloaded
* for group and group-source query responses.
*
* Unlike MLDv2, the delay per group should be jittered
* to avoid bursts of MLDv1 reports.
*/
static void
mld_v1_update_group(struct in6_multi *inm, const int timer)
{
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp), timer);
IN6_MULTI_LOCK_ASSERT();
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
break;
case MLD_REPORTING_MEMBER:
if (inm->in6m_timer != 0 &&
inm->in6m_timer <= timer) {
CTR1(KTR_MLD, "%s: REPORTING and timer running, "
"skipping.", __func__);
break;
}
/* FALLTHROUGH */
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
CTR1(KTR_MLD, "%s: ->REPORTING", __func__);
inm->in6m_state = MLD_REPORTING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
break;
case MLD_SLEEPING_MEMBER:
CTR1(KTR_MLD, "%s: ->AWAKENING", __func__);
inm->in6m_state = MLD_AWAKENING_MEMBER;
break;
case MLD_LEAVING_MEMBER:
break;
}
}
/*
* Process a received MLDv2 general, group-specific or
* group-and-source-specific query.
*
* Assumes that the query header has been pulled up to sizeof(mldv2_query).
*
* Return 0 if successful, otherwise an appropriate error code is returned.
*/
static int
mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
struct mbuf *m, const int off, const int icmp6len)
{
struct mld_ifsoftc *mli;
struct mldv2_query *mld;
struct in6_multi *inm;
uint32_t maxdelay, nsrc, qqi;
int is_general_query;
uint16_t timer;
uint8_t qrv;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
is_general_query = 0;
/*
* RFC3810 Section 6.2: MLD queries must originate from
* a router's link-local address.
*/
if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (0);
}
CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp));
mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off);
maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */
if (maxdelay >= 32768) {
maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) <<
(MLD_MRC_EXP(maxdelay) + 3);
}
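/*
 * A worked example of the decoding above: per RFC 3810 5.1.3, values
 * of 32768 or more carry a 3-bit exponent and 12-bit mantissa and
 * decode as (mant | 0x1000) << (exp + 3).  The smallest such encoding
 * (exp = 0, mant = 0) therefore yields 0x1000 << 3 = 32768 tenths of a
 * second, which is exactly where the linear range ends; exp = 2,
 * mant = 0x400 yields (0x400 | 0x1000) << 5 = 163840 tenths, i.e.
 * 16384 seconds.
 */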
timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE;
if (timer == 0)
timer = 1;
qrv = MLD_QRV(mld->mld_misc);
if (qrv < 2) {
CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__,
qrv, MLD_RV_INIT);
qrv = MLD_RV_INIT;
}
qqi = mld->mld_qqi;
if (qqi >= 128) {
qqi = MLD_QQIC_MANT(mld->mld_qqi) <<
(MLD_QQIC_EXP(mld->mld_qqi) + 3);
}
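/*
 * A worked example of the QQIC decoding above: per RFC 3810 5.1.9,
 * values of 128 or more carry a 3-bit exponent and 4-bit mantissa and
 * decode as (mant | 0x10) << (exp + 3).  Thus exp = 0, mant = 0 gives
 * 0x10 << 3 = 128 seconds (the start of the exponential range), and
 * exp = 7, mant = 0xf gives 0x1f << 10 = 31744 seconds, the largest
 * representable querier's query interval.
 */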
nsrc = ntohs(mld->mld_numsrc);
if (nsrc > MLD_MAX_GS_SOURCES)
return (EMSGSIZE);
if (icmp6len < sizeof(struct mldv2_query) +
(nsrc * sizeof(struct in6_addr)))
return (EMSGSIZE);
/*
* Do further input validation upfront to avoid resetting timers
* should we need to discard this query.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
/*
* A general query with a source list has undefined
* behaviour; discard it.
*/
if (nsrc > 0)
return (EINVAL);
is_general_query = 1;
} else {
/*
* Embed scope ID of receiving interface in MLD query for
* lookup whilst we don't hold other locks (due to KAME
* locking lameness). We own this mbuf chain just now.
*/
in6_setscope(&mld->mld_addr, ifp, NULL);
}
IN6_MULTI_LOCK();
MLD_LOCK();
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
/*
* Discard the v2 query if we're in Compatibility Mode.
* The RFC is pretty clear that hosts need to stay in MLDv1 mode
* until the Old Version Querier Present timer expires.
*/
if (mli->mli_version != MLD_VERSION_2)
goto out_locked;
mld_set_version(mli, MLD_VERSION_2);
mli->mli_rv = qrv;
mli->mli_qi = qqi;
mli->mli_qri = maxdelay;
CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi,
maxdelay);
if (is_general_query) {
/*
* MLDv2 General Query.
*
* Schedule a current-state report on this ifp for
* all groups, possibly containing source lists.
*
* If there is a pending General Query response
* scheduled earlier than the selected delay, do
* not schedule any other reports.
* Otherwise, reset the interface timer.
*/
CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)",
ifp, if_name(ifp));
if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
V_interface_timers_running6 = 1;
}
} else {
/*
* MLDv2 Group-specific or Group-and-source-specific Query.
*
* Group-source-specific queries are throttled on
* a per-group basis to defeat denial-of-service attempts.
* Queries for groups we are not a member of on this
* link are simply ignored.
*/
IF_ADDR_RLOCK(ifp);
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm == NULL) {
IF_ADDR_RUNLOCK(ifp);
goto out_locked;
}
if (nsrc > 0) {
if (!ratecheck(&inm->in6m_lastgsrtv,
&V_mld_gsrdelay)) {
CTR1(KTR_MLD, "%s: GS query throttled.",
__func__);
IF_ADDR_RUNLOCK(ifp);
goto out_locked;
}
}
CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)",
ifp, if_name(ifp));
/*
* If there is a pending General Query response
* scheduled sooner than the selected delay, no
* further report need be scheduled.
* Otherwise, prepare to respond to the
* group-specific or group-and-source query.
*/
if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer)
mld_v2_process_group_query(inm, mli, timer, m, off);
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
IF_ADDR_RUNLOCK(ifp);
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (0);
}
/*
* Process a received MLDv2 group-specific or group-and-source-specific
* query.
* Return <0 if any error occurred. Currently this is ignored.
*/
static int
mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli,
int timer, struct mbuf *m0, const int off)
{
struct mldv2_query *mld;
int retval;
uint16_t nsrc;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
retval = 0;
mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off);
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LEAVING_MEMBER:
return (retval);
break;
case MLD_REPORTING_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
break;
}
nsrc = ntohs(mld->mld_numsrc);
/*
* Deal with group-specific queries upfront.
* If any group query is already pending, purge any recorded
* source-list state if it exists, and schedule a query response
* for this group-specific query.
*/
if (nsrc == 0) {
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
in6m_clear_recorded(inm);
timer = min(inm->in6m_timer, timer);
}
inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
return (retval);
}
/*
* Deal with the case where a group-and-source-specific query has
* been received but a group-specific query is already pending.
*/
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
timer = min(inm->in6m_timer, timer);
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
return (retval);
}
/*
* Finally, deal with the case where a group-and-source-specific
* query has been received, where a response to a previous g-s-r
* query exists, or none exists.
* In this case, we need to parse the source-list which the Querier
* has provided us with and check if we have any source list filter
* entries at T1 for these sources. If we do not, there is no need to
* schedule a report and the query may be dropped.
* If we do, we must record them and schedule a current-state
* report for those sources.
*/
if (inm->in6m_nsrc > 0) {
struct mbuf *m;
uint8_t *sp;
int i, nrecorded;
int soff;
m = m0;
soff = off + sizeof(struct mldv2_query);
nrecorded = 0;
for (i = 0; i < nsrc; i++) {
sp = mtod(m, uint8_t *) + soff;
retval = in6m_record_source(inm,
(const struct in6_addr *)sp);
if (retval < 0)
break;
nrecorded += retval;
soff += sizeof(struct in6_addr);
if (soff >= m->m_len) {
soff = soff - m->m_len;
m = m->m_next;
if (m == NULL)
break;
}
}
if (nrecorded > 0) {
CTR1(KTR_MLD,
"%s: schedule response to SG query", __func__);
inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
}
}
return (retval);
}
/*
* Process a received MLDv1 host membership report.
* Assumes mld points to mld_hdr in pulled up mbuf chain.
*
* NOTE: Can't be fully const correct as we temporarily embed scope ID in
* mld_addr. This is OK as we own the mbuf chain.
*/
static int
mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
/*const*/ struct mld_hdr *mld)
{
struct in6_addr src, dst;
struct in6_ifaddr *ia;
struct in6_multi *inm;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
if (!mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
return (0);
}
if (ifp->if_flags & IFF_LOOPBACK)
return (0);
/*
* MLDv1 reports must originate from a host's link-local address,
* or the unspecified address (when booting).
*/
src = ip6->ip6_src;
in6_clearscope(&src);
if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (EINVAL);
}
/*
* RFC2710 Section 4: MLDv1 reports must pertain to a multicast
* group, and must be directed to the group itself.
*/
dst = ip6->ip6_dst;
in6_clearscope(&dst);
if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) ||
!IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) {
CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_dst),
ifp, if_name(ifp));
return (EINVAL);
}
/*
* Make sure we don't hear our own membership report, as fast
* leave requires knowing that we are the only member of a
* group. Assume we used the link-local address if available,
* otherwise look for ::.
*
* XXX Note that scope ID comparison is needed for the address
* returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be
* performed for the on-wire address.
*/
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) ||
(ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (0);
}
if (ia != NULL)
ifa_free(&ia->ia_ifa);
CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp));
/*
* Embed scope ID of receiving interface in MLD query for lookup
* whilst we don't hold other locks (due to KAME locking lameness).
*/
if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr))
in6_setscope(&mld->mld_addr, ifp, NULL);
IN6_MULTI_LOCK();
MLD_LOCK();
IF_ADDR_RLOCK(ifp);
/*
* MLDv1 report suppression.
* If we are a member of this group, and our membership should be
* reported, and our group timer is pending or about to be reset,
* stop our group timer by transitioning to the 'lazy' state.
*/
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm != NULL) {
struct mld_ifsoftc *mli;
mli = inm->in6m_mli;
KASSERT(mli != NULL,
("%s: no mli for ifp %p", __func__, ifp));
/*
* If we are in MLDv2 host mode, do not allow the
* other host's MLDv1 report to suppress our reports.
*/
if (mli->mli_version == MLD_VERSION_2)
goto out_locked;
inm->in6m_timer = 0;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_AWAKENING_MEMBER:
CTR3(KTR_MLD,
"report suppressed for %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
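/* FALLTHROUGH */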
case MLD_LAZY_MEMBER:
inm->in6m_state = MLD_LAZY_MEMBER;
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
out_locked:
IF_ADDR_RUNLOCK(ifp);
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
return (0);
}
/*
* MLD input path.
*
* Assume query messages which fit in a single ICMPv6 message header
* have been pulled up.
* Assume that userland will want to see the message, even if it
* otherwise fails kernel input validation; do not free it.
* Pullup may however free the mbuf chain m if it fails.
*
* Return IPPROTO_DONE if we freed m. Otherwise, return 0.
*/
int
mld_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp;
struct ip6_hdr *ip6;
struct mld_hdr *mld;
int mldlen;
CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off);
ifp = m->m_pkthdr.rcvif;
ip6 = mtod(m, struct ip6_hdr *);
/* Pullup to appropriate size. */
mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
if (mld->mld_type == MLD_LISTENER_QUERY &&
icmp6len >= sizeof(struct mldv2_query)) {
mldlen = sizeof(struct mldv2_query);
} else {
mldlen = sizeof(struct mld_hdr);
}
IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen);
if (mld == NULL) {
ICMP6STAT_INC(icp6s_badlen);
return (IPPROTO_DONE);
}
/*
* Userland needs to see all of this traffic for implementing
* the endpoint discovery portion of multicast routing.
*/
switch (mld->mld_type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(ifp, ifs6_in_mldquery);
if (icmp6len == sizeof(struct mld_hdr)) {
if (mld_v1_input_query(ifp, ip6, mld) != 0)
return (0);
} else if (icmp6len >= sizeof(struct mldv2_query)) {
if (mld_v2_input_query(ifp, ip6, m, off,
icmp6len) != 0)
return (0);
}
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
if (mld_v1_input_report(ifp, ip6, mld) != 0)
return (0);
break;
case MLDV2_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(ifp, ifs6_in_mlddone);
break;
default:
break;
}
return (0);
}
/*
* Fast timeout handler (global).
* VIMAGE: Timeout handlers are expected to service all vimages.
*/
void
mld_fasttimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
mld_fasttimo_vnet();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
* Fast timeout handler (per-vnet).
*
* VIMAGE: Assume caller has set up our curvnet.
*/
static void
mld_fasttimo_vnet(void)
{
struct mbufq scq; /* State-change packets */
struct mbufq qrq; /* Query response packets */
struct ifnet *ifp;
struct mld_ifsoftc *mli;
struct ifmultiaddr *ifma;
struct in6_multi *inm, *tinm;
int uri_fasthz;
uri_fasthz = 0;
/*
* Quick check to see if any work needs to be done, in order to
* minimize the overhead of fasttimo processing.
* SMPng: XXX Unlocked reads.
*/
if (!V_current_state_timers_running6 &&
!V_interface_timers_running6 &&
!V_state_change_timers_running6)
return;
IN6_MULTI_LOCK();
MLD_LOCK();
/*
* MLDv2 General Query response timer processing.
*/
if (V_interface_timers_running6) {
CTR1(KTR_MLD, "%s: interface timers running", __func__);
V_interface_timers_running6 = 0;
LIST_FOREACH(mli, &V_mli_head, mli_link) {
if (mli->mli_v2_timer == 0) {
/* Do nothing. */
} else if (--mli->mli_v2_timer == 0) {
mld_v2_dispatch_general_query(mli);
} else {
V_interface_timers_running6 = 1;
}
}
}
if (!V_current_state_timers_running6 &&
!V_state_change_timers_running6)
goto out_locked;
V_current_state_timers_running6 = 0;
V_state_change_timers_running6 = 0;
CTR1(KTR_MLD, "%s: state change timers running", __func__);
/*
* MLD host report and state-change timer processing.
* Note: Processing a v2 group timer may remove a node.
*/
LIST_FOREACH(mli, &V_mli_head, mli_link) {
ifp = mli->mli_ifp;
if (mli->mli_version == MLD_VERSION_2) {
uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri *
PR_FASTHZ);
mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS);
mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS);
}
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
switch (mli->mli_version) {
case MLD_VERSION_1:
mld_v1_process_group_timer(mli, inm);
break;
case MLD_VERSION_2:
mld_v2_process_group_timers(mli, &qrq,
&scq, inm, uri_fasthz);
break;
}
}
IF_ADDR_RUNLOCK(ifp);
switch (mli->mli_version) {
case MLD_VERSION_1:
/*
* Transmit reports for this lifecycle. This
* is done while not holding IF_ADDR_LOCK,
* since this can call
* in6ifa_ifpforlinklocal(), which takes
* IF_ADDR_LOCK internally, as well as
* ip6_output() to transmit a packet.
*/
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
in6m_nrele);
(void)mld_v1_transmit_report(inm,
MLD_LISTENER_REPORT);
}
break;
case MLD_VERSION_2:
mld_dispatch_queue(&qrq, 0);
mld_dispatch_queue(&scq, 0);
/*
* Free the in_multi reference(s) for
* this lifecycle.
*/
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
in6m_nrele);
in6m_release_locked(inm);
}
break;
}
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
}
/*
* Update host report group timer.
* Will update the global pending timer flags.
*/
static void
mld_v1_process_group_timer(struct mld_ifsoftc *mli, struct in6_multi *inm)
{
int report_timer_expired;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
if (inm->in6m_timer == 0) {
report_timer_expired = 0;
} else if (--inm->in6m_timer == 0) {
report_timer_expired = 1;
} else {
V_current_state_timers_running6 = 1;
return;
}
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
break;
case MLD_REPORTING_MEMBER:
if (report_timer_expired) {
inm->in6m_state = MLD_IDLE_MEMBER;
SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
in6m_nrele);
}
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
/*
* Update a group's timers for MLDv2.
* Will update the global pending timer flags.
* Note: Unlocked read from mli.
*/
static void
mld_v2_process_group_timers(struct mld_ifsoftc *mli,
struct mbufq *qrq, struct mbufq *scq,
struct in6_multi *inm, const int uri_fasthz)
{
int query_response_timer_expired;
int state_change_retransmit_timer_expired;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
query_response_timer_expired = 0;
state_change_retransmit_timer_expired = 0;
/*
* During a transition from compatibility mode back to MLDv2,
* a group record in REPORTING state may still have its group
* timer active. This is a no-op in this function; it is easier
* to deal with it here than to complicate the slow-timeout path.
*/
if (inm->in6m_timer == 0) {
query_response_timer_expired = 0;
} else if (--inm->in6m_timer == 0) {
query_response_timer_expired = 1;
} else {
V_current_state_timers_running6 = 1;
}
if (inm->in6m_sctimer == 0) {
state_change_retransmit_timer_expired = 0;
} else if (--inm->in6m_sctimer == 0) {
state_change_retransmit_timer_expired = 1;
} else {
V_state_change_timers_running6 = 1;
}
/* We are in fasttimo, so be quick about it. */
if (!state_change_retransmit_timer_expired &&
!query_response_timer_expired)
return;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
case MLD_IDLE_MEMBER:
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
/*
* Respond to a previously pending Group-Specific
* or Group-and-Source-Specific query by enqueueing
* the appropriate Current-State report for
* immediate transmission.
*/
if (query_response_timer_expired) {
int retval;
retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1,
(inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER),
0);
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
inm->in6m_state = MLD_REPORTING_MEMBER;
in6m_clear_recorded(inm);
}
/* FALLTHROUGH */
case MLD_REPORTING_MEMBER:
case MLD_LEAVING_MEMBER:
if (state_change_retransmit_timer_expired) {
/*
* State-change retransmission timer fired.
* If there are any further pending retransmissions,
* set the global pending state-change flag, and
* reset the timer.
*/
if (--inm->in6m_scrv > 0) {
inm->in6m_sctimer = uri_fasthz;
V_state_change_timers_running6 = 1;
}
/*
* Retransmit the previously computed state-change
* report. If there are no further pending
* retransmissions, the mbuf queue will be consumed.
* Update T0 state to T1 as we have now sent
* a state-change.
*/
(void)mld_v2_merge_state_changes(inm, scq);
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
/*
* If we are leaving the group for good, make sure
* we release MLD's reference to it.
* This release must be deferred using a SLIST,
* as we are called from a loop which traverses
* the in_ifmultiaddr TAILQ.
*/
if (inm->in6m_state == MLD_LEAVING_MEMBER &&
inm->in6m_scrv == 0) {
inm->in6m_state = MLD_NOT_MEMBER;
SLIST_INSERT_HEAD(&mli->mli_relinmhead,
inm, in6m_nrele);
}
}
break;
}
}
/*
* Switch to a different version on the given interface,
* as per RFC 3810 Section 9.12.
*/
static void
mld_set_version(struct mld_ifsoftc *mli, const int version)
{
int old_version_timer;
MLD_LOCK_ASSERT();
CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__,
version, mli->mli_ifp, if_name(mli->mli_ifp));
if (version == MLD_VERSION_1) {
/*
* Compute the "Older Version Querier Present" timer as per
* Section 9.12.
*/
old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
old_version_timer *= PR_SLOWHZ;
mli->mli_v1_timer = old_version_timer;
}
if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) {
mli->mli_version = MLD_VERSION_1;
mld_v2_cancel_link_timers(mli);
}
}
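/*
 * A worked example of the timeout above, using purely illustrative
 * numbers: with a robustness variable of 2, a query interval of 125
 * and a query response interval of 10 (in whatever units the querier
 * advertised them), the Older Version Querier Present timeout is
 * (2 * 125 + 10) * PR_SLOWHZ = 260 * 2 = 520 slowtimo ticks before
 * mld_v1_process_querier_timers() lets the link revert to MLDv2.
 */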
/*
* Cancel pending MLDv2 timers for the given link and all groups
* joined on it; state-change, general-query, and group-query timers.
*/
static void
mld_v2_cancel_link_timers(struct mld_ifsoftc *mli)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
struct in6_multi *inm, *tinm;
CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__,
mli->mli_ifp, if_name(mli->mli_ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
/*
* Fast-track this potentially expensive operation
* by checking all the global 'timer pending' flags.
*/
if (!V_interface_timers_running6 &&
!V_state_change_timers_running6 &&
!V_current_state_timers_running6)
return;
mli->mli_v2_timer = 0;
ifp = mli->mli_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
break;
case MLD_LEAVING_MEMBER:
/*
* If we are leaving the group and switching
* version, we need to release the final
* reference held for issuing the INCLUDE {}.
*/
SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
in6m_nrele);
/* FALLTHROUGH */
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
in6m_clear_recorded(inm);
/* FALLTHROUGH */
case MLD_REPORTING_MEMBER:
inm->in6m_sctimer = 0;
inm->in6m_timer = 0;
inm->in6m_state = MLD_REPORTING_MEMBER;
/*
* Free any pending MLDv2 state-change records.
*/
mbufq_drain(&inm->in6m_scq);
break;
}
}
IF_ADDR_RUNLOCK(ifp);
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
in6m_release_locked(inm);
}
}
/*
* Global slowtimo handler.
* VIMAGE: Timeout handlers are expected to service all vimages.
*/
void
mld_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
mld_slowtimo_vnet();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
* Per-vnet slowtimo handler.
*/
static void
mld_slowtimo_vnet(void)
{
struct mld_ifsoftc *mli;
MLD_LOCK();
LIST_FOREACH(mli, &V_mli_head, mli_link) {
mld_v1_process_querier_timers(mli);
}
MLD_UNLOCK();
}
/*
* Update the Older Version Querier Present timers for a link.
* See Section 9.12 of RFC 3810.
*/
static void
mld_v1_process_querier_timers(struct mld_ifsoftc *mli)
{
MLD_LOCK_ASSERT();
if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) {
/*
* MLDv1 Querier Present timer expired; revert to MLDv2.
*/
CTR5(KTR_MLD,
"%s: transition from v%d -> v%d on %p(%s)",
__func__, mli->mli_version, MLD_VERSION_2,
mli->mli_ifp, if_name(mli->mli_ifp));
mli->mli_version = MLD_VERSION_2;
}
}
/*
* Transmit an MLDv1 report immediately.
*/
static int
mld_v1_transmit_report(struct in6_multi *in6m, const int type)
{
struct ifnet *ifp;
struct in6_ifaddr *ia;
struct ip6_hdr *ip6;
struct mbuf *mh, *md;
struct mld_hdr *mld;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
ifp = in6m->in6m_ifp;
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
/* ia may be NULL if link-local address is tentative. */
mh = m_gethdr(M_NOWAIT, MT_DATA);
if (mh == NULL) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (ENOMEM);
}
md = m_get(M_NOWAIT, MT_DATA);
if (md == NULL) {
m_free(mh);
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (ENOMEM);
}
mh->m_next = md;
/*
* FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
* that ether_output() does not need to allocate another mbuf
* for the header in the most common case.
*/
M_ALIGN(mh, sizeof(struct ip6_hdr));
mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
mh->m_len = sizeof(struct ip6_hdr);
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = in6m->in6m_addr;
md->m_len = sizeof(struct mld_hdr);
mld = mtod(md, struct mld_hdr *);
mld->mld_type = type;
mld->mld_code = 0;
mld->mld_cksum = 0;
mld->mld_maxdelay = 0;
mld->mld_reserved = 0;
mld->mld_addr = in6m->in6m_addr;
in6_clearscope(&mld->mld_addr);
mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
mld_save_context(mh, ifp);
mh->m_flags |= M_MLDV1;
mld_dispatch_packet(mh);
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (0);
}
/*
* Process a state change from the upper layer for the given IPv6 group.
*
* Each socket holds a reference on the in_multi in its own ip_moptions.
* The socket layer will have made the necessary updates to the group
* state, it is now up to MLD to issue a state change report if there
* has been any change between T0 (when the last state-change was issued)
* and T1 (now).
*
* We use the MLDv2 state machine at the group level. The MLD module
* however makes the decision as to which MLD protocol version to speak.
* A state change *from* INCLUDE {} always means an initial join.
* A state change *to* INCLUDE {} always means a final leave.
*
* If delay is non-zero, and the state change is an initial multicast
* join, the state change report will be delayed by 'delay' ticks
* in units of PR_FASTHZ if MLDv1 is active on the link; otherwise
* the initial MLDv2 state change report will be delayed by whichever
* is sooner, a pending state-change timer or delay itself.
*
* VIMAGE: curvnet should have been set by caller, as this routine
* is called from the socket option handlers.
*/
int
mld_change_state(struct in6_multi *inm, const int delay)
{
struct mld_ifsoftc *mli;
struct ifnet *ifp;
int error;
IN6_MULTI_LOCK_ASSERT();
error = 0;
/*
* Try to detect if the upper layer just asked us to change state
* for an interface which has now gone away.
*/
KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
ifp = inm->in6m_ifma->ifma_ifp;
if (ifp != NULL) {
/*
* Sanity check that netinet6's notion of ifp is the
* same as net's.
*/
KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
}
MLD_LOCK();
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
/*
* If we detect a state transition to or from MCAST_UNDEFINED
* for this group, then we are starting or finishing an MLD
* life cycle for this group.
*/
if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) {
CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__,
inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode);
if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) {
CTR1(KTR_MLD, "%s: initial join", __func__);
error = mld_initial_join(inm, mli, delay);
goto out_locked;
} else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) {
CTR1(KTR_MLD, "%s: final leave", __func__);
mld_final_leave(inm, mli);
goto out_locked;
}
} else {
CTR1(KTR_MLD, "%s: filter set change", __func__);
}
error = mld_handle_state_change(inm, mli);
out_locked:
MLD_UNLOCK();
return (error);
}
/*
* Perform the initial join for an MLD group.
*
* When joining a group:
* If the group should have its MLD traffic suppressed, do nothing.
* MLDv1 starts sending MLDv1 host membership reports.
* MLDv2 will schedule an MLDv2 state-change report containing the
* initial state of the membership.
*
* If the delay argument is non-zero, then we must delay sending the
* initial state change for delay ticks (in units of PR_FASTHZ).
*/
static int
mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli,
const int delay)
{
struct ifnet *ifp;
struct mbufq *mq;
int error, retval, syncstates;
int odelay;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
error = 0;
syncstates = 1;
ifp = inm->in6m_ifp;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__));
/*
* Groups joined on loopback or marked as 'not reported',
* enter the MLD_SILENT_MEMBER state and
* are never reported in any protocol exchanges.
* All other groups enter the appropriate state machine
* for the version in use on this link.
* A link marked as MLIF_SILENT causes MLD to be completely
* disabled for the link.
*/
if ((ifp->if_flags & IFF_LOOPBACK) ||
(mli->mli_flags & MLIF_SILENT) ||
!mld_is_addr_reported(&inm->in6m_addr)) {
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
inm->in6m_state = MLD_SILENT_MEMBER;
inm->in6m_timer = 0;
} else {
/*
* Deal with overlapping in_multi lifecycle.
* If this group was LEAVING, then make sure
* we drop the reference we picked up to keep the
* group around for the final INCLUDE {} enqueue.
*/
if (mli->mli_version == MLD_VERSION_2 &&
inm->in6m_state == MLD_LEAVING_MEMBER)
in6m_release_locked(inm);
inm->in6m_state = MLD_REPORTING_MEMBER;
switch (mli->mli_version) {
case MLD_VERSION_1:
/*
* If a delay was provided, only use it if
* it is greater than the delay normally
* used for an MLDv1 state change report,
* and delay sending the initial MLDv1 report
* by not transitioning to the IDLE state.
*/
odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ);
if (delay) {
inm->in6m_timer = max(delay, odelay);
V_current_state_timers_running6 = 1;
} else {
inm->in6m_state = MLD_IDLE_MEMBER;
error = mld_v1_transmit_report(inm,
MLD_LISTENER_REPORT);
if (error == 0) {
inm->in6m_timer = odelay;
V_current_state_timers_running6 = 1;
}
}
break;
case MLD_VERSION_2:
/*
* Defer update of T0 to T1, until the first copy
* of the state change has been transmitted.
*/
syncstates = 0;
/*
* Immediately enqueue a State-Change Report for
* this interface, freeing any previous reports.
* Don't kick the timers if there is nothing to do,
* or if an error occurred.
*/
mq = &inm->in6m_scq;
mbufq_drain(mq);
retval = mld_v2_enqueue_group_record(mq, inm, 1,
0, 0, (mli->mli_flags & MLIF_USEALLOW));
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
if (retval <= 0) {
error = retval * -1;
break;
}
/*
* Schedule transmission of pending state-change
* report up to RV times for this link. The timer
* will fire at the next mld_fasttimo (~200ms),
* giving us an opportunity to merge the reports.
*
* If a delay was provided to this function, only
* use this delay if sooner than the existing one.
*/
KASSERT(mli->mli_rv > 1,
("%s: invalid robustness %d", __func__,
mli->mli_rv));
inm->in6m_scrv = mli->mli_rv;
if (delay) {
if (inm->in6m_sctimer > 1) {
inm->in6m_sctimer =
min(inm->in6m_sctimer, delay);
} else
inm->in6m_sctimer = delay;
} else
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
error = 0;
break;
}
}
/*
* Only update the T0 state if state change is atomic,
* i.e. we don't need to wait for a timer to fire before we
* can consider the state change to have been communicated.
*/
if (syncstates) {
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
}
return (error);
}
/*
* Issue an intermediate state change during the life-cycle.
*/
static int
mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli)
{
struct ifnet *ifp;
int retval;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
ifp = inm->in6m_ifp;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli && mli->mli_ifp == ifp,
("%s: inconsistent ifp", __func__));
if ((ifp->if_flags & IFF_LOOPBACK) ||
(mli->mli_flags & MLIF_SILENT) ||
!mld_is_addr_reported(&inm->in6m_addr) ||
(mli->mli_version != MLD_VERSION_2)) {
if (!mld_is_addr_reported(&inm->in6m_addr)) {
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
}
CTR1(KTR_MLD, "%s: nothing to do", __func__);
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
return (0);
}
mbufq_drain(&inm->in6m_scq);
retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
(mli->mli_flags & MLIF_USEALLOW));
CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval);
if (retval <= 0)
return (-retval);
/*
* If record(s) were enqueued, start the state-change
* report timer for this group.
*/
inm->in6m_scrv = mli->mli_rv;
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
return (0);
}
/*
* Perform the final leave for a multicast address.
*
* When leaving a group:
* MLDv1 sends a DONE message, if and only if we are the reporter.
* MLDv2 enqueues a state-change report containing a transition
* to INCLUDE {} for immediate transmission.
*/
static void
mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli)
{
int syncstates;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
syncstates = 1;
CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_LEAVING_MEMBER:
/* Already leaving or left; do nothing. */
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
if (mli->mli_version == MLD_VERSION_1) {
#ifdef INVARIANTS
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER)
panic("%s: MLDv2 state reached, not MLDv2 mode",
__func__);
#endif
mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
inm->in6m_state = MLD_NOT_MEMBER;
V_current_state_timers_running6 = 1;
} else if (mli->mli_version == MLD_VERSION_2) {
/*
* Stop group timer and all pending reports.
* Immediately enqueue a state-change report
* TO_IN {} to be sent on the next fast timeout,
* giving us an opportunity to merge reports.
*/
mbufq_drain(&inm->in6m_scq);
inm->in6m_timer = 0;
inm->in6m_scrv = mli->mli_rv;
CTR4(KTR_MLD, "%s: Leaving %s/%s with %d "
"pending retransmissions.", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp), inm->in6m_scrv);
if (inm->in6m_scrv == 0) {
inm->in6m_state = MLD_NOT_MEMBER;
inm->in6m_sctimer = 0;
} else {
int retval;
in6m_acquire_locked(inm);
retval = mld_v2_enqueue_group_record(
&inm->in6m_scq, inm, 1, 0, 0,
(mli->mli_flags & MLIF_USEALLOW));
KASSERT(retval != 0,
("%s: enqueue record = %d", __func__,
retval));
inm->in6m_state = MLD_LEAVING_MEMBER;
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
syncstates = 0;
}
break;
}
break;
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
/* Our reports are suppressed; do nothing. */
break;
}
if (syncstates) {
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s",
__func__, &inm->in6m_addr, if_name(inm->in6m_ifp));
}
}
/*
* Enqueue an MLDv2 group record to the given output queue.
*
* If is_state_change is zero, a current-state record is appended.
* If is_state_change is non-zero, a state-change report is appended.
*
* If is_group_query is non-zero, an mbuf packet chain is allocated.
* If is_group_query is zero, and if there is a packet with free space
* at the tail of the queue, it will be appended to, provided there
* is enough free space.
* Otherwise a new mbuf packet chain is allocated.
*
* If is_source_query is non-zero, each source is checked to see if
* it was recorded for a Group-Source query, and will be omitted if
* it is not both in-mode and recorded.
*
* If use_block_allow is non-zero, state change reports for initial join
* and final leave, on an inclusive mode group with a source list, will be
* rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively.
*
* The function will attempt to allocate leading space in the packet
* for the IPv6+ICMP headers to be prepended without fragmenting the chain.
*
* If successful the size of all data appended to the queue is returned,
* otherwise an error code less than zero is returned, or zero if
* no record(s) were appended.
*/
static int
mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm,
const int is_state_change, const int is_group_query,
const int is_source_query, const int use_block_allow)
{
struct mldv2_record mr;
struct mldv2_record *pmr;
struct ifnet *ifp;
struct ip6_msource *ims, *nims;
struct mbuf *m0, *m, *md;
- int error, is_filter_list_change;
+ int is_filter_list_change;
int minrec0len, m0srcs, msrcs, nbytes, off;
int record_has_sources;
int now;
int type;
uint8_t mode;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
- error = 0;
ifp = inm->in6m_ifp;
is_filter_list_change = 0;
m = NULL;
m0 = NULL;
m0srcs = 0;
msrcs = 0;
nbytes = 0;
nims = NULL;
record_has_sources = 1;
pmr = NULL;
type = MLD_DO_NOTHING;
mode = inm->in6m_st[1].iss_fmode;
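/*
* Note: in6m_st[0] holds the committed filter state at t0 and
* in6m_st[1] the pending state at t1; in6m_commit() later copies
* T1 to T0 once the change has been communicated (see the
* "T1 -> T0" traces elsewhere in this file).
*/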
/*
* If we did not transition out of ASM mode during t0->t1,
* and there are no source nodes to process, we can skip
* the generation of source records.
*/
if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 &&
inm->in6m_nsrc == 0)
record_has_sources = 0;
if (is_state_change) {
/*
* Queue a state change record.
* If the mode did not change, and there are non-ASM
* listeners or source filters present,
* we potentially need to issue two records for the group.
* If there are ASM listeners, and there was no filter
* mode transition of any kind, do nothing.
*
* If we are transitioning to MCAST_UNDEFINED, we need
* not send any sources. A transition to/from this state is
* considered inclusive with some special treatment.
*
* If we are rewriting initial joins/leaves to use
* ALLOW/BLOCK, and the group's membership is inclusive,
* we need to send sources in all cases.
*/
if (mode != inm->in6m_st[0].iss_fmode) {
if (mode == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: change to EXCLUDE",
__func__);
type = MLD_CHANGE_TO_EXCLUDE_MODE;
} else {
CTR1(KTR_MLD, "%s: change to INCLUDE",
__func__);
if (use_block_allow) {
/*
* XXX
* Here we're interested in state
* edges either direction between
* MCAST_UNDEFINED and MCAST_INCLUDE.
* Perhaps we should just check
* the group state, rather than
* the filter mode.
*/
if (mode == MCAST_UNDEFINED) {
type = MLD_BLOCK_OLD_SOURCES;
} else {
type = MLD_ALLOW_NEW_SOURCES;
}
} else {
type = MLD_CHANGE_TO_INCLUDE_MODE;
if (mode == MCAST_UNDEFINED)
record_has_sources = 0;
}
}
} else {
if (record_has_sources) {
is_filter_list_change = 1;
} else {
type = MLD_DO_NOTHING;
}
}
} else {
/*
* Queue a current state record.
*/
if (mode == MCAST_EXCLUDE) {
type = MLD_MODE_IS_EXCLUDE;
} else if (mode == MCAST_INCLUDE) {
type = MLD_MODE_IS_INCLUDE;
KASSERT(inm->in6m_st[1].iss_asm == 0,
("%s: inm %p is INCLUDE but ASM count is %d",
__func__, inm, inm->in6m_st[1].iss_asm));
}
}
/*
* Generate the filter list changes using a separate function.
*/
if (is_filter_list_change)
return (mld_v2_enqueue_filter_change(mq, inm));
if (type == MLD_DO_NOTHING) {
CTR3(KTR_MLD, "%s: nothing to do for %s/%s",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
return (0);
}
/*
* If any sources are present, we must be able to fit at least
* one in the trailing space of the tail packet's mbuf,
* ideally more.
*/
minrec0len = sizeof(struct mldv2_record);
if (record_has_sources)
minrec0len += sizeof(struct in6_addr);
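/*
* Illustrative sizing (assuming the usual record layout): a group
* record header is 4 bytes of fixed fields plus a 16-byte group
* address, so minrec0len is roughly 20 bytes, or about 36 bytes
* once a single 16-byte source address must also fit.
*/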
CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__,
mld_rec_type_to_str(type),
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
/*
* Check if we have a packet in the tail of the queue for this
* group into which the first group record for this group will fit.
* Otherwise allocate a new packet.
* Always allocate leading space for IP6+RA+ICMPV6+REPORT.
* Note: Group records for G/GSR query responses MUST be sent
* in their own packet.
*/
m0 = mbufq_last(mq);
if (!is_group_query &&
m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) &&
(m0->m_pkthdr.len + minrec0len) <
(ifp->if_mtu - MLD_MTUSPACE)) {
m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
m = m0;
CTR1(KTR_MLD, "%s: use existing packet", __func__);
} else {
if (mbufq_full(mq)) {
CTR1(KTR_MLD, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
m = NULL;
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
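/*
* Illustrative arithmetic (assuming MLD_MTUSPACE is roughly 56
* bytes for the IPv6 header, Router Alert option and report
* header): with a 1500-byte MTU, about (1500 - 56 - 20) / 16,
* i.e. ~89 source addresses, fit alongside the first record.
*/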
if (!is_state_change && !is_group_query)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (-ENOMEM);
mld_save_context(m, ifp);
CTR1(KTR_MLD, "%s: allocated first packet", __func__);
}
/*
* Append group record.
* If we have sources, we don't know how many yet.
*/
mr.mr_type = type;
mr.mr_datalen = 0;
mr.mr_numsrc = 0;
mr.mr_addr = inm->in6m_addr;
in6_clearscope(&mr.mr_addr);
if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
return (-ENOMEM);
}
nbytes += sizeof(struct mldv2_record);
/*
* Append as many sources as will fit in the first packet.
* If we are appending to a new packet, the chain allocation
* may potentially use clusters; use m_getptr() in this case.
* If we are appending to an existing packet, we need to obtain
* a pointer to the group record after m_append(), in case a new
* mbuf was allocated.
*
* Only append sources which are in-mode at t1. If we are
* transitioning to MCAST_UNDEFINED state on the group, and
* use_block_allow is zero, do not include source entries.
* Otherwise, we need to include this source in the report.
*
* Only report recorded sources in our filter set when responding
* to a group-source query.
*/
if (record_has_sources) {
if (m == m0) {
md = m_last(m);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
md->m_len - nbytes);
} else {
md = m_getptr(m, 0, &off);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
off);
}
msrcs = 0;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs,
nims) {
CTR2(KTR_MLD, "%s: visit node %s", __func__,
ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
CTR2(KTR_MLD, "%s: node is %d", __func__, now);
if ((now != mode) ||
(now == mode &&
(!use_block_allow && mode == MCAST_UNDEFINED))) {
CTR1(KTR_MLD, "%s: skip node", __func__);
continue;
}
if (is_source_query && ims->im6s_stp == 0) {
CTR1(KTR_MLD, "%s: skip unrecorded node",
__func__);
continue;
}
CTR1(KTR_MLD, "%s: append node", __func__);
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.",
__func__);
return (-ENOMEM);
}
nbytes += sizeof(struct in6_addr);
++msrcs;
if (msrcs == m0srcs)
break;
}
CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__,
msrcs);
pmr->mr_numsrc = htons(msrcs);
nbytes += (msrcs * sizeof(struct in6_addr));
}
if (is_source_query && msrcs == 0) {
CTR1(KTR_MLD, "%s: no recorded sources to report", __func__);
if (m != m0)
m_freem(m);
return (0);
}
/*
* We are good to go with first packet.
*/
if (m != m0) {
CTR1(KTR_MLD, "%s: enqueueing first packet", __func__);
m->m_pkthdr.PH_vt.vt_nrecs = 1;
mbufq_enqueue(mq, m);
} else
m->m_pkthdr.PH_vt.vt_nrecs++;
/*
* No further work needed if no source list in packet(s).
*/
if (!record_has_sources)
return (nbytes);
/*
* Whilst sources remain to be announced, we need to allocate
* a new packet and fill out as many sources as will fit.
* Always try for a cluster first.
*/
while (nims != NULL) {
if (mbufq_full(mq)) {
CTR1(KTR_MLD, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (-ENOMEM);
mld_save_context(m, ifp);
md = m_getptr(m, 0, &off);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off);
CTR1(KTR_MLD, "%s: allocated next packet", __func__);
if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
return (-ENOMEM);
}
m->m_pkthdr.PH_vt.vt_nrecs = 1;
nbytes += sizeof(struct mldv2_record);
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
msrcs = 0;
RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
CTR2(KTR_MLD, "%s: visit node %s",
__func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
if ((now != mode) ||
(now == mode &&
(!use_block_allow && mode == MCAST_UNDEFINED))) {
CTR1(KTR_MLD, "%s: skip node", __func__);
continue;
}
if (is_source_query && ims->im6s_stp == 0) {
CTR1(KTR_MLD, "%s: skip unrecorded node",
__func__);
continue;
}
CTR1(KTR_MLD, "%s: append node", __func__);
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.",
__func__);
return (-ENOMEM);
}
++msrcs;
if (msrcs == m0srcs)
break;
}
pmr->mr_numsrc = htons(msrcs);
nbytes += (msrcs * sizeof(struct in6_addr));
CTR1(KTR_MLD, "%s: enqueueing next packet", __func__);
mbufq_enqueue(mq, m);
}
return (nbytes);
}
/*
* Type used to mark record pass completion.
* We exploit the fact we can cast to this easily from the
* current filter modes on each ip_msource node.
*/
typedef enum {
REC_NONE = 0x00, /* MCAST_UNDEFINED */
REC_ALLOW = 0x01, /* MCAST_INCLUDE */
REC_BLOCK = 0x02, /* MCAST_EXCLUDE */
REC_FULL = REC_ALLOW | REC_BLOCK
} rectype_t;
/*
* Enqueue an MLDv2 filter list change to the given output queue.
*
* Source list filter state is held in an RB-tree. When the filter list
* for a group is changed without changing its mode, we need to compute
* the deltas between T0 and T1 for each source in the filter set,
* and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
*
* As we may potentially queue two record types, and the entire R-B tree
* needs to be walked at once, we break this out into its own function
* so we can generate a tightly packed queue of packets.
*
* XXX This could be written to only use one tree walk, although that makes
* serializing into the mbuf chains a bit harder. For now we do two walks
* which makes things easier on us, and it may or may not be harder on
* the L2 cache.
*
* If successful the size of all data appended to the queue is returned,
* otherwise an error code less than zero is returned, or zero if
* no record(s) were appended.
*/
static int
mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm)
{
static const int MINRECLEN =
sizeof(struct mldv2_record) + sizeof(struct in6_addr);
struct ifnet *ifp;
struct mldv2_record mr;
struct mldv2_record *pmr;
struct ip6_msource *ims, *nims;
struct mbuf *m, *m0, *md;
int m0srcs, nbytes, npbytes, off, rsrcs, schanged;
int nallow, nblock;
uint8_t mode, now, then;
rectype_t crt, drt, nrt;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
if (inm->in6m_nsrc == 0 ||
(inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
return (0);
ifp = inm->in6m_ifp; /* interface */
mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */
crt = REC_NONE; /* current group record type */
drt = REC_NONE; /* mask of completed group record types */
nrt = REC_NONE; /* record type for current node */
m0srcs = 0; /* # source which will fit in current mbuf chain */
npbytes = 0; /* # of bytes appended this packet */
nbytes = 0; /* # of bytes appended to group's state-change queue */
rsrcs = 0; /* # sources encoded in current record */
schanged = 0; /* # nodes encoded in overall filter change */
nallow = 0; /* # of source entries in ALLOW_NEW */
nblock = 0; /* # of source entries in BLOCK_OLD */
nims = NULL; /* next tree node pointer */
/*
* For each possible filter record mode.
* The first kind of source we encounter tells us which
* is the first kind of record we start appending.
* If a node transitioned to UNDEFINED at t1, its mode is treated
* as the inverse of the group's filter mode.
*/
while (drt != REC_FULL) {
do {
m0 = mbufq_last(mq);
if (m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
MLD_V2_REPORT_MAXRECS) &&
(m0->m_pkthdr.len + MINRECLEN) <
(ifp->if_mtu - MLD_MTUSPACE)) {
m = m0;
m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
CTR1(KTR_MLD,
"%s: use previous packet", __func__);
} else {
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
CTR1(KTR_MLD,
"%s: m_get*() failed", __func__);
return (-ENOMEM);
}
m->m_pkthdr.PH_vt.vt_nrecs = 0;
mld_save_context(m, ifp);
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
npbytes = 0;
CTR1(KTR_MLD,
"%s: allocated new packet", __func__);
}
/*
* Append the MLD group record header to the
* current packet's data area.
* Recalculate pointer to free space for next
* group record, in case m_append() allocated
* a new mbuf or cluster.
*/
memset(&mr, 0, sizeof(mr));
mr.mr_addr = inm->in6m_addr;
in6_clearscope(&mr.mr_addr);
if (!m_append(m, sizeof(mr), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD,
"%s: m_append() failed", __func__);
return (-ENOMEM);
}
npbytes += sizeof(struct mldv2_record);
if (m != m0) {
/* new packet; offset in chain */
md = m_getptr(m, npbytes -
sizeof(struct mldv2_record), &off);
pmr = (struct mldv2_record *)(mtod(md,
uint8_t *) + off);
} else {
/* current packet; offset from last append */
md = m_last(m);
pmr = (struct mldv2_record *)(mtod(md,
uint8_t *) + md->m_len -
sizeof(struct mldv2_record));
}
/*
* Begin walking the tree for this record type
* pass, or continue from where we left off
* previously if we had to allocate a new packet.
* Only report deltas in-mode at t1.
* We need not report included sources as allowed
* if we are in inclusive mode on the group,
* however the converse is not true.
*/
rsrcs = 0;
if (nims == NULL) {
nims = RB_MIN(ip6_msource_tree,
&inm->in6m_srcs);
}
RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
CTR2(KTR_MLD, "%s: visit node %s", __func__,
ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
then = im6s_get_mode(inm, ims, 0);
CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d",
__func__, then, now);
if (now == then) {
CTR1(KTR_MLD,
"%s: skip unchanged", __func__);
continue;
}
if (mode == MCAST_EXCLUDE &&
now == MCAST_INCLUDE) {
CTR1(KTR_MLD,
"%s: skip IN src on EX group",
__func__);
continue;
}
nrt = (rectype_t)now;
if (nrt == REC_NONE)
nrt = (rectype_t)(~mode & REC_FULL);
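/*
* Example: a source that vanished (went to MCAST_UNDEFINED) at t1
* is reported as the inverse of the group filter mode. On an
* include-mode group (~REC_ALLOW & REC_FULL) == REC_BLOCK, so the
* removed source lands in a BLOCK_OLD record; on an exclude-mode
* group it becomes REC_ALLOW instead.
*/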
if (schanged++ == 0) {
crt = nrt;
} else if (crt != nrt)
continue;
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD,
"%s: m_append() failed", __func__);
return (-ENOMEM);
}
nallow += !!(crt == REC_ALLOW);
nblock += !!(crt == REC_BLOCK);
if (++rsrcs == m0srcs)
break;
}
/*
* If we did not append any tree nodes on this
* pass, back out of allocations.
*/
if (rsrcs == 0) {
npbytes -= sizeof(struct mldv2_record);
if (m != m0) {
CTR1(KTR_MLD,
"%s: m_free(m)", __func__);
m_freem(m);
} else {
CTR1(KTR_MLD,
"%s: m_adj(m, -mr)", __func__);
m_adj(m, -((int)sizeof(
struct mldv2_record)));
}
continue;
}
npbytes += (rsrcs * sizeof(struct in6_addr));
if (crt == REC_ALLOW)
pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
else if (crt == REC_BLOCK)
pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
pmr->mr_numsrc = htons(rsrcs);
/*
* Count the new group record, and enqueue this
* packet if it wasn't already queued.
*/
m->m_pkthdr.PH_vt.vt_nrecs++;
if (m != m0)
mbufq_enqueue(mq, m);
nbytes += npbytes;
} while (nims != NULL);
drt |= crt;
crt = (~crt & REC_FULL);
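/*
* The record type just completed is folded into drt and crt flips
* to the remaining type for the second tree walk; once both bits
* are set (REC_FULL) the outer loop terminates.
*/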
}
CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
nallow, nblock);
return (nbytes);
}
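/*
* Merge the group's pending state-change queue into the provided
* per-interface state-change queue (scq). If further retransmissions
* remain for the group, each message is copied rather than dequeued
* so it stays available for the next pass.
* Returns 0 on success, or ENOMEM if a copy could not be made.
*/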
static int
mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq)
{
struct mbufq *gq;
struct mbuf *m; /* pending state-change */
struct mbuf *m0; /* copy of pending state-change */
struct mbuf *mt; /* last state-change in packet */
int docopy, domerge;
u_int recslen;
docopy = 0;
domerge = 0;
recslen = 0;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
/*
* If there are further pending retransmissions, make a writable
* copy of each queued state-change message before merging.
*/
if (inm->in6m_scrv > 0)
docopy = 1;
gq = &inm->in6m_scq;
#ifdef KTR
if (mbufq_first(gq) == NULL) {
CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty",
__func__, inm);
}
#endif
m = mbufq_first(gq);
while (m != NULL) {
/*
* Only merge the report into the current packet if
* there is sufficient space to do so; an MLDv2 report
* packet may only contain 65,535 group records.
* Always use a simple mbuf chain concatenation to do this,
* as large state changes for single groups may have
* allocated clusters.
*/
domerge = 0;
mt = mbufq_last(scq);
if (mt != NULL) {
recslen = m_length(m, NULL);
if ((mt->m_pkthdr.PH_vt.vt_nrecs +
m->m_pkthdr.PH_vt.vt_nrecs <=
MLD_V2_REPORT_MAXRECS) &&
(mt->m_pkthdr.len + recslen <=
(inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
domerge = 1;
}
if (!domerge && mbufq_full(gq)) {
CTR2(KTR_MLD,
"%s: outbound queue full, skipping whole packet %p",
__func__, m);
mt = m->m_nextpkt;
if (!docopy)
m_freem(m);
m = mt;
continue;
}
if (!docopy) {
CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m);
m0 = mbufq_dequeue(gq);
m = m0->m_nextpkt;
} else {
CTR2(KTR_MLD, "%s: copying %p", __func__, m);
m0 = m_dup(m, M_NOWAIT);
if (m0 == NULL)
return (ENOMEM);
m0->m_nextpkt = NULL;
m = m->m_nextpkt;
}
if (!domerge) {
CTR3(KTR_MLD, "%s: queueing %p to scq %p)",
__func__, m0, scq);
mbufq_enqueue(scq, m0);
} else {
struct mbuf *mtl; /* last mbuf of packet mt */
CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)",
__func__, m0, mt);
mtl = m_last(mt);
m0->m_flags &= ~M_PKTHDR;
mt->m_pkthdr.len += recslen;
mt->m_pkthdr.PH_vt.vt_nrecs +=
m0->m_pkthdr.PH_vt.vt_nrecs;
mtl->m_next = m0;
}
}
return (0);
}
/*
* Respond to a pending MLDv2 General Query.
*/
static void
mld_v2_dispatch_general_query(struct mld_ifsoftc *mli)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
struct in6_multi *inm;
int retval;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli->mli_version == MLD_VERSION_2,
("%s: called when version %d", __func__, mli->mli_version));
/*
* Check that there are some packets queued. If so, send them first.
* For a large number of groups the reply to a general query can take
* many packets; we should finish sending them before starting to
* queue the new reply.
*/
if (mbufq_len(&mli->mli_gq) != 0)
goto send;
ifp = mli->mli_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
KASSERT(ifp == inm->in6m_ifp,
("%s: inconsistent ifp", __func__));
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
inm->in6m_state = MLD_REPORTING_MEMBER;
retval = mld_v2_enqueue_group_record(&mli->mli_gq,
inm, 0, 0, 0, 0);
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
IF_ADDR_RUNLOCK(ifp);
send:
mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST);
/*
* Slew transmission of bursts over 500ms intervals.
*/
if (mbufq_first(&mli->mli_gq) != NULL) {
mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
MLD_RESPONSE_BURST_INTERVAL);
V_interface_timers_running6 = 1;
}
}
/*
* Transmit the next pending message in the output queue.
*
* VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
* MRT: Nothing needs to be done, as MLD traffic is always local to
* a link and uses a link-scope multicast address.
*/
static void
mld_dispatch_packet(struct mbuf *m)
{
struct ip6_moptions im6o;
struct ifnet *ifp;
struct ifnet *oifp;
struct mbuf *m0;
struct mbuf *md;
struct ip6_hdr *ip6;
struct mld_hdr *mld;
int error;
int off;
int type;
uint32_t ifindex;
CTR2(KTR_MLD, "%s: transmit %p", __func__, m);
/*
* Set VNET image pointer from enqueued mbuf chain
* before doing anything else. Whilst we use interface
* indexes to guard against interface detach, they are
* unique to each VIMAGE and must be retrieved.
*/
ifindex = mld_restore_context(m);
/*
* Check if the ifnet still exists. This limits the scope of
* any race in the absence of a global ifp lock for low cost
* (an array lookup).
*/
ifp = ifnet_byindex(ifindex);
if (ifp == NULL) {
CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.",
__func__, m, ifindex);
m_freem(m);
IP6STAT_INC(ip6s_noroute);
goto out;
}
im6o.im6o_multicast_hlim = 1;
im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL);
im6o.im6o_multicast_ifp = ifp;
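/*
* MLD messages are sent with an IPv6 hop limit of 1; loop the
* packet back only when an IPv6 multicast router is running on
* this host.
*/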
if (m->m_flags & M_MLDV1) {
m0 = m;
} else {
m0 = mld_v2_encap_report(ifp, m);
if (m0 == NULL) {
CTR2(KTR_MLD, "%s: dropped %p", __func__, m);
IP6STAT_INC(ip6s_odropped);
goto out;
}
}
mld_scrub_context(m0);
m_clrprotoflags(m);
m0->m_pkthdr.rcvif = V_loif;
ip6 = mtod(m0, struct ip6_hdr *);
#if 0
(void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */
#else
/*
* XXX XXX Break some KPI rules to prevent an LOR which would
* occur if we called in6_setscope() at transmission.
* See comments at top of file.
*/
MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index);
#endif
/*
* Retrieve the ICMPv6 type before handoff to ip6_output(),
* so we can bump the stats.
*/
md = m_getptr(m0, sizeof(struct ip6_hdr), &off);
mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off);
type = mld->mld_type;
error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o,
&oifp, NULL);
if (error) {
CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error);
goto out;
}
ICMP6STAT_INC(icp6s_outhist[type]);
if (oifp != NULL) {
icmp6_ifstat_inc(oifp, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_REPORT:
case MLDV2_LISTENER_REPORT:
icmp6_ifstat_inc(oifp, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(oifp, ifs6_out_mlddone);
break;
}
}
out:
return;
}
/*
* Encapsulate an MLDv2 report.
*
* KAME IPv6 requires that hop-by-hop options be passed separately,
* and that the IPv6 header be prepended in a separate mbuf.
*
* Returns a pointer to the new mbuf chain head, or NULL if the
* allocation failed.
*/
static struct mbuf *
mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m)
{
struct mbuf *mh;
struct mldv2_report *mld;
struct ip6_hdr *ip6;
struct in6_ifaddr *ia;
int mldreclen;
KASSERT(ifp != NULL, ("%s: null ifp", __func__));
KASSERT((m->m_flags & M_PKTHDR),
("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
/*
* RFC3590: OK to send as :: or tentative during DAD.
*/
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if (ia == NULL)
CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__);
mh = m_gethdr(M_NOWAIT, MT_DATA);
if (mh == NULL) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
m_freem(m);
return (NULL);
}
M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
mldreclen = m_length(m, NULL);
CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen);
mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report);
mh->m_pkthdr.len = sizeof(struct ip6_hdr) +
sizeof(struct mldv2_report) + mldreclen;
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
if (ia != NULL)
ifa_free(&ia->ia_ifa);
ip6->ip6_dst = in6addr_linklocal_allv2routers;
/* scope ID will be set in netisr */
mld = (struct mldv2_report *)(ip6 + 1);
mld->mld_type = MLDV2_LISTENER_REPORT;
mld->mld_code = 0;
mld->mld_cksum = 0;
mld->mld_v2_reserved = 0;
mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs);
m->m_pkthdr.PH_vt.vt_nrecs = 0;
mh->m_next = m;
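/*
* The checksum starts right after the IPv6 header and covers the
* report header plus all appended group records; in6_cksum() also
* folds in the IPv6 pseudo-header.
*/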
mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
return (mh);
}
#ifdef KTR
static char *
mld_rec_type_to_str(const int type)
{
switch (type) {
case MLD_CHANGE_TO_EXCLUDE_MODE:
return "TO_EX";
break;
case MLD_CHANGE_TO_INCLUDE_MODE:
return "TO_IN";
break;
case MLD_MODE_IS_EXCLUDE:
return "MODE_EX";
break;
case MLD_MODE_IS_INCLUDE:
return "MODE_IN";
break;
case MLD_ALLOW_NEW_SOURCES:
return "ALLOW_NEW";
break;
case MLD_BLOCK_OLD_SOURCES:
return "BLOCK_OLD";
break;
default:
break;
}
return "unknown";
}
#endif
static void
mld_init(void *unused __unused)
{
CTR1(KTR_MLD, "%s: initializing", __func__);
MLD_LOCK_INIT();
ip6_initpktopts(&mld_po);
mld_po.ip6po_hlim = 1;
mld_po.ip6po_hbh = &mld_ra.hbh;
mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
mld_po.ip6po_flags = IP6PO_DONTFRAG;
}
SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL);
static void
mld_uninit(void *unused __unused)
{
CTR1(KTR_MLD, "%s: tearing down", __func__);
MLD_LOCK_DESTROY();
}
SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL);
static void
vnet_mld_init(const void *unused __unused)
{
CTR1(KTR_MLD, "%s: initializing", __func__);
LIST_INIT(&V_mli_head);
}
VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init,
NULL);
static void
vnet_mld_uninit(const void *unused __unused)
{
/* This can happen if we shutdown the network stack. */
CTR1(KTR_MLD, "%s: tearing down", __func__);
}
VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit,
NULL);
static int
mld_modevent(module_t mod, int type, void *unused __unused)
{
switch (type) {
case MOD_LOAD:
case MOD_UNLOAD:
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t mld_mod = {
"mld",
mld_modevent,
0
};
DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY);
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c (revision 327172)
+++ head/sys/netinet6/nd6.c (revision 327173)
@@ -1,2765 +1,2760 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/iso88025.h>
#include <net/fddi.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <net/if_llatbl.h>
#include <netinet/if_ether.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/icmp6.h>
#include <netinet6/send.h>
#include <sys/limits.h>
#include <security/mac/mac_framework.h>
#define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
#define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
#define SIN6(s) ((const struct sockaddr_in6 *)(s))
MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
/* timer values */
VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 second */
VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 seconds */
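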
VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */
VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */
VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for
* local traffic */
VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage
* collection timer */
/* preventing too many loops in ND option parsing */
static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper
* layer hints */
static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved
* ND entries */
#define V_nd6_maxndopt VNET(nd6_maxndopt)
#define V_nd6_maxqueuelen VNET(nd6_maxqueuelen)
#ifdef ND6_DEBUG
VNET_DEFINE(int, nd6_debug) = 1;
#else
VNET_DEFINE(int, nd6_debug) = 0;
#endif
static eventhandler_tag lle_event_eh, iflladdr_event_eh;
VNET_DEFINE(struct nd_drhead, nd_defrouter);
VNET_DEFINE(struct nd_prhead, nd_prefix);
VNET_DEFINE(struct rwlock, nd6_lock);
VNET_DEFINE(uint64_t, nd6_list_genid);
VNET_DEFINE(struct mtx, nd6_onlink_mtx);
VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
#define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval)
int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *,
struct ifnet *);
static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
static void nd6_slowtimo(void *);
static int regen_tmpaddr(struct in6_ifaddr *);
static void nd6_free(struct llentry **, int);
static void nd6_free_redirect(const struct llentry *);
static void nd6_llinfo_timer(void *);
static void nd6_llinfo_settimer_locked(struct llentry *, long);
static void clear_llinfo_pqueue(struct llentry *);
static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *,
const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **);
static int nd6_need_cache(struct ifnet *);
static VNET_DEFINE(struct callout, nd6_slowtimo_ch);
#define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch)
VNET_DEFINE(struct callout, nd6_timer_ch);
#define V_nd6_timer_ch VNET(nd6_timer_ch)
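/*
* Translate llentry resolution/expiry events into RTM_ADD/RTM_DELETE
* routing-socket messages so userland can track changes to the IPv6
* neighbor cache.
*/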
static void
nd6_lle_event(void *arg __unused, struct llentry *lle, int evt)
{
struct rt_addrinfo rtinfo;
struct sockaddr_in6 dst;
struct sockaddr_dl gw;
struct ifnet *ifp;
int type;
int fibnum;
LLE_WLOCK_ASSERT(lle);
if (lltable_get_af(lle->lle_tbl) != AF_INET6)
return;
switch (evt) {
case LLENTRY_RESOLVED:
type = RTM_ADD;
KASSERT(lle->la_flags & LLE_VALID,
("%s: %p resolved but not valid?", __func__, lle));
break;
case LLENTRY_EXPIRED:
type = RTM_DELETE;
break;
default:
return;
}
ifp = lltable_get_ifp(lle->lle_tbl);
bzero(&dst, sizeof(dst));
bzero(&gw, sizeof(gw));
bzero(&rtinfo, sizeof(rtinfo));
lltable_fill_sa_entry(lle, (struct sockaddr *)&dst);
dst.sin6_scope_id = in6_getscopezone(ifp,
in6_addrscope(&dst.sin6_addr));
gw.sdl_len = sizeof(struct sockaddr_dl);
gw.sdl_family = AF_LINK;
gw.sdl_alen = ifp->if_addrlen;
gw.sdl_index = ifp->if_index;
gw.sdl_type = ifp->if_type;
if (evt == LLENTRY_RESOLVED)
bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen);
rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst;
rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw;
rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY;
fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib;
rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | (
type == RTM_ADD ? RTF_UP: 0), 0, fibnum);
}
/*
* A handler for interface link layer address change event.
*/
static void
nd6_iflladdr(void *arg __unused, struct ifnet *ifp)
{
lltable_update_ifaddr(LLTABLE6(ifp));
}
void
nd6_init(void)
{
mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF);
rw_init(&V_nd6_lock, "nd6 list");
LIST_INIT(&V_nd_prefix);
TAILQ_INIT(&V_nd_defrouter);
/* Start timers. */
callout_init(&V_nd6_slowtimo_ch, 0);
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, curvnet);
callout_init(&V_nd6_timer_ch, 0);
callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet);
nd6_dad_init();
if (IS_DEFAULT_VNET(curvnet)) {
lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event,
NULL, EVENTHANDLER_PRI_ANY);
iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event,
nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
}
}
#ifdef VIMAGE
void
nd6_destroy()
{
callout_drain(&V_nd6_slowtimo_ch);
callout_drain(&V_nd6_timer_ch);
if (IS_DEFAULT_VNET(curvnet)) {
EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh);
}
rw_destroy(&V_nd6_lock);
mtx_destroy(&V_nd6_onlink_mtx);
}
#endif
struct nd_ifinfo *
nd6_ifattach(struct ifnet *ifp)
{
struct nd_ifinfo *nd;
nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO);
nd->initialized = 1;
nd->chlim = IPV6_DEFHLIM;
nd->basereachable = REACHABLE_TIME;
nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
nd->retrans = RETRANS_TIMER;
nd->flags = ND6_IFF_PERFORMNUD;
/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
* XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
* default regardless of the V_ip6_auto_linklocal configuration to
* give a reasonable default behavior.
*/
if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) ||
(ifp->if_flags & IFF_LOOPBACK))
nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
/*
* A loopback interface does not need to accept RTADV.
* XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
* default regardless of the V_ip6_accept_rtadv configuration to
* prevent the interface from accepting RA messages arrived
* on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
*/
if (V_ip6_accept_rtadv &&
!(ifp->if_flags & IFF_LOOPBACK) &&
(ifp->if_type != IFT_BRIDGE))
nd->flags |= ND6_IFF_ACCEPT_RTADV;
if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK))
nd->flags |= ND6_IFF_NO_RADR;
/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
nd6_setmtu0(ifp, nd);
return nd;
}
void
nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd)
{
struct ifaddr *ifa, *next;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
/* stop DAD processing */
nd6_dad_stop(ifa);
}
IF_ADDR_RUNLOCK(ifp);
free(nd, M_IP6NDP);
}
/*
* Reset ND level link MTU. This function is called when the physical MTU
* changes, which means we might have to adjust the ND level MTU.
*/
void
nd6_setmtu(struct ifnet *ifp)
{
if (ifp->if_afdata[AF_INET6] == NULL)
return;
nd6_setmtu0(ifp, ND_IFINFO(ifp));
}
/* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */
void
nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
{
u_int32_t omaxmtu;
omaxmtu = ndi->maxmtu;
switch (ifp->if_type) {
case IFT_ARCNET:
ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
break;
case IFT_FDDI:
ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
break;
case IFT_ISO88025:
ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
break;
default:
ndi->maxmtu = ifp->if_mtu;
break;
}
/*
* Decreasing the interface MTU below the IPv6 minimum MTU may cause
* an undesirable situation. We thus notify the operator of the change
* explicitly. The check for omaxmtu is necessary to restrict the
* log to the case of changing the MTU, not initializing it.
*/
if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
log(LOG_NOTICE, "nd6_setmtu0: "
"new link MTU on %s (%lu) is too small for IPv6\n",
if_name(ifp), (unsigned long)ndi->maxmtu);
}
if (ndi->maxmtu > V_in6_maxmtu)
in6_setmaxmtu(); /* check all interfaces just in case */
}
void
nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
{
bzero(ndopts, sizeof(*ndopts));
ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
ndopts->nd_opts_last
= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
if (icmp6len == 0) {
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
}
/*
* Take one ND option.
*/
struct nd_opt_hdr *
nd6_option(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int olen;
KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
__func__));
if (ndopts->nd_opts_search == NULL)
return NULL;
if (ndopts->nd_opts_done)
return NULL;
nd_opt = ndopts->nd_opts_search;
/* make sure nd_opt_len is inside the buffer */
if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
olen = nd_opt->nd_opt_len << 3;
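/* The ND option length field counts units of 8 octets (RFC 4861). */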
if (olen == 0) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
/* option overruns the end of buffer, invalid */
bzero(ndopts, sizeof(*ndopts));
return NULL;
} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
/* reached the end of options chain */
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
return nd_opt;
}
/*
* Parse multiple ND options.
* This function is much easier to use for ND routines that do not need
* multiple options of the same type.
*/
int
nd6_options(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int i = 0;
KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
__func__));
if (ndopts->nd_opts_search == NULL)
return 0;
while (1) {
nd_opt = nd6_option(ndopts);
if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
ICMP6STAT_INC(icp6s_nd_badopt);
bzero(ndopts, sizeof(*ndopts));
return -1;
}
if (nd_opt == NULL)
goto skip1;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LINKADDR:
case ND_OPT_TARGET_LINKADDR:
case ND_OPT_MTU:
case ND_OPT_REDIRECTED_HEADER:
case ND_OPT_NONCE:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
nd6log((LOG_INFO,
"duplicated ND6 option found (type=%d)\n",
nd_opt->nd_opt_type));
/* XXX bark? */
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
break;
case ND_OPT_PREFIX_INFORMATION:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
ndopts->nd_opts_pi_end =
(struct nd_opt_prefix_info *)nd_opt;
break;
/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
case ND_OPT_RDNSS: /* RFC 6106 */
case ND_OPT_DNSSL: /* RFC 6106 */
/*
* Silently ignore options we know and do not care about
* in the kernel.
*/
break;
default:
/*
* Unknown options must be silently ignored,
* to accommodate future extension to the protocol.
*/
nd6log((LOG_DEBUG,
"nd6_options: unsupported option %d - "
"option ignored\n", nd_opt->nd_opt_type));
}
skip1:
i++;
if (i > V_nd6_maxndopt) {
ICMP6STAT_INC(icp6s_nd_toomanyopt);
nd6log((LOG_INFO, "too many loop in nd opt\n"));
break;
}
if (ndopts->nd_opts_done)
break;
}
return 0;
}
/*
* ND6 timer routine to handle ND6 entries
*/
static void
nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
{
int canceled;
LLE_WLOCK_ASSERT(ln);
if (tick < 0) {
ln->la_expire = 0;
ln->ln_ntick = 0;
canceled = callout_stop(&ln->lle_timer);
} else {
ln->la_expire = time_uptime + tick / hz;
LLE_ADDREF(ln);
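/*
* Timeouts larger than INT_MAX ticks are split: the remainder is
* parked in ln_ntick and re-armed in INT_MAX chunks from
* nd6_llinfo_timer().
*/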
if (tick > INT_MAX) {
ln->ln_ntick = tick - INT_MAX;
canceled = callout_reset(&ln->lle_timer, INT_MAX,
nd6_llinfo_timer, ln);
} else {
ln->ln_ntick = 0;
canceled = callout_reset(&ln->lle_timer, tick,
nd6_llinfo_timer, ln);
}
}
if (canceled > 0)
LLE_REMREF(ln);
}
/*
* Gets source address of the first packet in hold queue
* and stores it in @src.
* Returns pointer to @src (if hold queue is not empty) or NULL.
*
* Set noinline to be dtrace-friendly
*/
static __noinline struct in6_addr *
nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src)
{
struct ip6_hdr hdr;
struct mbuf *m;
if (ln->la_hold == NULL)
return (NULL);
/*
* assume every packet in la_hold has the same IP header
*/
m = ln->la_hold;
if (sizeof(hdr) > m->m_len)
return (NULL);
m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
*src = hdr.ip6_src;
return (src);
}
/*
* Checks if we need to switch from STALE state.
*
* RFC 4861 requires switching from STALE to DELAY state
* on the first packet matching the entry, waiting V_nd6_delay and
* then transitioning to PROBE state (if upper-layer confirmation
* was not received).
*
* This code behaves a bit differently:
* on a packet hit we don't change state (though the desired state
* can be inferred by the control plane). However, after V_nd6_delay
* seconds the code will transition to PROBE state (so the DELAY state
* is effectively skipped in most situations).
*
* Typically, V_nd6_gctimer is bigger than V_nd6_delay, so
* we perform the following upon entering STALE state:
*
* 1) Arm the timer to run every V_nd6_delay seconds so that, if a
* packet was transmitted at the start of a given interval, we can
* still switch to PROBE state within V_nd6_delay seconds, as the
* user expects.
*
* 2) Reschedule the timer until the original V_nd6_gctimer interval
* expires, keeping the lle in STALE state (the remaining timer
* value is stored in lle_remtime).
*
* 3) Reschedule the timer if the packet was transmitted less than
* V_nd6_delay seconds ago.
*
* Returns a non-zero value if the entry is still STALE (storing
* the next timer interval in @pdelay).
*
* Returns zero if the original timer expired or we need to switch to
* PROBE (stored in the @do_switch variable).
*/
static int
nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch)
{
int nd_delay, nd_gctimer, r_skip_req;
time_t lle_hittime;
long delay;
*do_switch = 0;
nd_gctimer = V_nd6_gctimer;
nd_delay = V_nd6_delay;
LLE_REQ_LOCK(lle);
r_skip_req = lle->r_skip_req;
lle_hittime = lle->lle_hittime;
LLE_REQ_UNLOCK(lle);
if (r_skip_req > 0) {
/*
* Nonzero r_skip_req value was set upon entering
* STALE state. Since value was not changed, no
* packets were passed using this lle. Ask for
* timer reschedule and keep STALE state.
*/
delay = (long)(MIN(nd_gctimer, nd_delay));
delay *= hz;
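/*
* Illustrative example with the defaults above (nd6_delay = 5,
* nd6_gctimer = 1 day): the timer re-arms every 5 seconds while
* lle_remtime draws down the remaining portion of the day.
*/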
if (lle->lle_remtime > delay)
lle->lle_remtime -= delay;
else {
delay = lle->lle_remtime;
lle->lle_remtime = 0;
}
if (delay == 0) {
/*
* The original nd6_gctimer timeout ended,
* no more rescheduling.
*/
return (0);
}
*pdelay = delay;
return (1);
}
/*
* Packet received. Verify timestamp
*/
delay = (long)(time_uptime - lle_hittime);
if (delay < nd_delay) {
/*
* V_nd6_delay has still not passed since the first
* hit in STALE state.
* Reschedule the timer and return.
*/
*pdelay = (long)(nd_delay - delay) * hz;
return (1);
}
/* Request switching to probe */
*do_switch = 1;
return (0);
}
/*
* Switch @lle state to new state optionally arming timers.
*
* Set noinline to be dtrace-friendly
*/
__noinline void
nd6_llinfo_setstate(struct llentry *lle, int newstate)
{
struct ifnet *ifp;
int nd_gctimer, nd_delay;
long delay, remtime;
delay = 0;
remtime = 0;
switch (newstate) {
case ND6_LLINFO_INCOMPLETE:
ifp = lle->lle_tbl->llt_ifp;
delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000;
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(lle)) {
ifp = lle->lle_tbl->llt_ifp;
delay = (long)ND_IFINFO(ifp)->reachable * hz;
}
break;
case ND6_LLINFO_STALE:
/*
* Notify fast path that we want to know if any packet
* is transmitted by setting r_skip_req.
*/
LLE_REQ_LOCK(lle);
lle->r_skip_req = 1;
LLE_REQ_UNLOCK(lle);
nd_delay = V_nd6_delay;
nd_gctimer = V_nd6_gctimer;
delay = (long)(MIN(nd_gctimer, nd_delay)) * hz;
remtime = (long)nd_gctimer * hz - delay;
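/*
* remtime is the portion of the gctimer interval not covered by
* the first short timeout; nd6_is_stale() draws it down on each
* subsequent reschedule.
*/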
break;
case ND6_LLINFO_DELAY:
lle->la_asked = 0;
delay = (long)V_nd6_delay * hz;
break;
}
if (delay > 0)
nd6_llinfo_settimer_locked(lle, delay);
lle->lle_remtime = remtime;
lle->ln_state = newstate;
}
/*
* Timer-dependent part of nd state machine.
*
* Set noinline to be dtrace-friendly
*/
static __noinline void
nd6_llinfo_timer(void *arg)
{
struct llentry *ln;
struct in6_addr *dst, *pdst, *psrc, src;
struct ifnet *ifp;
struct nd_ifinfo *ndi;
int do_switch, send_ns;
long delay;
KASSERT(arg != NULL, ("%s: arg NULL", __func__));
ln = (struct llentry *)arg;
ifp = lltable_get_ifp(ln->lle_tbl);
CURVNET_SET(ifp->if_vnet);
ND6_RLOCK();
LLE_WLOCK(ln);
if (callout_pending(&ln->lle_timer)) {
/*
* We are a bit odd here in the treatment of active/pending.
* If the pending bit is set, the callout got rescheduled before
* this handler ran, so we just want to bail: callout_reset()
* would have returned 1 and our reference would already have
* been removed by nd6_llinfo_settimer_locked() above, since
* canceled would have been 1.
* The active bit we ignore: if the callout was stopped in
* ll_tablefree() while it was currently running, callout_stop()
* would have returned 0, so that code could not stop it and did
* not delete the entry; we therefore want to go through with
* the delete here now.
*/
LLE_WUNLOCK(ln);
ND6_RUNLOCK();
CURVNET_RESTORE();
return;
}
ndi = ND_IFINFO(ifp);
send_ns = 0;
dst = &ln->r_l3addr.addr6;
pdst = dst;
if (ln->ln_ntick > 0) {
if (ln->ln_ntick > INT_MAX) {
ln->ln_ntick -= INT_MAX;
nd6_llinfo_settimer_locked(ln, INT_MAX);
} else {
ln->ln_ntick = 0;
nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
}
goto done;
}
if (ln->la_flags & LLE_STATIC) {
goto done;
}
if (ln->la_flags & LLE_DELETED) {
nd6_free(&ln, 0);
goto done;
}
switch (ln->ln_state) {
case ND6_LLINFO_INCOMPLETE:
if (ln->la_asked < V_nd6_mmaxtries) {
ln->la_asked++;
send_ns = 1;
/* Send NS to multicast address */
pdst = NULL;
} else {
struct mbuf *m = ln->la_hold;
if (m) {
struct mbuf *m0;
/*
* assuming every packet in la_hold has the
* same IP header. Send error after unlock.
*/
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
ln->la_hold = m0;
clear_llinfo_pqueue(ln);
}
nd6_free(&ln, 0);
if (m != NULL)
icmp6_error2(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0, ifp);
}
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(ln))
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
break;
case ND6_LLINFO_STALE:
if (nd6_is_stale(ln, &delay, &do_switch) != 0) {
/*
* No packet has used this entry and the GC timeout
* has not yet passed. Reschedule the timer and
* return.
*/
nd6_llinfo_settimer_locked(ln, delay);
break;
}
if (do_switch == 0) {
/*
* GC timer has ended and entry hasn't been used.
* Run Garbage collector (RFC 4861, 5.3)
*/
if (!ND6_LLINFO_PERMANENT(ln))
nd6_free(&ln, 1);
break;
}
/* Entry has been used AND delay timer has ended. */
/* FALLTHROUGH */
case ND6_LLINFO_DELAY:
if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
/* We need NUD */
ln->la_asked = 1;
nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE);
send_ns = 1;
} else
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */
break;
case ND6_LLINFO_PROBE:
if (ln->la_asked < V_nd6_umaxtries) {
ln->la_asked++;
send_ns = 1;
} else {
nd6_free(&ln, 0);
}
break;
default:
panic("%s: paths in a dark night can be confusing: %d",
__func__, ln->ln_state);
}
done:
if (ln != NULL)
ND6_RUNLOCK();
if (send_ns != 0) {
nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
psrc = nd6_llinfo_get_holdsrc(ln, &src);
LLE_FREE_LOCKED(ln);
ln = NULL;
nd6_ns_output(ifp, psrc, pdst, dst, NULL);
}
if (ln != NULL)
LLE_FREE_LOCKED(ln);
CURVNET_RESTORE();
}
/*
* ND6 timer routine to expire default route list and prefix list
*/
void
nd6_timer(void *arg)
{
CURVNET_SET((struct vnet *) arg);
struct nd_drhead drq;
struct nd_prhead prl;
struct nd_defrouter *dr, *ndr;
struct nd_prefix *pr, *npr;
struct in6_ifaddr *ia6, *nia6;
uint64_t genid;
TAILQ_INIT(&drq);
LIST_INIT(&prl);
ND6_WLOCK();
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr)
if (dr->expire && dr->expire < time_uptime)
defrouter_unlink(dr, &drq);
ND6_WUNLOCK();
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
/*
* Expire interface addresses.
* In the past the loop was inside prefix expiry processing.
* However, from a stricter spec-conformance standpoint, we should
* rather separate address lifetimes and prefix lifetimes.
*
* XXXRW: in6_ifaddrhead locking.
*/
addrloop:
TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
/* check address lifetime */
if (IFA6_IS_INVALID(ia6)) {
int regen = 0;
/*
* If the expiring address is temporary, try
* regenerating a new one. This would be useful when
* we suspended a laptop PC, then turned it on after a
* period that could invalidate all temporary
* addresses. Although we may have to restart the
* loop (see below), it must be after purging the
* address. Otherwise, we'd see an infinite loop of
* regeneration.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
if (regen_tmpaddr(ia6) == 0)
regen = 1;
}
in6_purgeaddr(&ia6->ia_ifa);
if (regen)
goto addrloop; /* XXX: see below */
} else if (IFA6_IS_DEPRECATED(ia6)) {
int oldflags = ia6->ia6_flags;
ia6->ia6_flags |= IN6_IFF_DEPRECATED;
/*
* If a temporary address has just become deprecated,
* regenerate a new one if possible.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
(oldflags & IN6_IFF_DEPRECATED) == 0) {
if (regen_tmpaddr(ia6) == 0) {
/*
* A new temporary address is
* generated.
* XXX: this means the address chain
* has changed while we are still in
* the loop. Although the change
* would not cause disaster (because
* it's not a deletion, but an
* addition,) we'd rather restart the
* loop just for safety. Or does this
* significantly reduce performance??
*/
goto addrloop;
}
}
} else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) {
/*
* Schedule DAD for a tentative address. This happens
* if the interface was down or not running
* when the address was configured.
*/
int delay;
delay = arc4random() %
(MAX_RTR_SOLICITATION_DELAY * hz);
nd6_dad_start((struct ifaddr *)ia6, delay);
} else {
/*
* Check status of the interface. If it is down,
* mark the address as tentative for future DAD.
*/
if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 ||
(ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING)
== 0 ||
(ND_IFINFO(ia6->ia_ifp)->flags &
ND6_IFF_IFDISABLED) != 0) {
ia6->ia6_flags &= ~IN6_IFF_DUPLICATED;
ia6->ia6_flags |= IN6_IFF_TENTATIVE;
}
/*
* A new RA might have made a deprecated address
* preferred.
*/
ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
}
}
ND6_WLOCK();
restart:
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
/*
* Expire prefixes. Since the pltime is only used for
* autoconfigured addresses, pltime processing for prefixes is
* not necessary.
*
* Only unlink after all derived addresses have expired. This
* may not occur until two hours after the prefix has expired
* per RFC 4862. If the prefix expires before its derived
* addresses, mark it off-link. This will be done automatically
* after unlinking if no address references remain.
*/
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME ||
time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime)
continue;
if (pr->ndpr_addrcnt == 0) {
nd6_prefix_unlink(pr, &prl);
continue;
}
if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
genid = V_nd6_list_genid;
nd6_prefix_ref(pr);
ND6_WUNLOCK();
ND6_ONLINK_LOCK();
(void)nd6_prefix_offlink(pr);
ND6_ONLINK_UNLOCK();
ND6_WLOCK();
nd6_prefix_rele(pr);
if (genid != V_nd6_list_genid)
goto restart;
}
}
ND6_WUNLOCK();
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
nd6_prefix_del(pr);
}
callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
nd6_timer, curvnet);
CURVNET_RESTORE();
}
/*
* ia6 - deprecated/invalidated temporary address
*/
static int
regen_tmpaddr(struct in6_ifaddr *ia6)
{
struct ifaddr *ifa;
struct ifnet *ifp;
struct in6_ifaddr *public_ifa6 = NULL;
ifp = ia6->ia_ifa.ifa_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct in6_ifaddr *it6;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
it6 = (struct in6_ifaddr *)ifa;
/* ignore non-autoconf addresses. */
if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
/* ignore autoconf addresses with different prefixes. */
if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
continue;
/*
* Now we are looking at an autoconf address with the same
* prefix as ours. If the address is temporary and is still
* preferred, do not create another one. It would be rare, but
* could happen, for example, when we resume a laptop PC after
* a long period.
*/
if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
!IFA6_IS_DEPRECATED(it6)) {
public_ifa6 = NULL;
break;
}
/*
* This is a public autoconf address that has the same prefix
* as ours. If it is preferred, keep it. We can't break the
* loop here, because there may be a still-preferred temporary
* address with the prefix.
*/
if (!IFA6_IS_DEPRECATED(it6))
public_ifa6 = it6;
}
if (public_ifa6 != NULL)
ifa_ref(&public_ifa6->ia_ifa);
IF_ADDR_RUNLOCK(ifp);
if (public_ifa6 != NULL) {
int e;
if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
ifa_free(&public_ifa6->ia_ifa);
log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
" tmp addr,errno=%d\n", e);
return (-1);
}
ifa_free(&public_ifa6->ia_ifa);
return (0);
}
return (-1);
}
/*
* Remove prefix and default router list entries corresponding to ifp. Neighbor
* cache entries are freed in in6_domifdetach().
*/
void
nd6_purge(struct ifnet *ifp)
{
struct nd_drhead drq;
struct nd_prhead prl;
struct nd_defrouter *dr, *ndr;
struct nd_prefix *pr, *npr;
TAILQ_INIT(&drq);
LIST_INIT(&prl);
/*
* Nuke default router list entries toward ifp.
* We defer removal of default router list entries that are installed
* in the routing table, in order to keep additional side effects as
* small as possible.
*/
ND6_WLOCK();
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
if (dr->installed)
continue;
if (dr->ifp == ifp)
defrouter_unlink(dr, &drq);
}
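/* Second pass: unlink the entries that are installed in the routing table. */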
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
if (!dr->installed)
continue;
if (dr->ifp == ifp)
defrouter_unlink(dr, &drq);
}
/*
* Remove prefixes on ifp. We should have already removed addresses on
* this interface, so no addresses should be referencing these prefixes.
*/
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
if (pr->ndpr_ifp == ifp)
nd6_prefix_unlink(pr, &prl);
}
ND6_WUNLOCK();
/* Delete the unlinked router and prefix objects. */
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
nd6_prefix_del(pr);
}
/* cancel default outgoing interface setting */
if (V_nd6_defifindex == ifp->if_index)
nd6_setdefaultiface(0);
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
/* Refresh default router list. */
defrouter_select_fib(ifp->if_fib);
}
}
/*
* The caller acquires and releases the lock on the lltbls.
* Returns the llentry locked.
*/
struct llentry *
nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
IF_AFDATA_LOCK_ASSERT(ifp);
ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6);
return (ln);
}
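/*
* Allocate an unlinked neighbor cache entry for @addr6 in the
* ND6_LLINFO_NOSTATE state. The caller is responsible for linking it
* into the interface's lltable.
*/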
struct llentry *
nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6);
if (ln != NULL)
ln->ln_state = ND6_LLINFO_NOSTATE;
return (ln);
}
/*
* Test whether a given IPv6 address is a neighbor or not, ignoring
* the actual neighbor cache. The neighbor cache is ignored in order
* to not reenter the routing code from within itself.
*/
static int
nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct nd_prefix *pr;
struct ifaddr *ifa;
struct rt_addrinfo info;
struct sockaddr_in6 rt_key;
const struct sockaddr *dst6;
uint64_t genid;
int error, fibnum;
/*
* A link-local address is always a neighbor.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
struct sockaddr_in6 sin6_copy;
u_int32_t zone;
/*
* We need sin6_copy since sa6_recoverscope() may modify the
* content (XXX).
*/
sin6_copy = *addr;
if (sa6_recoverscope(&sin6_copy))
return (0); /* XXX: should be impossible */
if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
return (0);
if (sin6_copy.sin6_scope_id == zone)
return (1);
else
return (0);
}
bzero(&rt_key, sizeof(rt_key));
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key;
/*
* If the address matches one of our addresses,
* it should be a neighbor.
* If the address matches one of our on-link prefixes, it should be a
* neighbor.
*/
ND6_RLOCK();
restart:
LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
if (pr->ndpr_ifp != ifp)
continue;
if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
dst6 = (const struct sockaddr *)&pr->ndpr_prefix;
/*
* We only need to check all FIBs if add_addr_allfibs
* is unset. If set, checking any FIB will suffice.
*/
fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0;
for (; fibnum < rt_numfibs; fibnum++) {
genid = V_nd6_list_genid;
ND6_RUNLOCK();
/*
* Restore length field before
* retrying lookup
*/
rt_key.sin6_len = sizeof(rt_key);
error = rib_lookup_info(fibnum, dst6, 0, 0,
&info);
ND6_RLOCK();
if (genid != V_nd6_list_genid)
goto restart;
if (error == 0)
break;
}
if (error != 0)
continue;
/*
* This is the case where multiple interfaces
* have the same prefix, but only one is installed
* into the routing table and that prefix entry
* is not the one being examined here. In the case
* where RADIX_MPATH is enabled, multiple route
* entries (of the same rt_key value) will be
* installed because the interface addresses all
* differ.
*/
if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
&rt_key.sin6_addr))
continue;
}
if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
&addr->sin6_addr, &pr->ndpr_mask)) {
ND6_RUNLOCK();
return (1);
}
}
ND6_RUNLOCK();
/*
* If the address is assigned to the node on the other side of
* a p2p interface, the address should be a neighbor.
*/
if (ifp->if_flags & IFF_POINTOPOINT) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sin6_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
IF_ADDR_RUNLOCK(ifp);
return 1;
}
}
IF_ADDR_RUNLOCK(ifp);
}
/*
* If the default router list is empty, all addresses are regarded
* as on-link, and thus, as a neighbor.
*/
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
TAILQ_EMPTY(&V_nd_defrouter) &&
V_nd6_defifindex == ifp->if_index) {
return (1);
}
return (0);
}
/*
* Detect if a given IPv6 address identifies a neighbor on a given link.
* XXX: should take care of the destination of a p2p link?
*/
int
nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct llentry *lle;
int rc = 0;
IF_AFDATA_UNLOCK_ASSERT(ifp);
if (nd6_is_new_addr_neighbor(addr, ifp))
return (1);
/*
* Even if the address matches none of our addresses, it might be
* in the neighbor cache.
*/
IF_AFDATA_RLOCK(ifp);
if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) {
LLE_RUNLOCK(lle);
rc = 1;
}
IF_AFDATA_RUNLOCK(ifp);
return (rc);
}
/*
* Free an nd6 llinfo entry.
* Since the function would cause significant changes in the kernel, DO NOT
* make it global, unless you have a strong reason for the change, and are sure
* that the change is safe.
*
* Set noinline to be dtrace-friendly
*/
static __noinline void
nd6_free(struct llentry **lnp, int gc)
{
struct ifnet *ifp;
struct llentry *ln;
struct nd_defrouter *dr;
ln = *lnp;
*lnp = NULL;
LLE_WLOCK_ASSERT(ln);
ND6_RLOCK_ASSERT();
ifp = lltable_get_ifp(ln->lle_tbl);
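/*
* Look up a matching default router entry only when the interface
* accepts RAs; dr controls the router-specific cleanup below.
*/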
if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0)
dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp);
else
dr = NULL;
ND6_RUNLOCK();
if ((ln->la_flags & LLE_DELETED) == 0)
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
/*
* we used to have pfctlinput(PRC_HOSTDEAD) here.
* even though it is not harmful, it was not really necessary.
*/
/* cancel timer */
nd6_llinfo_settimer_locked(ln, -1);
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
if (dr != NULL && dr->expire &&
ln->ln_state == ND6_LLINFO_STALE && gc) {
/*
* If the reason for the deletion is just garbage
* collection, and the neighbor is an active default
* router, do not delete it. Instead, reset the GC
* timer using the router's lifetime.
* Simply deleting the entry would affect default
* router selection, which is not necessarily a good
* thing, especially when we're using router preference
* values.
* XXX: the check for ln_state would be redundant,
* but we intentionally keep it just in case.
*/
if (dr->expire > time_uptime)
nd6_llinfo_settimer_locked(ln,
(dr->expire - time_uptime) * hz);
else
nd6_llinfo_settimer_locked(ln,
(long)V_nd6_gctimer * hz);
LLE_REMREF(ln);
LLE_WUNLOCK(ln);
defrouter_rele(dr);
return;
}
if (dr) {
/*
* Unreachability of a router might affect the default
* router selection and on-link detection of advertised
* prefixes.
*/
/*
* Temporarily fake the state to choose a new default
* router and to perform on-link determination of
* prefixes correctly.
* Below the state will be set correctly,
* or the entry itself will be deleted.
*/
ln->ln_state = ND6_LLINFO_INCOMPLETE;
}
if (ln->ln_router || dr) {
/*
* We need to unlock to avoid a LOR with rt6_flush() with the
* rnh and for the calls to pfxlist_onlink_check() and
* defrouter_select_fib() in the block further down for calls
* into nd6_lookup(). We still hold a ref.
*/
LLE_WUNLOCK(ln);
/*
* rt6_flush must be called whether or not the neighbor
* is in the Default Router List.
* See a corresponding comment in nd6_na_input().
*/
rt6_flush(&ln->r_l3addr.addr6, ifp);
}
if (dr) {
/*
* Since defrouter_select_fib() does not affect the
* on-link determination and MIP6 needs the check
* before the default router selection, we perform
* the check now.
*/
pfxlist_onlink_check();
/*
* Refresh default router list.
*/
defrouter_select_fib(dr->ifp->if_fib);
}
/*
* If this entry was added by an on-link redirect, remove the
* corresponding host route.
*/
if (ln->la_flags & LLE_REDIRECT)
nd6_free_redirect(ln);
if (ln->ln_router || dr)
LLE_WLOCK(ln);
}
/*
* Safe to unlock. We still hold an extra reference and will not
* free(9) in llentry_free() if someone else holds one as well.
*/
LLE_WUNLOCK(ln);
IF_AFDATA_LOCK(ifp);
LLE_WLOCK(ln);
/* Guard against race with other llentry_free(). */
if (ln->la_flags & LLE_LINKED) {
/* Remove callout reference */
LLE_REMREF(ln);
lltable_unlink_entry(ln->lle_tbl, ln);
}
IF_AFDATA_UNLOCK(ifp);
llentry_free(ln);
if (dr != NULL)
defrouter_rele(dr);
}
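/*
* rtrequest filter: match only host routes that were installed
* dynamically, i.e. by a redirect.
*/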
static int
nd6_isdynrte(const struct rtentry *rt, void *xap)
{
if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC))
return (1);
return (0);
}
/*
* Remove the rtentry for the given llentry,
* both of which were installed by a redirect.
*/
static void
nd6_free_redirect(const struct llentry *ln)
{
int fibnum;
struct sockaddr_in6 sin6;
struct rt_addrinfo info;
lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6);
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6;
info.rti_filter = nd6_isdynrte;
for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
}
/*
* Rejuvenate this function for routing-operations-related
* processing.
*/
void
nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
{
struct sockaddr_in6 *gateway;
struct nd_defrouter *dr;
struct ifnet *ifp;
gateway = (struct sockaddr_in6 *)rt->rt_gateway;
ifp = rt->rt_ifp;
switch (req) {
case RTM_ADD:
break;
case RTM_DELETE:
if (!ifp)
return;
/*
* Only indirect routes are interesting.
*/
if ((rt->rt_flags & RTF_GATEWAY) == 0)
return;
/*
* check for default route
*/
if (IN6_ARE_ADDR_EQUAL(&in6addr_any,
&SIN6(rt_key(rt))->sin6_addr)) {
dr = defrouter_lookup(&gateway->sin6_addr, ifp);
if (dr != NULL) {
dr->installed = 0;
defrouter_rele(dr);
}
}
break;
}
}
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
int error = 0;
if (ifp->if_afdata[AF_INET6] == NULL)
return (EPFNOSUPPORT);
switch (cmd) {
case OSIOCGIFINFO_IN6:
#define ND ndi->ndi
/* XXX: old ndp(8) assumes a positive value for linkmtu. */
bzero(&ND, sizeof(ND));
ND.linkmtu = IN6_LINKMTU(ifp);
ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
ND.basereachable = ND_IFINFO(ifp)->basereachable;
ND.reachable = ND_IFINFO(ifp)->reachable;
ND.retrans = ND_IFINFO(ifp)->retrans;
ND.flags = ND_IFINFO(ifp)->flags;
ND.recalctm = ND_IFINFO(ifp)->recalctm;
ND.chlim = ND_IFINFO(ifp)->chlim;
break;
case SIOCGIFINFO_IN6:
ND = *ND_IFINFO(ifp);
break;
case SIOCSIFINFO_IN6:
/*
* Used to change host variables from userland.
* Intended for use on a router to reflect RA configurations.
*/
/* 0 means 'unspecified' */
if (ND.linkmtu != 0) {
if (ND.linkmtu < IPV6_MMTU ||
ND.linkmtu > IN6_LINKMTU(ifp)) {
error = EINVAL;
break;
}
ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
}
if (ND.basereachable != 0) {
int obasereachable = ND_IFINFO(ifp)->basereachable;
ND_IFINFO(ifp)->basereachable = ND.basereachable;
if (ND.basereachable != obasereachable)
ND_IFINFO(ifp)->reachable =
ND_COMPUTE_RTIME(ND.basereachable);
}
if (ND.retrans != 0)
ND_IFINFO(ifp)->retrans = ND.retrans;
if (ND.chlim != 0)
ND_IFINFO(ifp)->chlim = ND.chlim;
/* FALLTHROUGH */
case SIOCSIFINFO_FLAGS:
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
!(ND.flags & ND6_IFF_IFDISABLED)) {
/* ifdisabled 1->0 transition */
/*
* If the interface is marked as ND6_IFF_IFDISABLED and
* has a link-local address with IN6_IFF_DUPLICATED,
* do not clear ND6_IFF_IFDISABLED.
* See RFC 4862, Section 5.4.5.
*/
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
}
IF_ADDR_RUNLOCK(ifp);
if (ifa != NULL) {
/* LLA is duplicated. */
ND.flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "Cannot enable an interface"
" with a link-local address marked"
" duplicate.\n");
} else {
ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
if (ifp->if_flags & IFF_UP)
in6_if_up(ifp);
}
} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
(ND.flags & ND6_IFF_IFDISABLED)) {
/* ifdisabled 0->1 transition */
/* Mark all IPv6 addresses as tentative. */
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
if (V_ip6_dad_count > 0 &&
(ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead,
ifa_link) {
if (ifa->ifa_addr->sa_family !=
AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
ia->ia6_flags |= IN6_IFF_TENTATIVE;
}
IF_ADDR_RUNLOCK(ifp);
}
}
if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
/* auto_linklocal 0->1 transition */
/* If no link-local address on ifp, configure */
ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
in6_ifattach(ifp, NULL);
} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
ifp->if_flags & IFF_UP) {
/*
* When the interface already has
* ND6_IFF_AUTO_LINKLOCAL set, is IFF_UP, and
* no link-local address is assigned, try to
* assign one.
*/
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead,
ifa_link) {
if (ifa->ifa_addr->sa_family !=
AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
}
IF_ADDR_RUNLOCK(ifp);
if (ifa != NULL)
/* No LLA is configured. */
in6_ifattach(ifp, NULL);
}
}
}
ND_IFINFO(ifp)->flags = ND.flags;
break;
#undef ND
case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */
/* sync kernel routing table with the default router list */
defrouter_reset();
defrouter_select();
break;
case SIOCSPFXFLUSH_IN6:
{
/* flush all the prefixes advertised by routers */
struct in6_ifaddr *ia, *ia_next;
struct nd_prefix *pr, *next;
struct nd_prhead prl;
LIST_INIT(&prl);
ND6_WLOCK();
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue; /* XXX */
nd6_prefix_unlink(pr, &prl);
}
ND6_WUNLOCK();
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
/* XXXRW: in6_ifaddrhead locking. */
TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
ia_next) {
if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
if (ia->ia6_ndpr == pr)
in6_purgeaddr(&ia->ia_ifa);
}
nd6_prefix_del(pr);
}
break;
}
case SIOCSRTRFLUSH_IN6:
{
/* flush all the default routers */
struct nd_drhead drq;
struct nd_defrouter *dr;
TAILQ_INIT(&drq);
defrouter_reset();
ND6_WLOCK();
while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL)
defrouter_unlink(dr, &drq);
ND6_WUNLOCK();
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
defrouter_select();
break;
}
case SIOCGNBRINFO_IN6:
{
struct llentry *ln;
struct in6_addr nb_addr = nbi->addr; /* make local for safety */
if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
return (error);
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&nb_addr, 0, ifp);
IF_AFDATA_RUNLOCK(ifp);
if (ln == NULL) {
error = EINVAL;
break;
}
nbi->state = ln->ln_state;
nbi->asked = ln->la_asked;
nbi->isrouter = ln->ln_router;
if (ln->la_expire == 0)
nbi->expire = 0;
else
nbi->expire = ln->la_expire + ln->lle_remtime / hz +
(time_second - time_uptime);
LLE_RUNLOCK(ln);
break;
}
case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
ndif->ifindex = V_nd6_defifindex;
break;
case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
return (nd6_setdefaultiface(ndif->ifindex));
}
return (error);
}
/*
* Calculates new isRouter value based on provided parameters and
* returns it.
*/
static int
nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr,
int ln_router)
{
/*
* ICMP6 type dependent behavior.
*
* NS: clear IsRouter if new entry
* RS: clear IsRouter
* RA: set IsRouter if there's lladdr
* redir: clear IsRouter if new entry
*
* RA case, (1):
* The spec says that we must set IsRouter in the following cases:
* - If lladdr exists, set IsRouter. This means (1-5).
* - If it is an old entry (!newentry), set IsRouter. This means (7).
* So, based on the spec, in cases (1-5) and (7) we must set IsRouter.
* A question arises for case (1): it has no lladdr in the
* neighbor cache, which makes it similar to (6).
* This case is rare, but we figured that we MUST NOT set IsRouter.
*
* is_new old_addr new_addr NS RS RA redir
* D R
* 0 n n (1) c ? s
* 0 y n (2) c s s
* 0 n y (3) c s s
* 0 y y (4) c s s
* 0 y y (5) c s s
* 1 -- n (6) c c c s
* 1 -- y (7) c c s c s
*
* (c=clear s=set)
*/
switch (type & 0xff) {
case ND_NEIGHBOR_SOLICIT:
/*
* New entry must have is_router flag cleared.
*/
if (is_new) /* (6-7) */
ln_router = 0;
break;
case ND_REDIRECT:
/*
* If the icmp is a redirect to a better router, always set the
* is_router flag. Otherwise, if the entry is newly created,
* clear the flag. [RFC 2461, sec 8.3]
*/
if (code == ND_REDIRECT_ROUTER)
ln_router = 1;
else {
if (is_new) /* (6-7) */
ln_router = 0;
}
break;
case ND_ROUTER_SOLICIT:
/*
* is_router flag must always be cleared.
*/
ln_router = 0;
break;
case ND_ROUTER_ADVERT:
/*
* Mark an entry with lladdr as a router.
*/
if ((!is_new && (old_addr || new_addr)) || /* (2-5) */
(is_new && new_addr)) { /* (7) */
ln_router = 1;
}
break;
}
return (ln_router);
}
/*
* Create neighbor cache entry and cache link-layer address,
* on reception of inbound ND6 packets. (RS/RA/NS/redirect)
*
* type - ICMP6 type
* code - type dependent information
*
*/
void
nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
int lladdrlen, int type, int code)
{
struct llentry *ln = NULL, *ln_tmp;
int is_newentry;
int do_update;
int olladdr;
int llchange;
int flags;
uint16_t router = 0;
struct sockaddr_in6 sin6;
struct mbuf *chain = NULL;
u_char linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
int lladdr_off;
IF_AFDATA_UNLOCK_ASSERT(ifp);
KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
KASSERT(from != NULL, ("%s: from == NULL", __func__));
/* nothing must be updated for unspecified address */
if (IN6_IS_ADDR_UNSPECIFIED(from))
return;
/*
* Validation about ifp->if_addrlen and lladdrlen must be done in
* the caller.
*
* XXX If the link does not have a link-layer address, what should
* we do? (ifp->if_addrlen == 0)
* The spec says nothing in the sections for RA, RS and NA. There's a
* small description of it in the NS section (RFC 2461 7.2.3).
*/
flags = lladdr ? LLE_EXCLUSIVE : 0;
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(from, flags, ifp);
IF_AFDATA_RUNLOCK(ifp);
is_newentry = 0;
if (ln == NULL) {
flags |= LLE_EXCLUSIVE;
ln = nd6_alloc(from, 0, ifp);
if (ln == NULL)
return;
/*
* Since we already know all the data for the new entry,
* fill it before insertion.
*/
if (lladdr != NULL) {
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off);
}
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(ln);
/* Prefer any existing lle over newly-created one */
ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp);
if (ln_tmp == NULL)
lltable_link_entry(LLTABLE6(ifp), ln);
IF_AFDATA_WUNLOCK(ifp);
if (ln_tmp == NULL) {
/* No existing lle, mark as new entry (6,7) */
is_newentry = 1;
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
if (lladdr != NULL) /* (7) */
EVENTHANDLER_INVOKE(lle_event, ln,
LLENTRY_RESOLVED);
} else {
lltable_free_entry(LLTABLE6(ifp), ln);
ln = ln_tmp;
ln_tmp = NULL;
}
}
/* do nothing if static ndp is set */
if ((ln->la_flags & LLE_STATIC)) {
if (flags & LLE_EXCLUSIVE)
LLE_WUNLOCK(ln);
else
LLE_RUNLOCK(ln);
return;
}
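/*
* Determine whether the entry already had a valid link-layer address
* (olladdr) and whether the newly supplied one differs from it
* (llchange).
*/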
olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
if (olladdr && lladdr) {
llchange = bcmp(lladdr, ln->ll_addr,
ifp->if_addrlen);
} else if (!olladdr && lladdr)
llchange = 1;
else
llchange = 0;
/*
* newentry olladdr lladdr llchange (*=record)
* 0 n n -- (1)
* 0 y n -- (2)
* 0 n y y (3) * STALE
* 0 y y n (4) *
* 0 y y y (5) * STALE
* 1 -- n -- (6) NOSTATE(= PASSIVE)
* 1 -- y -- (7) * STALE
*/
do_update = 0;
if (is_newentry == 0 && llchange != 0) {
do_update = 1; /* (3,5) */
/*
* Record source link-layer address
* XXX is it dependent on ifp->if_type?
*/
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off) == 0) {
/* Entry was deleted */
return;
}
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
if (ln->la_hold != NULL)
nd6_grab_holdchain(ln, &chain, &sin6);
}
/* Calculates new router status */
router = nd6_is_router(type, code, is_newentry, olladdr,
lladdr != NULL ? 1 : 0, ln->ln_router);
ln->ln_router = router;
/* Mark non-router redirects with special flag */
if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER)
ln->la_flags |= LLE_REDIRECT;
if (flags & LLE_EXCLUSIVE)
LLE_WUNLOCK(ln);
else
LLE_RUNLOCK(ln);
if (chain != NULL)
- nd6_flush_holdchain(ifp, ifp, chain, &sin6);
+ nd6_flush_holdchain(ifp, chain, &sin6);
/*
* When the link-layer address of a router changes, select the
* best router again. In particular, when the neighbor entry is newly
* created, it might affect the selection policy.
* Question: can we restrict the first condition to the "is_newentry"
* case?
* XXX: when we hear an RA from a new router with the link-layer
* address option, defrouter_select_fib() is called twice, since
* defrtrlist_update called the function as well. However, I believe
* we can compromise the overhead, since it only happens the first
* time.
* XXX: although defrouter_select_fib() should not have a bad effect
* on hosts that are not autoconfigured, we explicitly avoid such
* cases for safety.
*/
if ((do_update || is_newentry) && router &&
ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
/*
* guaranteed recursion
*/
defrouter_select_fib(ifp->if_fib);
}
}
static void
nd6_slowtimo(void *arg)
{
CURVNET_SET((struct vnet *) arg);
struct nd_ifinfo *nd6if;
struct ifnet *ifp;
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, curvnet);
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_afdata[AF_INET6] == NULL)
continue;
nd6if = ND_IFINFO(ifp);
if (nd6if->basereachable && /* already initialized */
(nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
/*
* Since reachable time rarely changes by router
* advertisements, we SHOULD ensure that a new random
* value gets recomputed at least once every few hours.
* (RFC 2461, 6.3.4)
*/
nd6if->recalctm = V_nd6_recalc_reachtm_interval;
nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
}
}
IFNET_RUNLOCK_NOSLEEP();
CURVNET_RESTORE();
}
void
nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain,
struct sockaddr_in6 *sin6)
{
LLE_WLOCK_ASSERT(ln);
*chain = ln->la_hold;
ln->la_hold = NULL;
lltable_fill_sa_entry(ln, (struct sockaddr *)sin6);
if (ln->ln_state == ND6_LLINFO_STALE) {
/*
* The first time we send a packet to a
* neighbor whose entry is STALE, we have
* to change the state to DELAY and set
* a timer to expire in DELAY_FIRST_PROBE_TIME
* seconds so that neighbor unreachability
* detection is performed on expiration.
* (RFC 2461 7.3.3)
*/
nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY);
}
}
int
nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
struct sockaddr_in6 *dst, struct route *ro)
{
int error;
int ip6len;
struct ip6_hdr *ip6;
struct m_tag *mtag;
#ifdef MAC
mac_netinet6_nd6_send(ifp, m);
#endif
/*
* If called from nd6_ns_output() (NS), nd6_na_output() (NA),
* icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
* as handled by rtsol and rtadvd), mbufs will be tagged for SeND
* to be diverted to user space. When re-injected into the kernel,
* send_output() will directly dispatch them to the outgoing interface.
*/
if (send_sendso_input_hook != NULL) {
mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
if (mtag != NULL) {
ip6 = mtod(m, struct ip6_hdr *);
ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
/* Use the SEND socket */
error = send_sendso_input_hook(m, ifp, SND_OUT,
ip6len);
/* -1 == no app on SEND socket */
if (error == 0 || error != -1)
return (error);
}
}
m_clrprotoflags(m); /* Avoid confusing lower layers. */
IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
mtod(m, struct ip6_hdr *));
if ((ifp->if_flags & IFF_LOOPBACK) == 0)
origifp = ifp;
error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro);
return (error);
}
/*
* Look up the link header for the @sa_dst address. Stores the found
* data in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* If the destination LLE does not exist or an lle state modification
* is required, the "slow" version is called.
*
* Return values:
* - 0 on success (address copied to buffer).
* - EWOULDBLOCK (no local error, but address is still unresolved)
* - other errors (alloc failure, etc)
*/
int
nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags,
struct llentry **plle)
{
struct llentry *ln = NULL;
const struct sockaddr_in6 *dst6;
if (pflags != NULL)
*pflags = 0;
dst6 = (const struct sockaddr_in6 *)sa_dst;
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return (ENETDOWN); /* better error? */
}
if (m != NULL && m->m_flags & M_MCAST) {
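/*
* Multicast destinations need no neighbor resolution; map the
* IPv6 group address directly to a link-layer multicast address
* on Ethernet-like interfaces.
*/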
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_ISO88025:
ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr,
desten);
return (0);
default:
m_freem(m);
return (EAFNOSUPPORT);
}
}
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED,
ifp);
if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) {
/* Entry found, let's copy lle info */
bcopy(ln->r_linkdata, desten, ln->r_hdrlen);
if (pflags != NULL)
*pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR);
/* Check if we have feedback request from nd6 timer */
if (ln->r_skip_req != 0) {
LLE_REQ_LOCK(ln);
ln->r_skip_req = 0; /* Notify that entry was used */
ln->lle_hittime = time_uptime;
LLE_REQ_UNLOCK(ln);
}
if (plle) {
LLE_ADDREF(ln);
*plle = ln;
LLE_WUNLOCK(ln);
}
IF_AFDATA_RUNLOCK(ifp);
return (0);
} else if (plle && ln)
LLE_WUNLOCK(ln);
IF_AFDATA_RUNLOCK(ifp);
return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle));
}
/*
* Do L2 address resolution for the @sa_dst address. Stores the found
* address in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* Heavy version.
* The function assumes that the destination LLE does not exist,
* or is invalid or stale, so the LLE_EXCLUSIVE lock needs to be acquired.
*
* Set noinline to be dtrace-friendly
*/
static __noinline int
nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m,
const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags,
struct llentry **plle)
{
struct llentry *lle = NULL, *lle_tmp;
struct in6_addr *psrc, src;
int send_ns, ll_len;
char *lladdr;
/*
* Address resolution or Neighbor Unreachability Detection
* for the next hop.
* At this point, the destination of the packet must be a unicast
* or an anycast address (i.e. not a multicast).
*/
if (lle == NULL) {
IF_AFDATA_RLOCK(ifp);
lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
IF_AFDATA_RUNLOCK(ifp);
if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) {
/*
* Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
* the condition below is not very efficient. But we believe
* it is tolerable, because this should be a rare case.
*/
lle = nd6_alloc(&dst->sin6_addr, 0, ifp);
if (lle == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"nd6_output: can't allocate llinfo for %s "
"(ln=%p)\n",
ip6_sprintf(ip6buf, &dst->sin6_addr), lle);
m_freem(m);
return (ENOBUFS);
}
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(lle);
/* Prefer any existing entry over newly-created one */
lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
if (lle_tmp == NULL)
lltable_link_entry(LLTABLE6(ifp), lle);
IF_AFDATA_WUNLOCK(ifp);
if (lle_tmp != NULL) {
lltable_free_entry(LLTABLE6(ifp), lle);
lle = lle_tmp;
lle_tmp = NULL;
}
}
}
if (lle == NULL) {
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
m_freem(m);
return (ENOBUFS);
}
if (m != NULL)
m_freem(m);
return (ENOBUFS);
}
LLE_WLOCK_ASSERT(lle);
/*
* The first time we send a packet to a neighbor whose entry is
* STALE, we have to change the state to DELAY and set a timer to
* expire in DELAY_FIRST_PROBE_TIME seconds so that
* neighbor unreachability detection is performed on expiration.
* (RFC 2461 7.3.3)
*/
if (lle->ln_state == ND6_LLINFO_STALE)
nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY);
/*
* If the neighbor cache entry has a state other than INCOMPLETE
* (i.e. its link-layer address is already resolved), just
* send the packet.
*/
if (lle->ln_state > ND6_LLINFO_INCOMPLETE) {
if (flags & LLE_ADDRONLY) {
lladdr = lle->ll_addr;
ll_len = ifp->if_addrlen;
} else {
lladdr = lle->r_linkdata;
ll_len = lle->r_hdrlen;
}
bcopy(lladdr, desten, ll_len);
if (pflags != NULL)
*pflags = lle->la_flags;
if (plle) {
LLE_ADDREF(lle);
*plle = lle;
}
LLE_WUNLOCK(lle);
return (0);
}
/*
* There is a neighbor cache entry, but no ethernet address
* response yet. Append this latest packet to the end of the
* hold queue. When the queue length exceeds nd6_maxqueuelen,
* the oldest packet in the queue will be removed.
*/
if (lle->la_hold != NULL) {
struct mbuf *m_hold;
int i;
i = 0;
for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){
i++;
if (m_hold->m_nextpkt == NULL) {
m_hold->m_nextpkt = m;
break;
}
}
while (i >= V_nd6_maxqueuelen) {
m_hold = lle->la_hold;
lle->la_hold = lle->la_hold->m_nextpkt;
m_freem(m_hold);
i--;
}
} else {
lle->la_hold = m;
}
/*
* If there has been no NS for the neighbor after entering the
* INCOMPLETE state, send the first solicitation.
* Note that for newly-created lle la_asked will be 0,
* so we will transition from ND6_LLINFO_NOSTATE to
* ND6_LLINFO_INCOMPLETE state here.
*/
psrc = NULL;
send_ns = 0;
if (lle->la_asked == 0) {
lle->la_asked++;
send_ns = 1;
psrc = nd6_llinfo_get_holdsrc(lle, &src);
nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE);
}
LLE_WUNLOCK(lle);
if (send_ns != 0)
nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL);
return (EWOULDBLOCK);
}
/*
* Do L2 address resolution for the @sa_dst address. Stores the found
* address in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* Return values:
* - 0 on success (address copied to buffer).
* - EWOULDBLOCK (no local error, but address is still unresolved)
* - other errors (alloc failure, etc)
*/
int
nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
char *desten, uint32_t *pflags)
{
int error;
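/*
* LLE_ADDRONLY requests the bare link-layer address rather than the
* precalculated link-layer header.
*/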
flags |= LLE_ADDRONLY;
error = nd6_resolve_slow(ifp, flags, NULL,
(const struct sockaddr_in6 *)dst, desten, pflags, NULL);
return (error);
}
int
-nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain,
+nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain,
struct sockaddr_in6 *dst)
{
struct mbuf *m, *m_head;
- struct ifnet *outifp;
int error = 0;
m_head = chain;
- if ((ifp->if_flags & IFF_LOOPBACK) != 0)
- outifp = origifp;
- else
- outifp = ifp;
-
+
while (m_head) {
m = m_head;
m_head = m_head->m_nextpkt;
- error = nd6_output_ifp(ifp, origifp, m, dst, NULL);
+ error = nd6_output_ifp(ifp, ifp, m, dst, NULL);
}
/*
* XXX
* note that intermediate errors are blindly ignored
*/
return (error);
-}
+}
static int
nd6_need_cache(struct ifnet *ifp)
{
/*
* XXX: we currently do not maintain a neighbor cache on any interface
* other than ARCnet, Ethernet, FDDI and GIF.
*
* RFC2893 says:
* - unidirectional tunnels need no ND
*/
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
case IFT_L2VLAN:
case IFT_INFINIBAND:
case IFT_BRIDGE:
case IFT_PROPVIRTUAL:
return (1);
default:
return (0);
}
}
/*
* Add a permanent ND6 link-layer record for the given
* interface address.
*
* Very similar to IPv4 arp_ifinit(), but:
* 1) IPv6 DAD is performed in a different place
* 2) It is called by the IPv6 protocol stack, in contrast to
* arp_ifinit(), which is typically called from the SIOCSIFADDR
* driver ioctl handler.
*
*/
int
nd6_add_ifa_lle(struct in6_ifaddr *ia)
{
struct ifnet *ifp;
struct llentry *ln, *ln_tmp;
struct sockaddr *dst;
ifp = ia->ia_ifa.ifa_ifp;
if (nd6_need_cache(ifp) == 0)
return (0);
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
dst = (struct sockaddr *)&ia->ia_addr;
ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst);
if (ln == NULL)
return (ENOBUFS);
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(ln);
/* Unlink any existing entry */
ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst);
if (ln_tmp != NULL)
lltable_unlink_entry(LLTABLE6(ifp), ln_tmp);
lltable_link_entry(LLTABLE6(ifp), ln);
IF_AFDATA_WUNLOCK(ifp);
if (ln_tmp != NULL)
EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED);
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
LLE_WUNLOCK(ln);
if (ln_tmp != NULL)
llentry_free(ln_tmp);
return (0);
}
/*
* Removes either all lle entries for the given @ia, or the lle
* corresponding to the @ia address.
*/
void
nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all)
{
struct sockaddr_in6 mask, addr;
struct sockaddr *saddr, *smask;
struct ifnet *ifp;
ifp = ia->ia_ifa.ifa_ifp;
memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
saddr = (struct sockaddr *)&addr;
smask = (struct sockaddr *)&mask;
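/*
* With @all set, purge every entry covered by the address prefix;
* otherwise delete only the LLE_IFADDR entry for the address itself.
*/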
if (all != 0)
lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC);
else
lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr);
}
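/*
* Free all mbufs held on the llentry's packet queue.
*/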
static void
clear_llinfo_pqueue(struct llentry *ln)
{
struct mbuf *m_hold, *m_hold_next;
for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_freem(m_hold);
}
ln->la_hold = NULL;
}
static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_icmp6);
SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter",
"NDP default router list");
SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, nd6_sysctl_prlist, "S,in6_prefix",
"NDP prefix list");
SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
static int
nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
{
struct in6_defrouter d;
struct nd_defrouter *dr;
int error;
if (req->newptr != NULL)
return (EPERM);
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
bzero(&d, sizeof(d));
d.rtaddr.sin6_family = AF_INET6;
d.rtaddr.sin6_len = sizeof(d.rtaddr);
ND6_RLOCK();
TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
d.rtaddr.sin6_addr = dr->rtaddr;
error = sa6_recoverscope(&d.rtaddr);
if (error != 0)
break;
d.flags = dr->raflags;
d.rtlifetime = dr->rtlifetime;
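/* Convert the uptime-based expiration time to wall-clock time for userland. */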
d.expire = dr->expire + (time_second - time_uptime);
d.if_index = dr->ifp->if_index;
error = SYSCTL_OUT(req, &d, sizeof(d));
if (error != 0)
break;
}
ND6_RUNLOCK();
return (error);
}
static int
nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
{
struct in6_prefix p;
struct sockaddr_in6 s6;
struct nd_prefix *pr;
struct nd_pfxrouter *pfr;
time_t maxexpire;
int error;
char ip6buf[INET6_ADDRSTRLEN];
if (req->newptr)
return (EPERM);
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
bzero(&p, sizeof(p));
p.origin = PR_ORIG_RA;
bzero(&s6, sizeof(s6));
s6.sin6_family = AF_INET6;
s6.sin6_len = sizeof(s6);
ND6_RLOCK();
LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
p.prefix = pr->ndpr_prefix;
if (sa6_recoverscope(&p.prefix)) {
log(LOG_ERR, "scope error in prefix list (%s)\n",
ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
/* XXX: press on... */
}
p.raflags = pr->ndpr_raf;
p.prefixlen = pr->ndpr_plen;
p.vltime = pr->ndpr_vltime;
p.pltime = pr->ndpr_pltime;
p.if_index = pr->ndpr_ifp->if_index;
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
p.expire = 0;
else {
/* XXX: we assume time_t is signed. */
maxexpire = (-1) &
~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
p.expire = pr->ndpr_lastupdate +
pr->ndpr_vltime +
(time_second - time_uptime);
else
p.expire = maxexpire;
}
p.refcnt = pr->ndpr_addrcnt;
p.flags = pr->ndpr_stateflags;
p.advrtrs = 0;
LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
p.advrtrs++;
error = SYSCTL_OUT(req, &p, sizeof(p));
if (error != 0)
break;
LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
s6.sin6_addr = pfr->router->rtaddr;
if (sa6_recoverscope(&s6))
log(LOG_ERR,
"scope error in prefix list (%s)\n",
ip6_sprintf(ip6buf, &pfr->router->rtaddr));
error = SYSCTL_OUT(req, &s6, sizeof(s6));
if (error != 0)
goto out;
}
}
out:
ND6_RUNLOCK();
return (error);
}
Index: head/sys/netinet6/nd6.h
===================================================================
--- head/sys/netinet6/nd6.h (revision 327172)
+++ head/sys/netinet6/nd6.h (revision 327173)
@@ -1,499 +1,499 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $
* $FreeBSD$
*/
#ifndef _NETINET6_ND6_H_
#define _NETINET6_ND6_H_
/* see net/route.h, or net/if_inarp.h */
#ifndef RTF_ANNOUNCE
#define RTF_ANNOUNCE RTF_PROTO2
#endif
#include <sys/queue.h>
#include <sys/callout.h>
struct llentry;
#define ND6_LLINFO_NOSTATE -2
/*
* We don't need the WAITDELETE state any more, but we keep the definition
* in a comment line instead of removing it. This is necessary to avoid
* unintentionally reusing the value for another purpose, which might
* affect backward compatibility with old applications.
* (20000711 jinmei@kame.net)
*/
/* #define ND6_LLINFO_WAITDELETE -1 */
#define ND6_LLINFO_INCOMPLETE 0
#define ND6_LLINFO_REACHABLE 1
#define ND6_LLINFO_STALE 2
#define ND6_LLINFO_DELAY 3
#define ND6_LLINFO_PROBE 4
#define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE)
#define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE))
struct nd_ifinfo {
u_int32_t linkmtu; /* LinkMTU */
u_int32_t maxmtu; /* Upper bound of LinkMTU */
u_int32_t basereachable; /* BaseReachableTime */
u_int32_t reachable; /* Reachable Time */
u_int32_t retrans; /* Retrans Timer */
u_int32_t flags; /* Flags */
int recalctm; /* BaseReachable re-calculation timer */
u_int8_t chlim; /* CurHopLimit */
u_int8_t initialized; /* Flag to see if the entry is initialized */
/* the following 3 members are for privacy extension for addrconf */
u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */
u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */
u_int8_t randomid[8]; /* current random ID */
};
#define ND6_IFF_PERFORMNUD 0x1
#define ND6_IFF_ACCEPT_RTADV 0x2
#define ND6_IFF_PREFER_SOURCE 0x4 /* Not used in FreeBSD. */
#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to
* DAD failure. (XXX: not ND-specific)
*/
#define ND6_IFF_DONT_SET_IFROUTE 0x10
#define ND6_IFF_AUTO_LINKLOCAL 0x20
#define ND6_IFF_NO_RADR 0x40
#define ND6_IFF_NO_PREFER_IFACE 0x80 /* XXX: not related to ND. */
#define ND6_IFF_NO_DAD 0x100
#ifdef _KERNEL
#define ND_IFINFO(ifp) \
(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo)
#define IN6_LINKMTU(ifp) \
((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \
? ND_IFINFO(ifp)->linkmtu \
: ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \
? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu))
#endif
struct in6_nbrinfo {
char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */
struct in6_addr addr; /* IPv6 address of the neighbor */
long asked; /* number of queries already sent for this addr */
int isrouter; /* if it acts as a router */
int state; /* reachability state */
int expire; /* lifetime for NDP state transition */
};
#define DRLSTSIZ 10
#define PRLSTSIZ 10
struct in6_drlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr rtaddr;
u_char flags;
u_short rtlifetime;
u_long expire;
u_short if_index;
} defrouter[DRLSTSIZ];
};
struct in6_defrouter {
struct sockaddr_in6 rtaddr;
u_char flags;
u_short rtlifetime;
u_long expire;
u_short if_index;
};
#ifdef _KERNEL
struct in6_oprlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_long vltime;
u_long pltime;
u_long expire;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
} prefix[PRLSTSIZ];
};
#endif
struct in6_prlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_int32_t vltime;
u_int32_t pltime;
time_t expire;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
} prefix[PRLSTSIZ];
};
struct in6_prefix {
struct sockaddr_in6 prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_int32_t vltime;
u_int32_t pltime;
time_t expire;
u_int32_t flags;
int refcnt;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
/* struct sockaddr_in6 advrtr[] */
};
#ifdef _KERNEL
struct in6_ondireq {
char ifname[IFNAMSIZ];
struct {
u_int32_t linkmtu; /* LinkMTU */
u_int32_t maxmtu; /* Upper bound of LinkMTU */
u_int32_t basereachable; /* BaseReachableTime */
u_int32_t reachable; /* Reachable Time */
u_int32_t retrans; /* Retrans Timer */
u_int32_t flags; /* Flags */
int recalctm; /* BaseReachable re-calculation timer */
u_int8_t chlim; /* CurHopLimit */
u_int8_t receivedra;
} ndi;
};
#endif
struct in6_ndireq {
char ifname[IFNAMSIZ];
struct nd_ifinfo ndi;
};
struct in6_ndifreq {
char ifname[IFNAMSIZ];
u_long ifindex;
};
/* Prefix status */
#define NDPRF_ONLINK 0x1
#define NDPRF_DETACHED 0x2
/* protocol constants */
#define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */
#define RTR_SOLICITATION_INTERVAL 4 /* 4sec */
#define MAX_RTR_SOLICITATIONS 3
#define ND6_INFINITE_LIFETIME 0xffffffff
#ifdef _KERNEL
/* node constants */
#define MAX_REACHABLE_TIME 3600000 /* msec */
#define REACHABLE_TIME 30000 /* msec */
#define RETRANS_TIMER 1000 /* msec */
#define MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */
#define MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */
#define DEF_TEMP_VALID_LIFETIME 604800 /* 1 week */
#define DEF_TEMP_PREFERRED_LIFETIME 86400 /* 1 day */
#define TEMPADDR_REGEN_ADVANCE 5 /* sec */
#define MAX_TEMP_DESYNC_FACTOR 600 /* 10 min */
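/*
* Compute a randomized reachable time, in seconds, from a base value in
* milliseconds; the result is approximately base * [0.5, 1.5), per the
* MIN/MAX_RANDOM_FACTOR recommendation of RFC 4861.
*/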
#define ND_COMPUTE_RTIME(x) \
(((MIN_RANDOM_FACTOR * (x >> 10)) + (arc4random() & \
((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * (x >> 10)))) /1000)
TAILQ_HEAD(nd_drhead, nd_defrouter);
struct nd_defrouter {
TAILQ_ENTRY(nd_defrouter) dr_entry;
struct in6_addr rtaddr;
u_char raflags; /* flags on RA message */
u_short rtlifetime;
u_long expire;
struct ifnet *ifp;
int installed; /* is installed into kernel routing table */
u_int refcnt;
};
struct nd_prefixctl {
struct ifnet *ndpr_ifp;
/* prefix */
struct sockaddr_in6 ndpr_prefix;
u_char ndpr_plen;
u_int32_t ndpr_vltime; /* advertised valid lifetime */
u_int32_t ndpr_pltime; /* advertised preferred lifetime */
struct prf_ra ndpr_flags;
};
LIST_HEAD(nd_prhead, nd_prefix);
struct nd_prefix {
struct ifnet *ndpr_ifp;
LIST_ENTRY(nd_prefix) ndpr_entry;
struct sockaddr_in6 ndpr_prefix; /* prefix */
struct in6_addr ndpr_mask; /* netmask derived from the prefix */
u_int32_t ndpr_vltime; /* advertised valid lifetime */
u_int32_t ndpr_pltime; /* advertised preferred lifetime */
time_t ndpr_expire; /* expiration time of the prefix */
time_t ndpr_preferred; /* preferred time of the prefix */
time_t ndpr_lastupdate; /* reception time of last advertisement */
struct prf_ra ndpr_flags;
u_int32_t ndpr_stateflags; /* actual state flags */
/* list of routers that advertise the prefix: */
LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs;
u_char ndpr_plen;
int ndpr_addrcnt; /* count of derived addresses */
volatile u_int ndpr_refcnt;
};
#define ndpr_raf ndpr_flags
#define ndpr_raf_onlink ndpr_flags.onlink
#define ndpr_raf_auto ndpr_flags.autonomous
#define ndpr_raf_router ndpr_flags.router
/*
* Message format for use in obtaining information about prefixes
* from inet6 sysctl function
*/
struct inet6_ndpr_msghdr {
u_short inpm_msglen; /* to skip over non-understood messages */
u_char inpm_version; /* future binary compatibility */
u_char inpm_type; /* message type */
struct in6_addr inpm_prefix;
u_long prm_vltim;
u_long prm_pltime;
u_long prm_expire;
u_long prm_preferred;
struct in6_prflags prm_flags;
u_short prm_index; /* index for associated ifp */
u_char prm_plen; /* length of prefix in bits */
};
#define prm_raf_onlink prm_flags.prf_ra.onlink
#define prm_raf_auto prm_flags.prf_ra.autonomous
#define prm_statef_onlink prm_flags.prf_state.onlink
#define prm_rrf_decrvalid prm_flags.prf_rr.decrvalid
#define prm_rrf_decrprefd prm_flags.prf_rr.decrprefd
struct nd_pfxrouter {
LIST_ENTRY(nd_pfxrouter) pfr_entry;
struct nd_defrouter *router;
};
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_IP6NDP);
#endif
/* nd6.c */
VNET_DECLARE(int, nd6_prune);
VNET_DECLARE(int, nd6_delay);
VNET_DECLARE(int, nd6_umaxtries);
VNET_DECLARE(int, nd6_mmaxtries);
VNET_DECLARE(int, nd6_useloopback);
VNET_DECLARE(int, nd6_maxnudhint);
VNET_DECLARE(int, nd6_gctimer);
VNET_DECLARE(struct nd_drhead, nd_defrouter);
VNET_DECLARE(struct nd_prhead, nd_prefix);
VNET_DECLARE(int, nd6_debug);
VNET_DECLARE(int, nd6_onlink_ns_rfc4861);
#define V_nd6_prune VNET(nd6_prune)
#define V_nd6_delay VNET(nd6_delay)
#define V_nd6_umaxtries VNET(nd6_umaxtries)
#define V_nd6_mmaxtries VNET(nd6_mmaxtries)
#define V_nd6_useloopback VNET(nd6_useloopback)
#define V_nd6_maxnudhint VNET(nd6_maxnudhint)
#define V_nd6_gctimer VNET(nd6_gctimer)
#define V_nd_defrouter VNET(nd_defrouter)
#define V_nd_prefix VNET(nd_prefix)
#define V_nd6_debug VNET(nd6_debug)
#define V_nd6_onlink_ns_rfc4861 VNET(nd6_onlink_ns_rfc4861)
/* Lock for the prefix and default router lists. */
VNET_DECLARE(struct rwlock, nd6_lock);
VNET_DECLARE(uint64_t, nd6_list_genid);
#define V_nd6_lock VNET(nd6_lock)
#define V_nd6_list_genid VNET(nd6_list_genid)
#define ND6_RLOCK() rw_rlock(&V_nd6_lock)
#define ND6_RUNLOCK() rw_runlock(&V_nd6_lock)
#define ND6_WLOCK() rw_wlock(&V_nd6_lock)
#define ND6_WUNLOCK() rw_wunlock(&V_nd6_lock)
#define ND6_TRY_UPGRADE() rw_try_upgrade(&V_nd6_lock)
#define ND6_WLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_WLOCKED)
#define ND6_RLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_RLOCKED)
#define ND6_LOCK_ASSERT() rw_assert(&V_nd6_lock, RA_LOCKED)
#define ND6_UNLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_UNLOCKED)
/* Mutex for prefix onlink/offlink transitions. */
VNET_DECLARE(struct mtx, nd6_onlink_mtx);
#define V_nd6_onlink_mtx VNET(nd6_onlink_mtx)
#define ND6_ONLINK_LOCK() mtx_lock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_TRYLOCK() mtx_trylock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_UNLOCK() mtx_unlock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_LOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_OWNED)
#define ND6_ONLINK_UNLOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_NOTOWNED)
#define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0)
/* nd6_rtr.c */
VNET_DECLARE(int, nd6_defifindex);
VNET_DECLARE(int, ip6_desync_factor); /* seconds */
VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */
VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */
VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */
#define V_nd6_defifindex VNET(nd6_defifindex)
#define V_ip6_desync_factor VNET(ip6_desync_factor)
#define V_ip6_temp_preferred_lifetime VNET(ip6_temp_preferred_lifetime)
#define V_ip6_temp_valid_lifetime VNET(ip6_temp_valid_lifetime)
#define V_ip6_temp_regen_advance VNET(ip6_temp_regen_advance)
union nd_opts {
struct nd_opt_hdr *nd_opt_array[16]; /* max = ND_OPT_NONCE */
struct {
struct nd_opt_hdr *zero;
struct nd_opt_hdr *src_lladdr;
struct nd_opt_hdr *tgt_lladdr;
struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */
struct nd_opt_rd_hdr *rh;
struct nd_opt_mtu *mtu;
struct nd_opt_hdr *__res6;
struct nd_opt_hdr *__res7;
struct nd_opt_hdr *__res8;
struct nd_opt_hdr *__res9;
struct nd_opt_hdr *__res10;
struct nd_opt_hdr *__res11;
struct nd_opt_hdr *__res12;
struct nd_opt_hdr *__res13;
struct nd_opt_nonce *nonce;
struct nd_opt_hdr *__res15;
struct nd_opt_hdr *search; /* multiple opts */
struct nd_opt_hdr *last; /* multiple opts */
int done;
struct nd_opt_prefix_info *pi_end;/* multiple opts, end */
} nd_opt_each;
};
#define nd_opts_src_lladdr nd_opt_each.src_lladdr
#define nd_opts_tgt_lladdr nd_opt_each.tgt_lladdr
#define nd_opts_pi nd_opt_each.pi_beg
#define nd_opts_pi_end nd_opt_each.pi_end
#define nd_opts_rh nd_opt_each.rh
#define nd_opts_mtu nd_opt_each.mtu
#define nd_opts_nonce nd_opt_each.nonce
#define nd_opts_search nd_opt_each.search
#define nd_opts_last nd_opt_each.last
#define nd_opts_done nd_opt_each.done
/* XXX: need nd6_var.h?? */
/* nd6.c */
void nd6_init(void);
#ifdef VIMAGE
void nd6_destroy(void);
#endif
struct nd_ifinfo *nd6_ifattach(struct ifnet *);
void nd6_ifdetach(struct ifnet *, struct nd_ifinfo *);
int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *);
void nd6_option_init(void *, int, union nd_opts *);
struct nd_opt_hdr *nd6_option(union nd_opts *);
int nd6_options(union nd_opts *);
struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *);
struct llentry *nd6_alloc(const struct in6_addr *, int, struct ifnet *);
void nd6_setmtu(struct ifnet *);
void nd6_llinfo_setstate(struct llentry *lle, int newstate);
void nd6_timer(void *);
void nd6_purge(struct ifnet *);
int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
char *desten, uint32_t *pflags);
int nd6_resolve(struct ifnet *, int, struct mbuf *,
const struct sockaddr *, u_char *, uint32_t *, struct llentry **);
int nd6_ioctl(u_long, caddr_t, struct ifnet *);
void nd6_cache_lladdr(struct ifnet *, struct in6_addr *,
char *, int, int, int);
void nd6_grab_holdchain(struct llentry *, struct mbuf **,
struct sockaddr_in6 *);
-int nd6_flush_holdchain(struct ifnet *, struct ifnet *, struct mbuf *,
+int nd6_flush_holdchain(struct ifnet *, struct mbuf *,
struct sockaddr_in6 *);
int nd6_add_ifa_lle(struct in6_ifaddr *);
void nd6_rem_ifa_lle(struct in6_ifaddr *, int);
int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *,
struct sockaddr_in6 *, struct route *);
/* nd6_nbr.c */
void nd6_na_input(struct mbuf *, int, int);
void nd6_na_output(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, u_long, int, struct sockaddr *);
void nd6_ns_input(struct mbuf *, int, int);
void nd6_ns_output(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, const struct in6_addr *, uint8_t *);
caddr_t nd6_ifptomac(struct ifnet *);
void nd6_dad_init(void);
void nd6_dad_start(struct ifaddr *, int);
void nd6_dad_stop(struct ifaddr *);
/* nd6_rtr.c */
void nd6_rs_input(struct mbuf *, int, int);
void nd6_ra_input(struct mbuf *, int, int);
void defrouter_reset(void);
void defrouter_select_fib(int fibnum);
void defrouter_select(void);
void defrouter_ref(struct nd_defrouter *);
void defrouter_rele(struct nd_defrouter *);
bool defrouter_remove(struct in6_addr *, struct ifnet *);
void defrouter_unlink(struct nd_defrouter *, struct nd_drhead *);
void defrouter_del(struct nd_defrouter *);
int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *,
struct nd_prefix **);
void nd6_prefix_unlink(struct nd_prefix *, struct nd_prhead *);
void nd6_prefix_del(struct nd_prefix *);
void nd6_prefix_ref(struct nd_prefix *);
void nd6_prefix_rele(struct nd_prefix *);
int nd6_prefix_onlink(struct nd_prefix *);
int nd6_prefix_offlink(struct nd_prefix *);
void pfxlist_onlink_check(void);
struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *);
struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *);
struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *);
void rt6_flush(struct in6_addr *, struct ifnet *);
int nd6_setdefaultiface(int);
int in6_tmpifadd(const struct in6_ifaddr *, int, int);
#endif /* _KERNEL */
#endif /* _NETINET6_ND6_H_ */
Index: head/sys/netinet6/nd6_nbr.c
===================================================================
--- head/sys/netinet6/nd6_nbr.c (revision 327172)
+++ head/sys/netinet6/nd6_nbr.c (revision 327173)
@@ -1,1553 +1,1547 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/refcount.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/if_var.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <net/if_llatbl.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_carp.h>
#include <netinet6/send.h>
#define SDL(s) ((struct sockaddr_dl *)s)
struct dadq;
static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *);
static void nd6_dad_add(struct dadq *dp);
static void nd6_dad_del(struct dadq *dp);
static void nd6_dad_rele(struct dadq *);
static void nd6_dad_starttimer(struct dadq *, int, int);
static void nd6_dad_stoptimer(struct dadq *);
static void nd6_dad_timer(struct dadq *);
static void nd6_dad_duplicated(struct ifaddr *, struct dadq *);
static void nd6_dad_ns_output(struct dadq *);
static void nd6_dad_ns_input(struct ifaddr *, struct nd_opt_nonce *);
static void nd6_dad_na_input(struct ifaddr *);
static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, u_long, int, struct sockaddr *, u_int);
static void nd6_ns_output_fib(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, const struct in6_addr *, uint8_t *, u_int);
static VNET_DEFINE(int, dad_enhanced) = 1;
#define V_dad_enhanced VNET(dad_enhanced)
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(dad_enhanced), 0,
"Enable Enhanced DAD, which adds a random nonce to NS messages for DAD.");
static VNET_DEFINE(int, dad_maxtry) = 15; /* max # of *tries* to
transmit DAD packet */
#define V_dad_maxtry VNET(dad_maxtry)
/*
* Input a Neighbor Solicitation Message.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*/
void
nd6_ns_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_solicit *nd_ns;
struct in6_addr saddr6 = ip6->ip6_src;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
struct in6_addr myaddr6;
char *lladdr = NULL;
struct ifaddr *ifa = NULL;
int lladdrlen = 0;
int anycast = 0, proxy = 0, tentative = 0;
int tlladdr;
int rflag;
union nd_opts ndopts;
struct sockaddr_dl proxydl;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* RFC 6980: Nodes MUST silently ignore fragments */
if(m->m_flags & M_FRAGMENTED)
goto freeit;
rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0;
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif)
rflag = 0;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len);
if (nd_ns == NULL) {
ICMP6STAT_INC(icp6s_tooshort);
return;
}
#endif
ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */
taddr6 = nd_ns->nd_ns_target;
if (in6_setscope(&taddr6, ifp, NULL) != 0)
goto bad;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
/* dst has to be a solicited node multicast address. */
if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL &&
/* don't check ifindex portion */
daddr6.s6_addr32[1] == 0 &&
daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE &&
daddr6.s6_addr8[12] == 0xff) {
; /* good */
} else {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(wrong ip6 dst)\n"));
goto bad;
}
} else if (!V_nd6_onlink_ns_rfc4861) {
struct sockaddr_in6 src_sa6;
/*
* According to recent IETF discussions, it is not a good idea
* to accept an NS from an address that would not be deemed
* to be a neighbor otherwise. This point is expected to be
* clarified in future revisions of the specification.
*/
bzero(&src_sa6, sizeof(src_sa6));
src_sa6.sin6_family = AF_INET6;
src_sa6.sin6_len = sizeof(src_sa6);
src_sa6.sin6_addr = saddr6;
if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) {
nd6log((LOG_INFO, "nd6_ns_input: "
"NS packet from non-neighbor\n"));
goto bad;
}
}
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
goto bad;
}
icmp6len -= sizeof(*nd_ns);
nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_ns_input: invalid ND option, ignored\n"));
/* nd6_options() has incremented the stats */
goto freeit;
}
if (ndopts.nd_opts_src_lladdr) {
lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
}
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(link-layer address option)\n"));
goto bad;
}
/*
* Attaching target link-layer address to the NA?
* (RFC 2461 7.2.4)
*
* NS IP dst is unicast/anycast MUST NOT add
* NS IP dst is solicited-node multicast MUST add
*
* In this implementation, we add the target link-layer address by default
* and omit it in the MUST NOT cases.
*/
if (!IN6_IS_ADDR_MULTICAST(&daddr6))
tlladdr = 0;
else
tlladdr = 1;
/*
* Target address (taddr6) must be either:
* (1) Valid unicast/anycast address for my receiving interface,
* (2) Unicast address for which I'm offering proxy service, or
* (3) "tentative" address on which DAD is being performed.
*/
/* (1) and (3) check. */
if (ifp->if_carp)
ifa = (*carp_iamatch6_p)(ifp, &taddr6);
else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/* (2) check. */
if (ifa == NULL) {
struct sockaddr_dl rt_gateway;
struct rt_addrinfo info;
struct sockaddr_in6 dst6;
bzero(&dst6, sizeof(dst6));
dst6.sin6_len = sizeof(struct sockaddr_in6);
dst6.sin6_family = AF_INET6;
dst6.sin6_addr = taddr6;
bzero(&rt_gateway, sizeof(rt_gateway));
rt_gateway.sdl_len = sizeof(rt_gateway);
bzero(&info, sizeof(info));
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway;
if (rib_lookup_info(ifp->if_fib, (struct sockaddr *)&dst6,
0, 0, &info) == 0) {
if ((info.rti_flags & RTF_ANNOUNCE) != 0 &&
rt_gateway.sdl_family == AF_LINK) {
/*
* proxy NDP for single entry
*/
proxydl = *SDL(&rt_gateway);
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(
ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if (ifa)
proxy = 1;
}
}
}
if (ifa == NULL) {
/*
* We've got an NS packet, and we don't have that address
* assigned to us. We MUST silently ignore it.
* See RFC2461 7.2.3.
*/
goto freeit;
}
myaddr6 = *IFA_IN6(ifa);
anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST;
tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED)
goto freeit;
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s "
"(if %d, NS packet %d)\n",
ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &saddr6)));
goto freeit;
}
/*
* We have a neighbor solicitation packet whose target address equals
* one of my tentative addresses.
*
* src addr how to process?
* --- ---
* multicast of course, invalid (rejected in ip6_input)
* unicast somebody is doing address resolution -> ignore
* unspec dup address detection
*
* The processing is defined in RFC 2462.
*/
if (tentative) {
/*
* If source address is unspecified address, it is for
* duplicate address detection.
*
* If not, the packet is for address resolution;
* silently ignore it.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
nd6_dad_ns_input(ifa, ndopts.nd_opts_nonce);
goto freeit;
}
/*
* If the source address is the unspecified address, entries must not
* be created or updated.
* It looks like the sender is performing DAD. Output an NA toward the
* all-nodes multicast address to tell the sender that I'm using the
* address.
* The S bit ("solicited") must be zero.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
struct in6_addr in6_all;
in6_all = in6addr_linklocal_allnodes;
if (in6_setscope(&in6_all, ifp, NULL) != 0)
goto bad;
nd6_na_output_fib(ifp, &in6_all, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL,
M_GETFIB(m));
goto freeit;
}
nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
ND_NEIGHBOR_SOLICIT, 0);
nd6_na_output_fib(ifp, &saddr6, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
rflag | ND_NA_FLAG_SOLICITED, tlladdr,
proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m));
freeit:
if (ifa != NULL)
ifa_free(ifa);
m_freem(m);
return;
bad:
nd6log((LOG_ERR, "nd6_ns_input: src=%s\n",
ip6_sprintf(ip6bufs, &saddr6)));
nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n",
ip6_sprintf(ip6bufs, &daddr6)));
nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n",
ip6_sprintf(ip6bufs, &taddr6)));
ICMP6STAT_INC(icp6s_badns);
if (ifa != NULL)
ifa_free(ifa);
m_freem(m);
}
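The solicited-node multicast destination that nd6_ns_input() verifies for DAD packets (and that the output path below constructs) follows the ff02::1:ffXX:XXXX pattern. A minimal standalone userland sketch of that construction, not part of the kernel sources and using only the portable s6_addr accessor:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/*
 * Build the solicited-node multicast address (ff02::1:ffXX:XXXX) for a
 * target address, keeping only the low 24 bits of the target.
 */
static struct in6_addr
solicited_node(const struct in6_addr *target)
{
	struct in6_addr dst;

	memset(&dst, 0, sizeof(dst));
	dst.s6_addr[0] = 0xff;			/* ff02:: link-local scope */
	dst.s6_addr[1] = 0x02;
	dst.s6_addr[11] = 0x01;			/* ::1:ff00:0/104 prefix */
	dst.s6_addr[12] = 0xff;
	dst.s6_addr[13] = target->s6_addr[13];	/* low 24 bits of target */
	dst.s6_addr[14] = target->s6_addr[14];
	dst.s6_addr[15] = target->s6_addr[15];
	return (dst);
}

int
main(void)
{
	struct in6_addr tgt, dst;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "fe80::1234:56ff:fe78:9abc", &tgt);
	dst = solicited_node(&tgt);
	printf("%s\n", inet_ntop(AF_INET6, &dst, buf, sizeof(buf)));
	return (0);
}

The example prints ff02::1:ff78:9abc, which is exactly the destination a DAD probe for that target would use.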
/*
* Output a Neighbor Solicitation Message. Caller specifies:
* - ICMP6 header source IP6 address
* - ND6 header target IP6 address
* - ND6 header source datalink address
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* saddr6 - if non-NULL, source address of the prompting packet (used
* for source address selection)
* nonce - If non-NULL, NS is used for duplicate address detection and
* the value (length is ND_OPT_NONCE_LEN) is used as a random nonce.
*/
static void
nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
const struct in6_addr *daddr6, const struct in6_addr *taddr6,
uint8_t *nonce, u_int fibnum)
{
struct mbuf *m;
struct m_tag *mtag;
struct ip6_hdr *ip6;
struct nd_neighbor_solicit *nd_ns;
struct ip6_moptions im6o;
int icmp6len;
int maxlen;
caddr_t mac;
if (IN6_IS_ADDR_MULTICAST(taddr6))
return;
/* estimate the size of message */
maxlen = sizeof(*ip6) + sizeof(*nd_ns);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
"%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
__func__, max_linkhdr, maxlen, MCLBYTES));
if (max_linkhdr + maxlen > MHLEN)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return;
M_SETFIB(m, fibnum);
if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_ns);
m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
/* fill neighbor solicitation packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6->ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (daddr6)
ip6->ip6_dst = *daddr6;
else {
ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
ip6->ip6_dst.s6_addr16[1] = 0;
ip6->ip6_dst.s6_addr32[1] = 0;
ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE;
ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3];
ip6->ip6_dst.s6_addr8[12] = 0xff;
if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
goto bad;
}
if (nonce == NULL) {
struct ifaddr *ifa = NULL;
/*
* RFC2461 7.2.2:
* "If the source address of the packet prompting the
* solicitation is the same as one of the addresses assigned
* to the outgoing interface, that address SHOULD be placed
* in the IP Source Address of the outgoing solicitation.
* Otherwise, any one of the addresses assigned to the
* interface should be used."
*
* We use the source address for the prompting packet
* (saddr6), if saddr6 belongs to the outgoing interface.
* Otherwise, we perform the source address selection as usual.
*/
if (saddr6 != NULL)
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6);
if (ifa != NULL) {
/* ip6_src set already. */
ip6->ip6_src = *saddr6;
ifa_free(ifa);
} else {
int error;
struct in6_addr dst6, src6;
uint32_t scopeid;
in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid);
error = in6_selectsrc_addr(fibnum, &dst6,
scopeid, ifp, &src6, NULL);
if (error) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "%s: source can't be "
"determined: dst=%s, error=%d\n", __func__,
ip6_sprintf(ip6buf, &dst6),
error));
goto bad;
}
ip6->ip6_src = src6;
}
} else {
/*
* The source address of a DAD packet must always be the IPv6
* unspecified address (::).
* We actually don't have to 0-clear the address (we did it
* above), but we do so here explicitly to make the intention
* clearer.
*/
bzero(&ip6->ip6_src, sizeof(ip6->ip6_src));
}
nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1);
nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
nd_ns->nd_ns_code = 0;
nd_ns->nd_ns_reserved = 0;
nd_ns->nd_ns_target = *taddr6;
in6_clearscope(&nd_ns->nd_ns_target); /* XXX */
/*
* Add source link-layer address option.
*
* spec implementation
* --- ---
* DAD packet MUST NOT do not add the option
* there's no link layer address:
* impossible do not add the option
* there's link layer address:
* Multicast NS MUST add one add the option
* Unicast NS SHOULD add one add the option
*/
if (nonce == NULL && (mac = nd6_ifptomac(ifp))) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
/* Round up to 8-byte alignment. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
}
/*
* Add a Nonce option (RFC 3971) to detect looped back NS messages.
* This behavior is documented as Enhanced Duplicate Address
* Detection in RFC 7527.
* net.inet6.ip6.dad_enhanced=0 disables this.
*/
if (V_dad_enhanced != 0 && nonce != NULL) {
int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
/* 8-byte alignment is required. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_NONCE;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN);
}
ip6->ip6_plen = htons((u_short)icmp6len);
nd_ns->nd_ns_cksum = 0;
nd_ns->nd_ns_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len);
if (send_sendso_input_hook != NULL) {
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
*(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type;
m_tag_prepend(m, mtag);
}
ip6_output(m, NULL, NULL, (nonce != NULL) ? IPV6_UNSPECSRC : 0,
&im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);
ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]);
return;
bad:
m_freem(m);
}
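For reference, the (optlen + 7) & ~7 rounding used above expresses every ND option in units of 8 octets, which is the value stored in nd_opt_len. A minimal userland sketch of that arithmetic; the address sizes are illustrative assumptions, not tied to any particular interface type:

#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <stdio.h>

int
main(void)
{
	int addrlens[] = { 6, 8, 16 };	/* illustrative link-layer sizes */
	size_t i;

	for (i = 0; i < sizeof(addrlens) / sizeof(addrlens[0]); i++) {
		int optlen = sizeof(struct nd_opt_hdr) + addrlens[i];

		optlen = (optlen + 7) & ~7;	/* round up to 8 bytes */
		printf("if_addrlen %d -> option %d bytes, nd_opt_len %d\n",
		    addrlens[i], optlen, optlen >> 3);
	}
	return (0);
}

For a 6-byte address the option fits exactly in 8 bytes (nd_opt_len = 1); larger addresses are padded up to the next 8-byte boundary.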
#ifndef BURN_BRIDGES
void
nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6,
const struct in6_addr *daddr6, const struct in6_addr *taddr6,uint8_t *nonce)
{
nd6_ns_output_fib(ifp, saddr6, daddr6, taddr6, nonce, RT_DEFAULT_FIB);
}
#endif
/*
* Neighbor advertisement input handling.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* the following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*/
void
nd6_na_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_advert *nd_na;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
int flags;
int is_router;
int is_solicited;
int is_override;
char *lladdr = NULL;
int lladdrlen = 0;
int checklink = 0;
struct ifaddr *ifa;
struct llentry *ln = NULL;
union nd_opts ndopts;
struct mbuf *chain = NULL;
struct sockaddr_in6 sin6;
u_char linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
int lladdr_off;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* RFC 6980: Nodes MUST silently ignore fragments */
if(m->m_flags & M_FRAGMENTED)
goto freeit;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len);
if (nd_na == NULL) {
ICMP6STAT_INC(icp6s_tooshort);
return;
}
#endif
flags = nd_na->nd_na_flags_reserved;
is_router = ((flags & ND_NA_FLAG_ROUTER) != 0);
is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0);
is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0);
memset(&sin6, 0, sizeof(sin6));
taddr6 = nd_na->nd_na_target;
if (in6_setscope(&taddr6, ifp, NULL))
goto bad; /* XXX: impossible */
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_ERR,
"nd6_na_input: invalid target address %s\n",
ip6_sprintf(ip6bufs, &taddr6)));
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&daddr6))
if (is_solicited) {
nd6log((LOG_ERR,
"nd6_na_input: a solicited adv is multicasted\n"));
goto bad;
}
icmp6len -= sizeof(*nd_na);
nd6_option_init(nd_na + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_na_input: invalid ND option, ignored\n"));
/* nd6_options() has incremented the stats */
goto freeit;
}
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
}
/*
* This effectively disables the DAD check on a non-master CARP
* address.
*/
if (ifp->if_carp)
ifa = (*carp_iamatch6_p)(ifp, &taddr6);
else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/*
* The target address matches one of my interface addresses.
*
* If my address is tentative, this means that there's somebody
* already using the same address as mine. This indicates DAD failure.
* This is defined in RFC 2462.
*
* Otherwise, process as defined in RFC 2461.
*/
if (ifa
&& (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) {
nd6_dad_na_input(ifa);
ifa_free(ifa);
goto freeit;
}
/* Just for safety, maybe unnecessary. */
if (ifa) {
ifa_free(ifa);
log(LOG_ERR,
"nd6_na_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &taddr6));
goto freeit;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s "
"(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
/*
* If no neighbor cache entry is found, NA SHOULD silently be
* discarded.
*/
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&taddr6, LLE_EXCLUSIVE, ifp);
IF_AFDATA_RUNLOCK(ifp);
if (ln == NULL) {
goto freeit;
}
if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
/*
* If the link layer has an address and no lladdr option came,
* discard the packet.
*/
if (ifp->if_addrlen && lladdr == NULL) {
goto freeit;
}
/*
* Record link-layer address, and update the state.
*/
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off) == 0) {
ln = NULL;
goto freeit;
}
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
if (is_solicited)
nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
else
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
if ((ln->ln_router = is_router) != 0) {
/*
* This means a router's state has changed from
* non-reachable to probably reachable, and might
* affect the status of associated prefixes.
*/
checklink = 1;
}
} else {
int llchange;
/*
* Check if the link-layer address has changed or not.
*/
if (lladdr == NULL)
llchange = 0;
else {
if (ln->la_flags & LLE_VALID) {
if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen))
llchange = 1;
else
llchange = 0;
} else
llchange = 1;
}
/*
* This is VERY complex. Look at it with care.
*
* override solicit lladdr llchange action
* (L: record lladdr)
*
* 0 0 n -- (2c)
* 0 0 y n (2b) L
* 0 0 y y (1) REACHABLE->STALE
* 0 1 n -- (2c) *->REACHABLE
* 0 1 y n (2b) L *->REACHABLE
* 0 1 y y (1) REACHABLE->STALE
* 1 0 n -- (2a)
* 1 0 y n (2a) L
* 1 0 y y (2a) L *->STALE
* 1 1 n -- (2a) *->REACHABLE
* 1 1 y n (2a) L *->REACHABLE
* 1 1 y y (2a) L *->REACHABLE
*/
if (!is_override && (lladdr != NULL && llchange)) { /* (1) */
/*
* If state is REACHABLE, make it STALE.
* no other updates should be done.
*/
if (ln->ln_state == ND6_LLINFO_REACHABLE)
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
goto freeit;
} else if (is_override /* (2a) */
|| (!is_override && (lladdr != NULL && !llchange)) /* (2b) */
|| lladdr == NULL) { /* (2c) */
/*
* Update the link-layer address, if any.
*/
if (lladdr != NULL) {
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
goto freeit;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr,
linkhdrsize, lladdr_off) == 0) {
ln = NULL;
goto freeit;
}
EVENTHANDLER_INVOKE(lle_event, ln,
LLENTRY_RESOLVED);
}
/*
* If solicited, make the state REACHABLE.
* If not solicited and the link-layer address was
* changed, make it STALE.
*/
if (is_solicited)
nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
else {
if (lladdr != NULL && llchange)
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
}
}
if (ln->ln_router && !is_router) {
/*
* The peer dropped the router flag.
* Remove the sender from the Default Router List and
* update the Destination Cache entries.
*/
struct ifnet *nd6_ifp;
nd6_ifp = lltable_get_ifp(ln->lle_tbl);
if (!defrouter_remove(&ln->r_l3addr.addr6, nd6_ifp) &&
(ND_IFINFO(nd6_ifp)->flags &
ND6_IFF_ACCEPT_RTADV) != 0)
/*
* Even if the neighbor is not in the default
* router list, the neighbor may be used as a
* next hop for some destinations (e.g. redirect
* case). So we must call rt6_flush explicitly.
*/
rt6_flush(&ip6->ip6_src, ifp);
}
ln->ln_router = is_router;
}
/* XXX - QL
* Does this matter?
* rt->rt_flags &= ~RTF_REJECT;
*/
ln->la_asked = 0;
if (ln->la_hold != NULL)
nd6_grab_holdchain(ln, &chain, &sin6);
freeit:
if (ln != NULL)
LLE_WUNLOCK(ln);
if (chain != NULL)
- nd6_flush_holdchain(ifp, ifp, chain, &sin6);
+ nd6_flush_holdchain(ifp, chain, &sin6);
if (checklink)
pfxlist_onlink_check();
m_freem(m);
return;
bad:
if (ln != NULL)
LLE_WUNLOCK(ln);
ICMP6STAT_INC(icp6s_badna);
m_freem(m);
}
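The (override, solicited, lladdr, llchange) table in nd6_na_input() above can be condensed into a pure function. The following standalone sketch mirrors that logic for illustration only; the enum names are made up here and are not the kernel's ND6_LLINFO_* constants:

#include <stdbool.h>
#include <stdio.h>

/* Simplified states; names are illustrative, not the kernel's. */
enum na_state { NA_INCOMPLETE, NA_REACHABLE, NA_STALE, NA_UNCHANGED };

/*
 * Return the state an existing entry should move to and whether the
 * advertised link-layer address should be recorded.
 */
static enum na_state
na_next_state(enum na_state cur, bool override, bool solicited,
    bool have_lladdr, bool llchange, bool *record)
{
	*record = false;
	if (!override && have_lladdr && llchange)	/* case (1) */
		return (cur == NA_REACHABLE ? NA_STALE : NA_UNCHANGED);
	if (have_lladdr)				/* cases (2a)/(2b) */
		*record = true;
	if (solicited)
		return (NA_REACHABLE);
	if (have_lladdr && llchange)
		return (NA_STALE);
	return (NA_UNCHANGED);
}

int
main(void)
{
	bool rec;
	enum na_state s;

	/* Unsolicited override NA with a new lladdr: record it, go STALE. */
	s = na_next_state(NA_REACHABLE, true, false, true, true, &rec);
	printf("record=%d state=%d\n", rec, s);
	return (0);
}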
/*
* Neighbor advertisement output handling.
*
* Based on RFC 2461
*
* the following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*
* tlladdr - 1 if the target link-layer address should be included
* sdl0 - sockaddr_dl (= proxy NA) or NULL
*/
static void
nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
const struct in6_addr *taddr6, u_long flags, int tlladdr,
struct sockaddr *sdl0, u_int fibnum)
{
struct mbuf *m;
struct m_tag *mtag;
struct ip6_hdr *ip6;
struct nd_neighbor_advert *nd_na;
struct ip6_moptions im6o;
struct in6_addr daddr6, dst6, src6;
uint32_t scopeid;
int icmp6len, maxlen, error;
caddr_t mac = NULL;
daddr6 = *daddr6_0; /* make a local copy for modification */
/* estimate the size of message */
maxlen = sizeof(*ip6) + sizeof(*nd_na);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
"%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
__func__, max_linkhdr, maxlen, MCLBYTES));
if (max_linkhdr + maxlen > MHLEN)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return;
M_SETFIB(m, fibnum);
if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_na);
m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
/* fill neighbor advertisement packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
/* reply to DAD */
daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
daddr6.s6_addr16[1] = 0;
daddr6.s6_addr32[1] = 0;
daddr6.s6_addr32[2] = 0;
daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
if (in6_setscope(&daddr6, ifp, NULL))
goto bad;
flags &= ~ND_NA_FLAG_SOLICITED;
}
ip6->ip6_dst = daddr6;
/*
* Select a source whose scope is the same as that of the dest.
*/
in6_splitscope(&daddr6, &dst6, &scopeid);
error = in6_selectsrc_addr(fibnum, &dst6,
scopeid, ifp, &src6, NULL);
if (error) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
"determined: dst=%s, error=%d\n",
ip6_sprintf(ip6buf, &daddr6), error));
goto bad;
}
ip6->ip6_src = src6;
nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
nd_na->nd_na_code = 0;
nd_na->nd_na_target = *taddr6;
in6_clearscope(&nd_na->nd_na_target); /* XXX */
/*
* "tlladdr" indicates NS's condition for adding tlladdr or not.
* see nd6_ns_input() for details.
* Basically, if NS packet is sent to unicast/anycast addr,
* target lladdr option SHOULD NOT be included.
*/
if (tlladdr) {
/*
* sdl0 != NULL indicates proxy NA. If we do proxy, use
* lladdr in sdl0. If we are not proxying (sending NA for
* my address) use lladdr configured for the interface.
*/
if (sdl0 == NULL) {
if (ifp->if_carp)
mac = (*carp_macmatch6_p)(ifp, m, taddr6);
if (mac == NULL)
mac = nd6_ifptomac(ifp);
} else if (sdl0->sa_family == AF_LINK) {
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)sdl0;
if (sdl->sdl_alen == ifp->if_addrlen)
mac = LLADDR(sdl);
}
}
if (tlladdr && mac) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
/* Round up to 8-byte alignment. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
} else
flags &= ~ND_NA_FLAG_OVERRIDE;
ip6->ip6_plen = htons((u_short)icmp6len);
nd_na->nd_na_flags_reserved = flags;
nd_na->nd_na_cksum = 0;
nd_na->nd_na_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len);
if (send_sendso_input_hook != NULL) {
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
*(unsigned short *)(mtag + 1) = nd_na->nd_na_type;
m_tag_prepend(m, mtag);
}
ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert);
ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]);
return;
bad:
m_freem(m);
}
#ifndef BURN_BRIDGES
void
nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0,
const struct in6_addr *taddr6, u_long flags, int tlladdr,
struct sockaddr *sdl0)
{
nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0,
RT_DEFAULT_FIB);
}
#endif
caddr_t
nd6_ifptomac(struct ifnet *ifp)
{
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
case IFT_L2VLAN:
case IFT_INFINIBAND:
case IFT_BRIDGE:
case IFT_ISO88025:
return IF_LLADDR(ifp);
default:
return NULL;
}
}
struct dadq {
TAILQ_ENTRY(dadq) dad_list;
struct ifaddr *dad_ifa;
int dad_count; /* max NS to send */
int dad_ns_tcount; /* # of trials to send NS */
int dad_ns_ocount; /* NS sent so far */
int dad_ns_icount;
int dad_na_icount;
int dad_ns_lcount; /* looped back NS */
int dad_loopbackprobe; /* probing state for loopback detection */
struct callout dad_timer_ch;
struct vnet *dad_vnet;
u_int dad_refcnt;
#define ND_OPT_NONCE_LEN32 \
((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t))
uint32_t dad_nonce[ND_OPT_NONCE_LEN32];
};
static VNET_DEFINE(TAILQ_HEAD(, dadq), dadq);
static VNET_DEFINE(struct rwlock, dad_rwlock);
#define V_dadq VNET(dadq)
#define V_dad_rwlock VNET(dad_rwlock)
#define DADQ_RLOCK() rw_rlock(&V_dad_rwlock)
#define DADQ_RUNLOCK() rw_runlock(&V_dad_rwlock)
#define DADQ_WLOCK() rw_wlock(&V_dad_rwlock)
#define DADQ_WUNLOCK() rw_wunlock(&V_dad_rwlock)
static void
nd6_dad_add(struct dadq *dp)
{
DADQ_WLOCK();
TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list);
DADQ_WUNLOCK();
}
static void
nd6_dad_del(struct dadq *dp)
{
DADQ_WLOCK();
TAILQ_REMOVE(&V_dadq, dp, dad_list);
DADQ_WUNLOCK();
nd6_dad_rele(dp);
}
static struct dadq *
nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *n)
{
struct dadq *dp;
DADQ_RLOCK();
TAILQ_FOREACH(dp, &V_dadq, dad_list) {
if (dp->dad_ifa != ifa)
continue;
/*
* Skip if the nonce matches the received one.
* The +2 in the length is required because the type and
* length fields are included in the option header.
*/
if (n != NULL &&
n->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 &&
memcmp(&n->nd_opt_nonce[0], &dp->dad_nonce[0],
ND_OPT_NONCE_LEN) == 0) {
dp->dad_ns_lcount++;
continue;
}
refcount_acquire(&dp->dad_refcnt);
break;
}
DADQ_RUNLOCK();
return (dp);
}
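The nonce comparison above works because the option's length field is expressed in 8-octet units. A tiny sketch of that arithmetic; the 6-byte nonce payload is mirrored as a local define purely for illustration (the kernel's value lives in <netinet/icmp6.h>):

#include <stdio.h>

#define OPT_NONCE_LEN	6	/* illustrative nonce payload size */

int
main(void)
{
	/* Type and length octets precede the nonce, hence the "+2". */
	int optbytes = OPT_NONCE_LEN + 2;

	printf("nonce option: %d bytes -> length field = %d (8-octet units)\n",
	    optbytes, optbytes / 8);
	return (0);
}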
static void
nd6_dad_starttimer(struct dadq *dp, int ticks, int send_ns)
{
if (send_ns != 0)
nd6_dad_ns_output(dp);
callout_reset(&dp->dad_timer_ch, ticks,
(void (*)(void *))nd6_dad_timer, (void *)dp);
}
static void
nd6_dad_stoptimer(struct dadq *dp)
{
callout_drain(&dp->dad_timer_ch);
}
static void
nd6_dad_rele(struct dadq *dp)
{
if (refcount_release(&dp->dad_refcnt)) {
ifa_free(dp->dad_ifa);
free(dp, M_IP6NDP);
}
}
void
nd6_dad_init(void)
{
rw_init(&V_dad_rwlock, "nd6 DAD queue");
TAILQ_INIT(&V_dadq);
}
/*
* Start Duplicate Address Detection (DAD) for the specified interface address.
*/
void
nd6_dad_start(struct ifaddr *ifa, int delay)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct dadq *dp;
char ip6buf[INET6_ADDRSTRLEN];
KASSERT((ia->ia6_flags & IN6_IFF_TENTATIVE) != 0,
("starting DAD on non-tentative address %p", ifa));
/*
* If we don't need DAD, don't do it.
* There are several cases:
* - DAD is disabled globally or on the interface
* - the interface address is anycast
*/
if ((ia->ia6_flags & IN6_IFF_ANYCAST) != 0 ||
V_ip6_dad_count == 0 ||
(ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) != 0) {
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
return;
}
if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 ||
(ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
(ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0)
return;
if ((dp = nd6_dad_find(ifa, NULL)) != NULL) {
/*
* DAD is already in progress. Let the existing entry
* finish it.
*/
nd6_dad_rele(dp);
return;
}
dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT | M_ZERO);
if (dp == NULL) {
log(LOG_ERR, "nd6_dad_start: memory allocation failed for "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
return;
}
callout_init(&dp->dad_timer_ch, 0);
#ifdef VIMAGE
dp->dad_vnet = curvnet;
#endif
nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
/*
* Send NS packet for DAD, ip6_dad_count times.
* Note that we must delay the first transmission, if this is the
* first packet to be sent from the interface after interface
* (re)initialization.
*/
dp->dad_ifa = ifa;
ifa_ref(dp->dad_ifa);
dp->dad_count = V_ip6_dad_count;
dp->dad_ns_icount = dp->dad_na_icount = 0;
dp->dad_ns_ocount = dp->dad_ns_tcount = 0;
dp->dad_ns_lcount = dp->dad_loopbackprobe = 0;
refcount_init(&dp->dad_refcnt, 1);
nd6_dad_add(dp);
nd6_dad_starttimer(dp, delay, 0);
}
/*
* Terminate DAD unconditionally. Used for address removals.
*/
void
nd6_dad_stop(struct ifaddr *ifa)
{
struct dadq *dp;
dp = nd6_dad_find(ifa, NULL);
if (!dp) {
/* DAD wasn't started yet */
return;
}
nd6_dad_stoptimer(dp);
/*
* The DAD queue entry may have been removed by nd6_dad_timer() while
* we were waiting for it to stop, so re-do the lookup.
*/
nd6_dad_rele(dp);
dp = nd6_dad_find(ifa, NULL);
if (dp == NULL)
return;
nd6_dad_del(dp);
nd6_dad_rele(dp);
}
static void
nd6_dad_timer(struct dadq *dp)
{
CURVNET_SET(dp->dad_vnet);
struct ifaddr *ifa = dp->dad_ifa;
struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
char ip6buf[INET6_ADDRSTRLEN];
KASSERT(ia != NULL, ("DAD entry %p with no address", dp));
if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
/* DAD is not needed for an ifdisabled interface. */
log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of "
"ND6_IFF_IFDISABLED.\n", ifp->if_xname);
goto err;
}
if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto err;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto err;
}
/* Stop DAD if the interface is down even after dad_maxtry attempts. */
if ((dp->dad_ns_tcount > V_dad_maxtry) &&
(((ifp->if_flags & IFF_UP) == 0) ||
((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))) {
nd6log((LOG_INFO, "%s: could not run DAD "
"because the interface was down or not running.\n",
if_name(ifa->ifa_ifp)));
goto err;
}
/* Need more checks? */
if (dp->dad_ns_ocount < dp->dad_count) {
/*
* We have more NS to go. Send NS packet for DAD.
*/
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000, 1);
goto done;
} else {
/*
* We have transmitted a sufficient number of DAD packets.
* See what we've got.
*/
if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0)
/* We've seen an NS or NA, which means DAD has failed. */
nd6_dad_duplicated(ifa, dp);
else if (V_dad_enhanced != 0 &&
dp->dad_ns_lcount > 0 &&
dp->dad_ns_lcount > dp->dad_loopbackprobe) {
/*
* Sec. 4.1 in RFC 7527 requires transmission of
* additional probes until the loopback condition
* becomes clear when a looped back probe is detected.
*/
log(LOG_ERR, "%s: a looped back NS message is "
"detected during DAD for %s. "
"Another DAD probes are being sent.\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, IFA_IN6(ifa)));
dp->dad_loopbackprobe = dp->dad_ns_lcount;
/*
* Send an NS immediately and increase dad_count by
* V_nd6_mmaxtries - 1.
*/
dp->dad_count =
dp->dad_ns_ocount + V_nd6_mmaxtries - 1;
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000,
1);
goto done;
} else {
/*
* We are done with DAD. No NA came, no NS came.
* No duplicate address found. Check IFDISABLED flag
* again in case it was changed between the
* beginning of this function and here.
*/
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0)
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
nd6log((LOG_DEBUG,
"%s: DAD complete for %s - no duplicates found\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
if (dp->dad_ns_lcount > 0)
log(LOG_ERR, "%s: DAD completed while "
"a looped back NS message is detected "
"during DAD for %s.\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, IFA_IN6(ifa)));
}
}
err:
nd6_dad_del(dp);
done:
CURVNET_RESTORE();
}
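The end-of-probing branch of nd6_dad_timer() boils down to three outcomes. A hedged standalone sketch of that decision, using plain counters instead of the real struct dadq:

#include <stdio.h>

enum dad_result { DAD_DUPLICATE, DAD_PROBE_AGAIN, DAD_OK };

/*
 * Any received NS/NA means a duplicate; otherwise, with Enhanced DAD,
 * a newly looped-back probe triggers another round of probes.
 */
static enum dad_result
dad_decide(int ns_icount, int na_icount, int dad_enhanced,
    int ns_lcount, int loopbackprobe)
{
	if (ns_icount > 0 || na_icount > 0)
		return (DAD_DUPLICATE);
	if (dad_enhanced != 0 && ns_lcount > 0 && ns_lcount > loopbackprobe)
		return (DAD_PROBE_AGAIN);
	return (DAD_OK);
}

int
main(void)
{
	printf("%d %d %d\n",
	    dad_decide(1, 0, 1, 0, 0),	/* NS seen -> duplicate */
	    dad_decide(0, 0, 1, 2, 0),	/* looped-back NS -> probe again */
	    dad_decide(0, 0, 1, 0, 0));	/* clean -> address usable */
	return (0);
}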
static void
nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct ifnet *ifp;
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: "
"NS in/out/loopback=%d/%d/%d, NA in=%d\n",
if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount,
dp->dad_na_icount);
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
ia->ia6_flags |= IN6_IFF_DUPLICATED;
ifp = ifa->ifa_ifp;
log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n",
if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
log(LOG_ERR, "%s: manual intervention required\n",
if_name(ifp));
/*
* If the address is a link-local address formed from an interface
* identifier based on the hardware address which is supposed to be
* uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
* operation on the interface SHOULD be disabled.
* [RFC 4862, Section 5.4.5]
*/
if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) {
struct in6_addr in6;
/*
* To avoid over-reaction, we only apply this logic when we are
* very sure that hardware addresses are supposed to be unique.
*/
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_ATM:
case IFT_IEEE1394:
case IFT_INFINIBAND:
in6 = ia->ia_addr.sin6_addr;
if (in6_get_hw_ifid(ifp, &in6) == 0 &&
IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) {
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "%s: possible hardware address "
"duplication detected, disable IPv6\n",
if_name(ifp));
}
break;
}
}
}
static void
nd6_dad_ns_output(struct dadq *dp)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)dp->dad_ifa;
struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
int i;
dp->dad_ns_tcount++;
if ((ifp->if_flags & IFF_UP) == 0) {
return;
}
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
return;
}
dp->dad_ns_ocount++;
if (V_dad_enhanced != 0) {
for (i = 0; i < ND_OPT_NONCE_LEN32; i++)
dp->dad_nonce[i] = arc4random();
/*
* XXXHRS: Note that when DupAddrDetectTransmits > 1, multiple
* NS messages with different nonces can be looped back in an
* unexpected order. The current implementation recognizes only
* the latest nonce on the sender side. In practice it should
* work well in almost all cases.
*/
}
nd6_ns_output(ifp, NULL, NULL, &ia->ia_addr.sin6_addr,
(uint8_t *)&dp->dad_nonce[0]);
}
static void
nd6_dad_ns_input(struct ifaddr *ifa, struct nd_opt_nonce *ndopt_nonce)
{
- struct in6_ifaddr *ia;
- struct ifnet *ifp;
- const struct in6_addr *taddr6;
struct dadq *dp;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_ns_input");
- ia = (struct in6_ifaddr *)ifa;
- ifp = ifa->ifa_ifp;
- taddr6 = &ia->ia_addr.sin6_addr;
/* Ignore Nonce option when Enhanced DAD is disabled. */
if (V_dad_enhanced == 0)
ndopt_nonce = NULL;
dp = nd6_dad_find(ifa, ndopt_nonce);
if (dp == NULL)
return;
dp->dad_ns_icount++;
nd6_dad_rele(dp);
}
static void
nd6_dad_na_input(struct ifaddr *ifa)
{
struct dadq *dp;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_na_input");
dp = nd6_dad_find(ifa, NULL);
if (dp != NULL) {
dp->dad_na_icount++;
nd6_dad_rele(dp);
}
}
Index: head/sys/netinet6/raw_ip6.c
===================================================================
--- head/sys/netinet6/raw_ip6.c (revision 327172)
+++ head/sys/netinet6/raw_ip6.c (revision 327173)
@@ -1,909 +1,899 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/raw_ip6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/send.h>
#include <netipsec/ipsec_support.h>
#include <machine/stdarg.h>
#define satosin6(sa) ((struct sockaddr_in6 *)(sa))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
extern u_long rip_sendspace;
extern u_long rip_recvspace;
VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat);
VNET_PCPUSTAT_SYSINIT(rip6stat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(rip6stat);
#endif /* VIMAGE */
/*
* Hooks for multicast routing. They all default to NULL, so leave them
* uninitialized and rely on the BSS being zeroed.
*/
/*
* The socket used to communicate with the multicast routing daemon.
*/
VNET_DEFINE(struct socket *, ip6_mrouter);
/*
* The various mrouter functions.
*/
int (*ip6_mrouter_set)(struct socket *, struct sockopt *);
int (*ip6_mrouter_get)(struct socket *, struct sockopt *);
int (*ip6_mrouter_done)(void);
int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int (*mrt6_ioctl)(u_long, caddr_t);
/*
* Set up generic address and protocol structures for the raw_input
* routine, then pass them along with the mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct ifnet *ifp;
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *in6p;
struct inpcb *last = NULL;
struct mbuf *opts = NULL;
struct sockaddr_in6 fromsa;
RIP6STAT_INC(rip6s_ipackets);
init_sin6(&fromsa, m, 0); /* general init */
ifp = m->m_pkthdr.rcvif;
INP_INFO_RLOCK(&V_ripcbinfo);
LIST_FOREACH(in6p, &V_ripcb, inp_list) {
/* XXX inp locking */
if ((in6p->inp_vflag & INP_IPV6) == 0)
continue;
if (in6p->inp_ip_p &&
in6p->inp_ip_p != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
continue;
if (jailed_without_vnet(in6p->inp_cred)) {
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
prison_check_ip6(in6p->inp_cred,
&ip6->ip6_dst) != 0)
continue;
}
INP_RLOCK(in6p);
if (in6p->in6p_cksum != -1) {
RIP6STAT_INC(rip6s_isum);
if (in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
INP_RUNLOCK(in6p);
RIP6STAT_INC(rip6s_badsum);
continue;
}
}
/*
* If this raw socket has multicast state, and we
* have received a multicast, check if this socket
* should receive it, as multicast filtering is now
* the responsibility of the transport layer.
*/
if (in6p->in6p_moptions &&
IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If the incoming datagram is for MLD, allow it
* through unconditionally to the raw socket.
*
* Use the M_RTALERT_MLD flag to check for MLD
* traffic without having to inspect the mbuf chain
* more deeply, as all MLDv1/v2 host messages MUST
* contain the Router Alert option.
*
* In the case of MLDv1, we may not have explicitly
* joined the group, and may have set IFF_ALLMULTI
* on the interface. im6o_mc_filter() may discard
* control traffic we actually need to see.
*
* Userland multicast routing daemons should continue to
* filter the control traffic appropriately.
*/
int blocked;
blocked = MCAST_PASS;
if ((m->m_flags & M_RTALERT_MLD) == 0) {
struct sockaddr_in6 mcaddr;
bzero(&mcaddr, sizeof(struct sockaddr_in6));
mcaddr.sin6_len = sizeof(struct sockaddr_in6);
mcaddr.sin6_family = AF_INET6;
mcaddr.sin6_addr = ip6->ip6_dst;
blocked = im6o_mc_filter(in6p->in6p_moptions,
ifp,
(struct sockaddr *)&mcaddr,
(struct sockaddr *)&fromsa);
}
if (blocked != MCAST_PASS) {
IP6STAT_INC(ip6s_notmember);
INP_RUNLOCK(in6p);
continue;
}
}
if (last != NULL) {
struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Check AH/ESP integrity.
*/
if (IPSEC_ENABLED(ipv6)) {
if (n != NULL &&
IPSEC_CHECK_POLICY(ipv6, n, last) != 0) {
m_freem(n);
/* Do not inject data into pcb. */
n = NULL;
}
}
#endif /* IPSEC */
if (n) {
if (last->inp_flags & INP_CONTROLOPTS ||
last->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, n, &opts);
/* strip intermediate headers */
m_adj(n, *offp);
if (sbappendaddr(&last->inp_socket->so_rcv,
(struct sockaddr *)&fromsa,
n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
} else
sorwakeup(last->inp_socket);
opts = NULL;
}
INP_RUNLOCK(last);
}
last = in6p;
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Check AH/ESP integrity.
*/
if (IPSEC_ENABLED(ipv6) && last != NULL &&
IPSEC_CHECK_POLICY(ipv6, m, last) != 0) {
m_freem(m);
IP6STAT_DEC(ip6s_delivered);
/* Do not inject data into pcb. */
INP_RUNLOCK(last);
} else
#endif /* IPSEC */
if (last != NULL) {
if (last->inp_flags & INP_CONTROLOPTS ||
last->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, m, &opts);
/* Strip intermediate headers. */
m_adj(m, *offp);
if (sbappendaddr(&last->inp_socket->so_rcv,
(struct sockaddr *)&fromsa, m, opts) == 0) {
m_freem(m);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
} else
sorwakeup(last->inp_socket);
INP_RUNLOCK(last);
} else {
RIP6STAT_INC(rip6s_nosock);
if (m->m_flags & M_MCAST)
RIP6STAT_INC(rip6s_nosockmcast);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxtp - mtod(m, char *));
}
IP6STAT_DEC(ip6s_delivered);
}
return (IPPROTO_DONE);
}
void
rip6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
- struct ip6_hdr *ip6;
- struct mbuf *m;
- int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/*
* If the parameter is from icmp6, decode it.
*/
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
- m = ip6cp->ip6c_m;
- ip6 = ip6cp->ip6c_ip6;
- off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
- m = NULL;
- ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
(void) in6_pcbnotify(&V_ripcbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
/*
* Generate the IPv6 header and pass the packet to ip6_output(). Tack on
* any options the user may have set up with a control call.
*/
int
rip6_output(struct mbuf *m, struct socket *so, ...)
{
struct mbuf *control;
struct m_tag *mtag;
struct sockaddr_in6 *dstsock;
- struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *in6p;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp;
struct ifnet *oifp = NULL;
int type = 0, code = 0; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
int use_defzone = 0;
int hlim = 0;
struct in6_addr in6a;
va_list ap;
va_start(ap, so);
dstsock = va_arg(ap, struct sockaddr_in6 *);
control = va_arg(ap, struct mbuf *);
va_end(ap);
in6p = sotoinpcb(so);
INP_WLOCK(in6p);
- dst = &dstsock->sin6_addr;
if (control != NULL) {
if ((error = ip6_setpktopts(control, &opt,
in6p->in6p_outputopts, so->so_cred,
so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p->in6p_outputopts;
/*
* Check and convert scope zone ID into internal form.
*
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (!optp || !optp->ip6po_pktinfo ||
!optp->ip6po_pktinfo->ipi6_ifindex)
use_defzone = V_ip6_use_defzone;
if (dstsock->sin6_scope_id == 0 && !use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code to update
* statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
}
M_PREPEND(m, sizeof(*ip6), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Source address selection.
*/
error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred,
scope_ambiguous, &in6a, &hlim);
if (error)
goto bad;
error = prison_check_ip6(in6p->inp_cred, &in6a);
if (error != 0)
goto bad;
ip6->ip6_src = in6a;
ip6->ip6_dst = dstsock->sin6_addr;
/*
* Fill in the rest of the IPv6 header fields.
*/
ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
(in6p->inp_flow & IPV6_FLOWINFO_MASK);
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
(IPV6_VERSION & IPV6_VERSION_MASK);
/*
* ip6_plen will be filled in by ip6_output(), so we do not fill it in here.
*/
ip6->ip6_nxt = in6p->inp_ip_p;
ip6->ip6_hlim = hlim;
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p->in6p_cksum != -1) {
struct mbuf *n;
int off;
u_int16_t *p;
/* Compute checksum. */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p->in6p_cksum;
if (plen < off + 1) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
n = m;
while (n && n->m_len <= off) {
off -= n->m_len;
n = n->m_next;
}
if (!n)
goto bad;
p = (u_int16_t *)(mtod(n, caddr_t) + off);
*p = 0;
*p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
}
/*
* Send RA/RS messages to userland for protection before sending
* them to rtadvd/rtsol.
*/
if ((send_sendso_input_hook != NULL) &&
so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
switch (type) {
case ND_ROUTER_ADVERT:
case ND_ROUTER_SOLICIT:
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
m_tag_prepend(m, mtag);
}
}
error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (oifp)
icmp6_ifoutstat_inc(oifp, type, code);
ICMP6STAT_INC(icp6s_outhist[type]);
} else
RIP6STAT_INC(rip6s_opackets);
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control != NULL) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
INP_WUNLOCK(in6p);
return (error);
}
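For ICMPv6 sockets the checksum offset used above is fixed inside the ICMPv6 header, while other raw protocols take it from the IPV6_CHECKSUM socket option (in6p_cksum). A one-line userland check of that offset, offered only as a sketch:

#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	/* The checksum sits 2 bytes into the ICMPv6 header. */
	printf("ICMPv6 checksum offset: %zu\n",
	    offsetof(struct icmp6_hdr, icmp6_cksum));
	return (0);
}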
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp;
int error;
if (sopt->sopt_level == IPPROTO_ICMPV6)
/*
* XXX: is it better to call icmp6_ctloutput() directly
* from protosw?
*/
return (icmp6_ctloutput(so, sopt));
else if (sopt->sopt_level != IPPROTO_IPV6) {
if (sopt->sopt_level == SOL_SOCKET &&
sopt->sopt_name == SO_SETFIB) {
inp = sotoinpcb(so);
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
INP_WUNLOCK(inp);
return (0);
}
return (EINVAL);
}
error = 0;
switch (sopt->sopt_dir) {
case SOPT_GET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
case SOPT_SET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
}
return (error);
}
static int
rip6_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
struct icmp6_filter *filter;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("rip6_attach: inp != NULL"));
error = priv_check(td, PRIV_NETINET_RAW);
if (error)
return (error);
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
if (filter == NULL)
return (ENOMEM);
INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_ripcbinfo);
free(filter, M_PCB);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_ripcbinfo);
inp->inp_vflag |= INP_IPV6;
inp->inp_ip_p = (long)proto;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1;
inp->in6p_icmp6filt = filter;
ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt);
INP_WUNLOCK(inp);
return (0);
}
static void
rip6_detach(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_detach: inp == NULL"));
if (so == V_ip6_mrouter && ip6_mrouter_done)
ip6_mrouter_done();
/* xxx: RSVP */
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
free(inp->in6p_icmp6filt, M_PCB);
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
/* XXXRW: This can't ever be called. */
static void
rip6_abort(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_abort: inp == NULL"));
soisdisconnected(so);
}
static void
rip6_close(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_close: inp == NULL"));
soisdisconnected(so);
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL"));
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
inp->in6p_faddr = in6addr_any;
rip6_abort(so);
return (0);
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ifa = NULL;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_bind: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0)
return (error);
if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6)
return (EADDRNOTAVAIL);
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL)
return (EADDRNOTAVAIL);
if (ifa != NULL &&
((struct in6_ifaddr *)ifa)->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
ifa_free(ifa);
return (EADDRNOTAVAIL);
}
if (ifa != NULL)
ifa_free(ifa);
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
inp->in6p_laddr = addr->sin6_addr;
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr in6a;
int error = 0, scope_ambiguous = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_connect: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (TAILQ_EMPTY(&V_ifnet))
return (EADDRNOTAVAIL);
if (addr->sin6_family != AF_INET6)
return (EAFNOSUPPORT);
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled.  Unfortunately, some
* applications do not behave as they should, so we need a
* workaround.  Even if an appropriate ID is not determined here,
* we will see if we can determine the outgoing interface; if we
* can, the zone ID is determined based on that interface below.
*/
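/*
 * For example, for a link-local destination such as fe80::1 the
 * application is expected to supply the outgoing interface index
 * in sin6_scope_id; sa6_embedscope() then embeds that zone ID
 * into the address in the kernel-internal (KAME-style) form.
 */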
if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
/* Source address selection. XXX: need pcblookup? */
error = in6_selectsrc_socket(addr, inp->in6p_outputopts,
inp, so->so_cred, scope_ambiguous, &in6a, NULL);
if (error) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (error);
}
inp->in6p_faddr = addr->sin6_addr;
inp->in6p_laddr = in6a;
soisconnected(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return (0);
}
static int
rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int ret;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_send: inp == NULL"));
/* Always copy sockaddr to avoid overwrites. */
/* Unlocked read. */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
m_freem(m);
return (EISCONN);
}
/* XXX */
bzero(&tmp, sizeof(tmp));
tmp.sin6_family = AF_INET6;
tmp.sin6_len = sizeof(struct sockaddr_in6);
INP_RLOCK(inp);
bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
sizeof(struct in6_addr));
INP_RUNLOCK(inp);
dst = &tmp;
} else {
if (nam == NULL) {
m_freem(m);
return (ENOTCONN);
}
if (nam->sa_len != sizeof(struct sockaddr_in6)) {
m_freem(m);
return (EINVAL);
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family == AF_UNSPEC) {
/*
* XXX: we allow this case for backward
* compatibility to buggy applications that
* rely on old (and wrong) kernel behavior.
*/
log(LOG_INFO, "rip6 SEND: address family is "
"unspec. Assume AF_INET6\n");
dst->sin6_family = AF_INET6;
} else if (dst->sin6_family != AF_INET6) {
m_freem(m);
return(EAFNOSUPPORT);
}
}
ret = rip6_output(m, so, dst, control);
return (ret);
}
struct pr_usrreqs rip6_usrreqs = {
.pru_abort = rip6_abort,
.pru_attach = rip6_attach,
.pru_bind = rip6_bind,
.pru_connect = rip6_connect,
.pru_control = in6_control,
.pru_detach = rip6_detach,
.pru_disconnect = rip6_disconnect,
.pru_peeraddr = in6_getpeeraddr,
.pru_send = rip6_send,
.pru_shutdown = rip6_shutdown,
.pru_sockaddr = in6_getsockaddr,
.pru_close = rip6_close,
};
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c (revision 327172)
+++ head/sys/netinet6/udp6_usrreq.c (revision 327173)
@@ -1,1322 +1,1320 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* Copyright (c) 2014 Kevin Lo
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
* $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udplite.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/udp6_var.h>
#include <netinet6/scope6_var.h>
#include <netipsec/ipsec_support.h>
#include <security/mac/mac_framework.h>
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct protosw inetsw[];
static void udp6_detach(struct socket *so);
static int
udp6_append(struct inpcb *inp, struct mbuf *n, int off,
struct sockaddr_in6 *fromsa)
{
struct socket *so;
struct mbuf *opts = NULL, *tmp_opts;
struct udpcb *up;
INP_LOCK_ASSERT(inp);
/*
* Engage the tunneling protocol.
*/
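/*
 * If a tunneling callback is registered, hand the datagram to it
 * without holding the inpcb lock: take a reference, drop the read
 * lock around the callback, then re-lock and let
 * in_pcbrele_rlocked() report whether the pcb was freed in the
 * meantime.
 */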
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
in_pcbref(inp);
INP_RUNLOCK(inp);
(*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0],
up->u_tun_ctx);
INP_RLOCK(inp);
return (in_pcbrele_rlocked(inp));
}
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/* Check AH/ESP integrity. */
if (IPSEC_ENABLED(ipv6)) {
if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) {
m_freem(n);
return (0);
}
}
#endif /* IPSEC */
#ifdef MAC
if (mac_inpcb_check_deliver(inp, n) != 0) {
m_freem(n);
return (0);
}
#endif
opts = NULL;
if (inp->inp_flags & INP_CONTROLOPTS ||
inp->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(inp, n, &opts);
if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
tmp_opts = sbcreatecontrol((caddr_t)&fromsa[1],
sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR, IPPROTO_IPV6);
if (tmp_opts) {
if (opts) {
tmp_opts->m_next = opts;
opts = tmp_opts;
} else
opts = tmp_opts;
}
}
m_adj(n, off + sizeof(struct udphdr));
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_rcv);
if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n,
opts) == 0) {
SOCKBUF_UNLOCK(&so->so_rcv);
m_freem(n);
if (opts)
m_freem(opts);
UDPSTAT_INC(udps_fullsock);
} else
sorwakeup_locked(so);
return (0);
}
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ifnet *ifp;
struct ip6_hdr *ip6;
struct udphdr *uh;
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct udpcb *up;
int off = *offp;
int cscov_partial;
int plen, ulen;
struct sockaddr_in6 fromsa[2];
struct m_tag *fwd_tag;
uint16_t uh_sum;
uint8_t nxt;
ifp = m->m_pkthdr.rcvif;
ip6 = mtod(m, struct ip6_hdr *);
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh));
if (!uh)
return (IPPROTO_DONE);
#endif
UDPSTAT_INC(udps_ipackets);
/*
* A destination port of 0 is illegal, per RFC 768.
*/
if (uh->uh_dport == 0)
goto badunlocked;
plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
ulen = ntohs((u_short)uh->uh_ulen);
nxt = proto;
cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0;
if (nxt == IPPROTO_UDPLITE) {
/* Zero means checksum over the complete packet. */
if (ulen == 0)
ulen = plen;
if (ulen == plen)
cscov_partial = 0;
if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) {
/* XXX: What is the right UDPLite MIB counter? */
goto badunlocked;
}
if (uh->uh_sum == 0) {
/* XXX: What is the right UDPLite MIB counter? */
goto badunlocked;
}
} else {
if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) {
UDPSTAT_INC(udps_badlen);
goto badunlocked;
}
if (uh->uh_sum == 0) {
UDPSTAT_INC(udps_nosum);
goto badunlocked;
}
}
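/*
 * Checksum verification: when the interface has already validated
 * the checksum (CSUM_DATA_VALID_IPV6) and full coverage is in
 * effect, finish the computation here, adding the pseudo-header
 * sum if the hardware did not (no CSUM_PSEUDO_HDR); otherwise
 * verify in software over the coverage length.  A nonzero result
 * means the datagram is corrupt.
 */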
if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) &&
!cscov_partial) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
uh_sum = m->m_pkthdr.csum_data;
else
uh_sum = in6_cksum_pseudo(ip6, ulen, nxt,
m->m_pkthdr.csum_data);
uh_sum ^= 0xffff;
} else
uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen);
if (uh_sum != 0) {
UDPSTAT_INC(udps_badsum);
goto badunlocked;
}
/*
* Construct sockaddr format source address.
*/
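/*
 * fromsa[0] carries the packet's source address and port and is
 * what gets delivered to the application; fromsa[1] carries the
 * destination address and port so that udp6_append() can report
 * it via IPV6_ORIGDSTADDR when that option is set.
 */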
init_sin6(&fromsa[0], m, 0);
fromsa[0].sin6_port = uh->uh_sport;
init_sin6(&fromsa[1], m, 1);
fromsa[1].sin6_port = uh->uh_dport;
pcbinfo = udp_get_inpcbinfo(nxt);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct inpcb *last;
struct inpcbhead *pcblist;
struct ip6_moptions *imo;
INP_INFO_RLOCK(pcbinfo);
/*
* In the event that laddr should be set to the link-local
* address (this happens in RIPng), the multicast address
* specified in the received packet will not match laddr. To
* handle this situation, matching is relaxed if the
* receiving interface is the same as one specified in the
* socket and if the destination multicast address matches
* one of the multicast groups specified in the socket.
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf
* here. We need udphdr for IPsec processing so we do that
* later.
*/
pcblist = udp_get_pcblist(nxt);
last = NULL;
LIST_FOREACH(inp, pcblist, inp_list) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (inp->inp_lport != uh->uh_dport)
continue;
if (inp->inp_fport != 0 &&
inp->inp_fport != uh->uh_sport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
&ip6->ip6_dst))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
&ip6->ip6_src) ||
inp->inp_fport != uh->uh_sport)
continue;
}
/*
* XXXRW: Because we weren't holding either the inpcb
* or the hash lock when we checked for a match
* before, we should probably recheck now that the
* inpcb lock is (supposed to be) held.
*/
/*
* Handle socket delivery policy for any-source
* and source-specific multicast. [RFC3678]
*/
imo = inp->in6p_moptions;
if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct sockaddr_in6 mcaddr;
int blocked;
INP_RLOCK(inp);
bzero(&mcaddr, sizeof(struct sockaddr_in6));
mcaddr.sin6_len = sizeof(struct sockaddr_in6);
mcaddr.sin6_family = AF_INET6;
mcaddr.sin6_addr = ip6->ip6_dst;
blocked = im6o_mc_filter(imo, ifp,
(struct sockaddr *)&mcaddr,
(struct sockaddr *)&fromsa[0]);
if (blocked != MCAST_PASS) {
if (blocked == MCAST_NOTGMEMBER)
IP6STAT_INC(ip6s_notmember);
if (blocked == MCAST_NOTSMEMBER ||
blocked == MCAST_MUTED)
UDPSTAT_INC(udps_filtermcast);
INP_RUNLOCK(inp); /* XXX */
continue;
}
INP_RUNLOCK(inp);
}
if (last != NULL) {
struct mbuf *n;
if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
NULL) {
INP_RLOCK(last);
UDP_PROBE(receive, NULL, last, ip6,
last, uh);
if (udp6_append(last, n, off, fromsa))
goto inp_lost;
INP_RUNLOCK(last);
}
}
last = inp;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids
* searching through all pcbs in the common case of a
* non-shared port. It assumes that an application
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
if (last == NULL) {
/*
* No matching pcb found; discard datagram. (No need
* to send an ICMP Port Unreachable for a broadcast
* or multicast datagram.)
*/
UDPSTAT_INC(udps_noport);
UDPSTAT_INC(udps_noportmcast);
goto badheadlocked;
}
INP_RLOCK(last);
INP_INFO_RUNLOCK(pcbinfo);
UDP_PROBE(receive, NULL, last, ip6, last, uh);
if (udp6_append(last, m, off, fromsa) == 0)
INP_RUNLOCK(last);
inp_lost:
return (IPPROTO_DONE);
}
/*
* Locate pcb for datagram.
*/
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
if ((m->m_flags & M_IP6_NEXTHOP) &&
(fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
struct sockaddr_in6 *next_hop6;
next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
/*
* Transparently forwarded. Pretend to be the destination.
* Already got one like this?
*/
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m);
if (!inp) {
/*
* It's new. Try to find the ambushing socket.
* Because we've rewritten the destination address,
* any hardware-generated hash is ignored.
*/
inp = in6_pcblookup(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &next_hop6->sin6_addr,
next_hop6->sin6_port ? htons(next_hop6->sin6_port) :
uh->uh_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
}
/* Remove the tag from the packet. We don't need it anymore. */
m_tag_delete(m, fwd_tag);
m->m_flags &= ~M_IP6_NEXTHOP;
} else
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
m->m_pkthdr.rcvif, m);
if (inp == NULL) {
if (udp_log_in_vain) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_INFO,
"Connection attempt to UDP [%s]:%d from [%s]:%d\n",
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ntohs(uh->uh_dport),
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ntohs(uh->uh_sport));
}
UDPSTAT_INC(udps_noport);
if (m->m_flags & M_MCAST) {
printf("UDP6: M_MCAST is set in a unicast packet.\n");
UDPSTAT_INC(udps_noportmcast);
goto badunlocked;
}
if (V_udp_blackhole)
goto badunlocked;
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
return (IPPROTO_DONE);
}
INP_RLOCK_ASSERT(inp);
up = intoudpcb(inp);
if (cscov_partial) {
if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) {
INP_RUNLOCK(inp);
m_freem(m);
return (IPPROTO_DONE);
}
}
UDP_PROBE(receive, NULL, inp, ip6, inp, uh);
if (udp6_append(inp, m, off, fromsa) == 0)
INP_RUNLOCK(inp);
return (IPPROTO_DONE);
badheadlocked:
INP_INFO_RUNLOCK(pcbinfo);
badunlocked:
if (m)
m_freem(m);
return (IPPROTO_DONE);
}
static void
udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d,
struct inpcbinfo *pcbinfo)
{
struct udphdr uh;
struct ip6_hdr *ip6;
struct mbuf *m;
int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
if (ip6) {
/*
* XXX: We assume that when IPV6 is non-NULL,
* M and OFF are valid.
*/
/* Check if we can safely examine src and dst ports. */
if (m->m_pkthdr.len < off + sizeof(*uhp))
return;
bzero(&uh, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh);
if (!PRC_IS_REDIRECT(cmd)) {
/* Check to see if it's tunneled. */
struct inpcb *inp;
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst,
uh.uh_dport, &ip6->ip6_src, uh.uh_sport,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
m->m_pkthdr.rcvif, m);
if (inp != NULL) {
struct udpcb *up;
up = intoudpcb(inp);
if (up->u_icmp_func) {
/* Yes it is. */
INP_RUNLOCK(inp);
(*up->u_icmp_func)(cmd, (struct sockaddr *)ip6cp->ip6c_src,
d, up->u_tun_ctx);
return;
} else {
/* No tunneling ICMP handler registered. */
INP_RUNLOCK(inp);
}
}
}
(void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport,
(struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd,
cmdarg, notify);
} else
(void)in6_pcbnotify(pcbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
void
udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo));
}
void
udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo));
}
static int
udp6_getcred(SYSCTL_HANDLER_ARGS)
{
struct xucred xuc;
struct sockaddr_in6 addrs[2];
struct inpcb *inp;
int error;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
if (req->newlen != sizeof(addrs))
return (EINVAL);
if (req->oldlen != sizeof(struct xucred))
return (EINVAL);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
(error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
return (error);
}
inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr,
addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
INP_RLOCK_ASSERT(inp);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_cred, &xuc);
INP_RUNLOCK(inp);
} else
error = ENOENT;
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
static int
udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
struct mbuf *control, struct thread *td)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr *laddr, *faddr, in6a;
struct sockaddr_in6 *sin6 = NULL;
int cscov_partial = 0;
int scope_ambiguous = 0;
u_short fport;
int error = 0;
uint8_t nxt;
uint16_t cscov = 0;
struct ip6_pktopts *optp, opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
int flags;
struct sockaddr_in6 tmp;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
if (addr6) {
/* addr6 has been validated in udp6_send(). */
sin6 = (struct sockaddr_in6 *)addr6;
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled.  Unfortunately, some
* applications do not behave as they should, so we need a
* workaround.  Even if an appropriate ID is not determined here,
* we will see if we can determine the outgoing interface; if we
* can, the zone ID is determined based on that interface below.
*/
if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return (error);
}
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
inp->in6p_outputopts, td->td_ucred, nxt)) != 0)
goto release;
optp = &opt;
} else
optp = inp->in6p_outputopts;
if (sin6) {
faddr = &sin6->sin6_addr;
/*
* Since we saw no essential reason for calling in_pcbconnect(),
* we got rid of that logic and instead call in6_selectsrc()
* and in6_pcbsetport() to fill in the local address and the
* local port.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, a situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
!IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
/*
* When the remote address is an IPv4-mapped address,
* the local address must not be a plain IPv6 address,
* since there is no way to map an IPv6 source
* address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
error = in6_selectsrc_socket(sin6, optp, inp,
td->td_ucred, scope_ambiguous, &in6a, NULL);
if (error)
goto release;
laddr = &in6a;
} else
laddr = &inp->in6p_laddr; /* XXX */
if (laddr == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->inp_lport == 0 &&
(error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
/* Undo an address bind that may have occurred. */
inp->in6p_laddr = in6addr_any;
goto release;
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = ENOTCONN;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
* connecting to the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &inp->in6p_laddr;
faddr = &inp->in6p_faddr;
fport = inp->inp_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (nxt == IPPROTO_UDPLITE) {
struct udpcb *up;
up = intoudpcb(inp);
cscov = up->u_txcslen;
if (cscov >= plen)
cscov = 0;
udp6->uh_ulen = htons(cscov);
/*
* For UDP-Lite, checksum coverage length of zero means
* the entire UDPLite packet is covered by the checksum.
*/
cscov_partial = (cscov == 0) ? 0 : 1;
} else if (plen <= 0xffff)
udp6->uh_ulen = htons((u_short)plen);
else
udp6->uh_ulen = 0;
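/*
 * Note that uh_ulen is overloaded on the wire: for UDP it carries
 * the datagram length (or 0 when a jumbogram exceeds 65535
 * bytes), while for UDP-Lite it carries the checksum coverage,
 * with 0 meaning the whole packet is covered.
 */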
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons((u_short)plen);
ip6->ip6_nxt = nxt;
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
if (cscov_partial) {
if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
sizeof(struct ip6_hdr), plen, cscov)) == 0)
udp6->uh_sum = 0xffff;
} else {
udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
#ifdef RSS
{
uint32_t hash_val, hash_type;
uint8_t pr;
pr = inp->inp_socket->so_proto->pr_protocol;
/*
* Calculate an appropriate RSS hash for UDP and
* UDP Lite.
*
* The called function will take care of figuring out
* whether a 2-tuple or 4-tuple hash is required based
* on the currently configured scheme.
*
* Later on, connected-socket values should be
* cached in the inpcb and reused, rather than
* recalculated for every packet.
*
* UDP Lite is a different protocol number and will
* likely end up being hashed as a 2-tuple until
* RSS / NICs grow UDP Lite protocol awareness.
*/
if (rss_proto_software_hash_v6(faddr, laddr, fport,
inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
m->m_pkthdr.flowid = hash_val;
M_HASHTYPE_SET(m, hash_type);
}
}
#endif
flags = 0;
#ifdef RSS
/*
* Don't override with the inp cached flowid.
*
* Until the whole UDP path is vetted, it may actually
* be incorrect.
*/
flags |= IP_NODEFAULTFLOWID;
#endif
UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
UDPSTAT_INC(udps_opackets);
error = ip6_output(m, optp, &inp->inp_route6, flags,
inp->in6p_moptions, NULL, inp);
break;
case AF_INET:
error = EAFNOSUPPORT;
goto release;
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
static void
udp6_abort(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(*pru->pru_abort)(so);
return;
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
}
static int
udp6_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
}
INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
if (error) {
INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1; /* just to be sure */
/*
* XXX: ugly!!
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
* which may match an IPv4-mapped IPv6 address.
*/
inp->inp_ip_ttl = V_ip_defttl;
error = udp_newudpcb(inp);
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(pcbinfo);
return (0);
}
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
INP_WLOCK(inp);
INP_HASH_WLOCK(pcbinfo);
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
struct sockaddr_in6 *sin6_p;
sin6_p = (struct sockaddr_in6 *)nam;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr))
inp->inp_vflag |= INP_IPV4;
#ifdef INET
else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6_p);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = in_pcbbind(inp, (struct sockaddr *)&sin,
td->td_ucred);
goto out;
}
#endif
}
error = in6_pcbbind(inp, nam, td->td_ucred);
#ifdef INET
out:
#endif
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
return (error);
}
static void
udp6_close(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(*pru->pru_disconnect)(so);
return;
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct sockaddr_in6 *sin6;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
sin6 = (struct sockaddr_in6 *)nam;
KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
/*
* XXXRW: Need to clarify locking of v4/v6 flags.
*/
INP_WLOCK(inp);
#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
struct sockaddr_in sin;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
if ((inp->inp_vflag & INP_IPV4) == 0) {
error = EAFNOSUPPORT;
goto out;
}
if (inp->inp_faddr.s_addr != INADDR_ANY) {
error = EISCONN;
goto out;
}
in6_sin6_2_sin(&sin, sin6);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = prison_remote_ip4(td->td_ucred, &sin.sin_addr);
if (error != 0)
goto out;
INP_HASH_WLOCK(pcbinfo);
error = in_pcbconnect(inp, (struct sockaddr *)&sin,
td->td_ucred);
INP_HASH_WUNLOCK(pcbinfo);
if (error == 0)
soisconnected(so);
goto out;
} else {
if ((inp->inp_vflag & INP_IPV6) == 0) {
error = EAFNOSUPPORT;
goto out;
}
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = EISCONN;
goto out;
}
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
if (error != 0)
goto out;
INP_HASH_WLOCK(pcbinfo);
error = in6_pcbconnect(inp, nam, td->td_ucred);
INP_HASH_WUNLOCK(pcbinfo);
if (error == 0)
soisconnected(so);
out:
INP_WUNLOCK(inp);
return (error);
}
static void
udp6_detach(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct udpcb *up;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
static int
udp6_disconnect(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
- int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(void)(*pru->pru_disconnect)(so);
return (0);
}
#endif
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
- error = ENOTCONN;
- goto out;
+ INP_WUNLOCK(inp);
+ return (ENOTCONN);
}
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
SOCK_UNLOCK(so);
-out:
INP_WUNLOCK(inp);
return (0);
}
static int
udp6_send(struct socket *so, int flags, struct mbuf *m,
struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error = 0;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
INP_WLOCK(inp);
if (addr) {
if (addr->sa_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
goto bad;
}
if (addr->sa_family != AF_INET6) {
error = EAFNOSUPPORT;
goto bad;
}
}
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
int hasv4addr;
struct sockaddr_in6 *sin6 = NULL;
if (addr == NULL)
hasv4addr = (inp->inp_vflag & INP_IPV4);
else {
sin6 = (struct sockaddr_in6 *)addr;
hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
? 1 : 0;
}
if (hasv4addr) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol ==
IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE;
/*
* XXXRW: We release UDP-layer locks before calling
* udp_send() in order to avoid recursion. However,
* this does mean there is a short window where inp's
* fields are unstable. Could this lead to a
* potential race in which the factors causing us to
* select the UDPv4 output routine are invalidated?
*/
INP_WUNLOCK(inp);
if (sin6)
in6_sin6_2_sin_in_sock(addr);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
/* addr will just be freed in sendit(). */
return ((*pru->pru_send)(so, flags, m, addr, control,
td));
}
}
#endif
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
INP_HASH_WLOCK(pcbinfo);
error = udp6_output(inp, m, addr, control, td);
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
return (error);
bad:
INP_WUNLOCK(inp);
m_freem(m);
return (error);
}
struct pr_usrreqs udp6_usrreqs = {
.pru_abort = udp6_abort,
.pru_attach = udp6_attach,
.pru_bind = udp6_bind,
.pru_connect = udp6_connect,
.pru_control = in6_control,
.pru_detach = udp6_detach,
.pru_disconnect = udp6_disconnect,
.pru_peeraddr = in6_mapped_peeraddr,
.pru_send = udp6_send,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_soreceive = soreceive_dgram,
.pru_sosend = sosend_dgram,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp6_close
};
Index: head/sys/netipsec/key.c
===================================================================
--- head/sys/netipsec/key.c (revision 327172)
+++ head/sys/netipsec/key.c (revision 327173)
@@ -1,8454 +1,8449 @@
/* $FreeBSD$ */
/* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code refers to RFC 2367.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fnv_hash.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/syslog.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/udp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#include <net/pfkeyv2.h>
#include <netipsec/keydb.h>
#include <netipsec/key.h>
#include <netipsec/keysock.h>
#include <netipsec/key_debug.h>
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/xform.h>
#include <machine/in_cksum.h>
#include <machine/stdarg.h>
/* randomness */
#include <sys/random.h>
#define FULLMASK 0xff
#define _BITS(bytes) ((bytes) << 3)
/*
* Note on SA reference counting:
* - SAs that are not in DEAD state will have (total external references + 1)
*   as the value of the reference count field.  They cannot be freed and are
*   referenced from the SA header.
* - SAs that are in DEAD state will have (total external references)
*   in the reference count field.  They are ready to be freed.  The reference
*   from the SA header is removed in key_delsav(), when the reference count
*   field hits 0 (i.e., no external reference other than from the SA header).
*/
VNET_DEFINE(u_int32_t, key_debug_level) = 0;
static VNET_DEFINE(u_int, key_spi_trycnt) = 1000;
static VNET_DEFINE(u_int32_t, key_spi_minval) = 0x100;
static VNET_DEFINE(u_int32_t, key_spi_maxval) = 0x0fffffff; /* XXX */
static VNET_DEFINE(u_int32_t, policy_id) = 0;
/* interval to initialize randseed, 1(m) */
static VNET_DEFINE(u_int, key_int_random) = 60;
/* interval to expire acquiring, 30(s)*/
static VNET_DEFINE(u_int, key_larval_lifetime) = 30;
/* counter for blocking SADB_ACQUIRE.*/
static VNET_DEFINE(int, key_blockacq_count) = 10;
/* lifetime for blocking SADB_ACQUIRE.*/
static VNET_DEFINE(int, key_blockacq_lifetime) = 20;
/* preferred old sa rather than new sa.*/
static VNET_DEFINE(int, key_preferred_oldsa) = 1;
#define V_key_spi_trycnt VNET(key_spi_trycnt)
#define V_key_spi_minval VNET(key_spi_minval)
#define V_key_spi_maxval VNET(key_spi_maxval)
#define V_policy_id VNET(policy_id)
#define V_key_int_random VNET(key_int_random)
#define V_key_larval_lifetime VNET(key_larval_lifetime)
#define V_key_blockacq_count VNET(key_blockacq_count)
#define V_key_blockacq_lifetime VNET(key_blockacq_lifetime)
#define V_key_preferred_oldsa VNET(key_preferred_oldsa)
static VNET_DEFINE(u_int32_t, acq_seq) = 0;
#define V_acq_seq VNET(acq_seq)
static VNET_DEFINE(uint32_t, sp_genid) = 0;
#define V_sp_genid VNET(sp_genid)
/* SPD */
TAILQ_HEAD(secpolicy_queue, secpolicy);
LIST_HEAD(secpolicy_list, secpolicy);
static VNET_DEFINE(struct secpolicy_queue, sptree[IPSEC_DIR_MAX]);
static VNET_DEFINE(struct secpolicy_queue, sptree_ifnet[IPSEC_DIR_MAX]);
static struct rmlock sptree_lock;
#define V_sptree VNET(sptree)
#define V_sptree_ifnet VNET(sptree_ifnet)
#define SPTREE_LOCK_INIT() rm_init(&sptree_lock, "sptree")
#define SPTREE_LOCK_DESTROY() rm_destroy(&sptree_lock)
#define SPTREE_RLOCK_TRACKER struct rm_priotracker sptree_tracker
#define SPTREE_RLOCK() rm_rlock(&sptree_lock, &sptree_tracker)
#define SPTREE_RUNLOCK() rm_runlock(&sptree_lock, &sptree_tracker)
#define SPTREE_RLOCK_ASSERT() rm_assert(&sptree_lock, RA_RLOCKED)
#define SPTREE_WLOCK() rm_wlock(&sptree_lock)
#define SPTREE_WUNLOCK() rm_wunlock(&sptree_lock)
#define SPTREE_WLOCK_ASSERT() rm_assert(&sptree_lock, RA_WLOCKED)
#define SPTREE_UNLOCK_ASSERT() rm_assert(&sptree_lock, RA_UNLOCKED)
/* Hash table for looking up an SP by its unique id */
static VNET_DEFINE(struct secpolicy_list *, sphashtbl);
static VNET_DEFINE(u_long, sphash_mask);
#define V_sphashtbl VNET(sphashtbl)
#define V_sphash_mask VNET(sphash_mask)
#define SPHASH_NHASH_LOG2 7
#define SPHASH_NHASH (1 << SPHASH_NHASH_LOG2)
#define SPHASH_HASHVAL(id) (key_u32hash(id) & V_sphash_mask)
#define SPHASH_HASH(id) &V_sphashtbl[SPHASH_HASHVAL(id)]
/* SAD */
TAILQ_HEAD(secashead_queue, secashead);
LIST_HEAD(secashead_list, secashead);
static VNET_DEFINE(struct secashead_queue, sahtree);
static struct rmlock sahtree_lock;
#define V_sahtree VNET(sahtree)
#define SAHTREE_LOCK_INIT() rm_init(&sahtree_lock, "sahtree")
#define SAHTREE_LOCK_DESTROY() rm_destroy(&sahtree_lock)
#define SAHTREE_RLOCK_TRACKER struct rm_priotracker sahtree_tracker
#define SAHTREE_RLOCK() rm_rlock(&sahtree_lock, &sahtree_tracker)
#define SAHTREE_RUNLOCK() rm_runlock(&sahtree_lock, &sahtree_tracker)
#define SAHTREE_RLOCK_ASSERT() rm_assert(&sahtree_lock, RA_RLOCKED)
#define SAHTREE_WLOCK() rm_wlock(&sahtree_lock)
#define SAHTREE_WUNLOCK() rm_wunlock(&sahtree_lock)
#define SAHTREE_WLOCK_ASSERT() rm_assert(&sahtree_lock, RA_WLOCKED)
#define SAHTREE_UNLOCK_ASSERT() rm_assert(&sahtree_lock, RA_UNLOCKED)
/* Hash table for lookup in SAD using SA addresses */
static VNET_DEFINE(struct secashead_list *, sahaddrhashtbl);
static VNET_DEFINE(u_long, sahaddrhash_mask);
#define V_sahaddrhashtbl VNET(sahaddrhashtbl)
#define V_sahaddrhash_mask VNET(sahaddrhash_mask)
#define SAHHASH_NHASH_LOG2 7
#define SAHHASH_NHASH (1 << SAHHASH_NHASH_LOG2)
#define SAHADDRHASH_HASHVAL(saidx) \
(key_saidxhash(saidx) & V_sahaddrhash_mask)
#define SAHADDRHASH_HASH(saidx) \
&V_sahaddrhashtbl[SAHADDRHASH_HASHVAL(saidx)]
/* Hash table for lookup in SAD using SPI */
LIST_HEAD(secasvar_list, secasvar);
static VNET_DEFINE(struct secasvar_list *, savhashtbl);
static VNET_DEFINE(u_long, savhash_mask);
#define V_savhashtbl VNET(savhashtbl)
#define V_savhash_mask VNET(savhash_mask)
#define SAVHASH_NHASH_LOG2 7
#define SAVHASH_NHASH (1 << SAVHASH_NHASH_LOG2)
#define SAVHASH_HASHVAL(spi) (key_u32hash(spi) & V_savhash_mask)
#define SAVHASH_HASH(spi) &V_savhashtbl[SAVHASH_HASHVAL(spi)]
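/*
 * The SAD is thus indexed two ways: SAHADDRHASH_HASH() buckets SA
 * heads by their source/destination address pair, while
 * SAVHASH_HASH() buckets individual SAs by SPI.
 */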
static uint32_t
key_saidxhash(const struct secasindex *saidx)
{
uint32_t hval;
hval = fnv_32_buf(&saidx->proto, sizeof(saidx->proto),
FNV1_32_INIT);
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
hval = fnv_32_buf(&saidx->src.sin.sin_addr,
sizeof(in_addr_t), hval);
hval = fnv_32_buf(&saidx->dst.sin.sin_addr,
sizeof(in_addr_t), hval);
break;
#endif
#ifdef INET6
case AF_INET6:
hval = fnv_32_buf(&saidx->src.sin6.sin6_addr,
sizeof(struct in6_addr), hval);
hval = fnv_32_buf(&saidx->dst.sin6.sin6_addr,
sizeof(struct in6_addr), hval);
break;
#endif
default:
hval = 0;
ipseclog((LOG_DEBUG, "%s: unknown address family %d",
__func__, saidx->dst.sa.sa_family));
}
return (hval);
}
static uint32_t
key_u32hash(uint32_t val)
{
return (fnv_32_buf(&val, sizeof(val), FNV1_32_INIT));
}
/* registered list */
static VNET_DEFINE(LIST_HEAD(_regtree, secreg), regtree[SADB_SATYPE_MAX + 1]);
#define V_regtree VNET(regtree)
static struct mtx regtree_lock;
#define REGTREE_LOCK_INIT() \
mtx_init(&regtree_lock, "regtree", "fast ipsec regtree", MTX_DEF)
#define REGTREE_LOCK_DESTROY() mtx_destroy(&regtree_lock)
#define REGTREE_LOCK() mtx_lock(&regtree_lock)
#define REGTREE_UNLOCK() mtx_unlock(&regtree_lock)
#define REGTREE_LOCK_ASSERT() mtx_assert(&regtree_lock, MA_OWNED)
/* Acquiring list */
LIST_HEAD(secacq_list, secacq);
static VNET_DEFINE(struct secacq_list, acqtree);
#define V_acqtree VNET(acqtree)
static struct mtx acq_lock;
#define ACQ_LOCK_INIT() \
mtx_init(&acq_lock, "acqtree", "ipsec SA acquiring list", MTX_DEF)
#define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock)
#define ACQ_LOCK() mtx_lock(&acq_lock)
#define ACQ_UNLOCK() mtx_unlock(&acq_lock)
#define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED)
/* Hash table for lookup in ACQ list using SA addresses */
static VNET_DEFINE(struct secacq_list *, acqaddrhashtbl);
static VNET_DEFINE(u_long, acqaddrhash_mask);
#define V_acqaddrhashtbl VNET(acqaddrhashtbl)
#define V_acqaddrhash_mask VNET(acqaddrhash_mask)
/* Hash table for lookup in ACQ list using SEQ number */
static VNET_DEFINE(struct secacq_list *, acqseqhashtbl);
static VNET_DEFINE(u_long, acqseqhash_mask);
#define V_acqseqhashtbl VNET(acqseqhashtbl)
#define V_acqseqhash_mask VNET(acqseqhash_mask)
#define ACQHASH_NHASH_LOG2 7
#define ACQHASH_NHASH (1 << ACQHASH_NHASH_LOG2)
#define ACQADDRHASH_HASHVAL(saidx) \
(key_saidxhash(saidx) & V_acqaddrhash_mask)
#define ACQSEQHASH_HASHVAL(seq) \
(key_u32hash(seq) & V_acqseqhash_mask)
#define ACQADDRHASH_HASH(saidx) \
&V_acqaddrhashtbl[ACQADDRHASH_HASHVAL(saidx)]
#define ACQSEQHASH_HASH(seq) \
&V_acqseqhashtbl[ACQSEQHASH_HASHVAL(seq)]
/* SP acquiring list */
static VNET_DEFINE(LIST_HEAD(_spacqtree, secspacq), spacqtree);
#define V_spacqtree VNET(spacqtree)
static struct mtx spacq_lock;
#define SPACQ_LOCK_INIT() \
mtx_init(&spacq_lock, "spacqtree", \
"fast ipsec security policy acquire list", MTX_DEF)
#define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock)
#define SPACQ_LOCK() mtx_lock(&spacq_lock)
#define SPACQ_UNLOCK() mtx_unlock(&spacq_lock)
#define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED)
static const int minsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_PROXY */
sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */
sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */
sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */
sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAI */
sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAR */
sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
sizeof(struct sadb_x_sa_replay), /* SADB_X_EXT_SA_REPLAY */
sizeof(struct sadb_address), /* SADB_X_EXT_NEW_ADDRESS_SRC */
sizeof(struct sadb_address), /* SADB_X_EXT_NEW_ADDRESS_DST */
};
_Static_assert(sizeof(minsize)/sizeof(int) == SADB_EXT_MAX + 1, "minsize size mismatch");
static const int maxsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
0, /* SADB_EXT_ADDRESS_SRC */
0, /* SADB_EXT_ADDRESS_DST */
0, /* SADB_EXT_ADDRESS_PROXY */
0, /* SADB_EXT_KEY_AUTH */
0, /* SADB_EXT_KEY_ENCRYPT */
0, /* SADB_EXT_IDENTITY_SRC */
0, /* SADB_EXT_IDENTITY_DST */
0, /* SADB_EXT_SENSITIVITY */
0, /* SADB_EXT_PROPOSAL */
0, /* SADB_EXT_SUPPORTED_AUTH */
0, /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
0, /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
0, /* SADB_X_EXT_NAT_T_OAI */
0, /* SADB_X_EXT_NAT_T_OAR */
sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
sizeof(struct sadb_x_sa_replay), /* SADB_X_EXT_SA_REPLAY */
0, /* SADB_X_EXT_NEW_ADDRESS_SRC */
0, /* SADB_X_EXT_NEW_ADDRESS_DST */
};
_Static_assert(sizeof(maxsize)/sizeof(int) == SADB_EXT_MAX + 1, "maxsize size mismatch");
/*
* Internal values for SA flags:
* SADB_X_EXT_F_CLONED means that SA was cloned by key_updateaddresses,
* thus we will not free most of the SA contents in key_delsav().
*/
#define SADB_X_EXT_F_CLONED 0x80000000
#define SADB_CHECKLEN(_mhp, _ext) \
((_mhp)->extlen[(_ext)] < minsize[(_ext)] || (maxsize[(_ext)] != 0 && \
((_mhp)->extlen[(_ext)] > maxsize[(_ext)])))
#define SADB_CHECKHDR(_mhp, _ext) ((_mhp)->ext[(_ext)] == NULL)
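/*
 * SADB_CHECKHDR() evaluates to true when the given extension is
 * missing from a parsed PF_KEY message, and SADB_CHECKLEN() to
 * true when the extension's length is below minsize[] or, for
 * fixed-size extensions (nonzero maxsize[]), above maxsize[].
 */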
static VNET_DEFINE(int, ipsec_esp_keymin) = 256;
static VNET_DEFINE(int, ipsec_esp_auth) = 0;
static VNET_DEFINE(int, ipsec_ah_keymin) = 128;
#define V_ipsec_esp_keymin VNET(ipsec_esp_keymin)
#define V_ipsec_esp_auth VNET(ipsec_esp_auth)
#define V_ipsec_ah_keymin VNET(ipsec_ah_keymin)
#ifdef IPSEC_DEBUG
VNET_DEFINE(int, ipsec_debug) = 1;
#else
VNET_DEFINE(int, ipsec_debug) = 0;
#endif
#ifdef INET
SYSCTL_DECL(_net_inet_ipsec);
SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
"Enable IPsec debugging output when set.");
#endif
#ifdef INET6
SYSCTL_DECL(_net_inet6_ipsec6);
SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
"Enable IPsec debugging output when set.");
#endif
SYSCTL_DECL(_net_key);
SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_debug_level), 0, "");
/* max number of tries when choosing an SPI value */
SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_trycnt), 0, "");
/* minimum spi value to allocate automatically. */
SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_minval), 0, "");
/* maximum spi value to allocate automatically. */
SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_maxval), 0, "");
/* interval to initialize randseed */
SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_int_random), 0, "");
/* lifetime for larval SA */
SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_larval_lifetime), 0, "");
/* counter for blocking to send SADB_ACQUIRE to IKEd */
SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_count), 0, "");
/* lifetime for blocking to send SADB_ACQUIRE to IKEd */
SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, "");
/* ESP auth */
SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth), 0, "");
/* minimum ESP key length */
SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin), 0, "");
/* minimum AH key length */
SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin), 0, "");
/* prefer an old SA rather than a new SA */
SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa), 0, "");
#define __LIST_CHAINED(elm) \
(!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL))
MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association");
MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head");
MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy");
MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request");
MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous");
MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire");
MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire");
static VNET_DEFINE(uma_zone_t, key_lft_zone);
#define V_key_lft_zone VNET(key_lft_zone)
static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER();
static struct mtx xforms_lock;
#define XFORMS_LOCK_INIT() \
mtx_init(&xforms_lock, "xforms_list", "IPsec transforms list", MTX_DEF)
#define XFORMS_LOCK_DESTROY() mtx_destroy(&xforms_lock)
#define XFORMS_LOCK() mtx_lock(&xforms_lock)
#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock)
/*
* Set parameters into a secpolicyindex buffer.
* The caller must allocate the secpolicyindex buffer passed to this macro.
*/
#define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \
do { \
bzero((idx), sizeof(struct secpolicyindex)); \
(idx)->dir = (_dir); \
(idx)->prefs = (ps); \
(idx)->prefd = (pd); \
(idx)->ul_proto = (ulp); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
} while (0)
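/*
* Example (see key_spdadd()): build a secpolicyindex on the stack from the
* SADB address extensions of a message:
* struct secpolicyindex spidx;
* KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1,
*     src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen,
*     src0->sadb_address_proto, &spidx);
*/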
/*
* Set parameters into a secasindex buffer.
* The caller must allocate the secasindex buffer before using this macro.
*/
#define KEY_SETSECASIDX(p, m, r, s, d, idx) \
do { \
bzero((idx), sizeof(struct secasindex)); \
(idx)->proto = (p); \
(idx)->mode = (m); \
(idx)->reqid = (r); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
key_porttosaddr(&(idx)->src.sa, 0); \
key_porttosaddr(&(idx)->dst.sa, 0); \
} while (0)
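/*
* Example (see key_allocsa_tunnel()): build a secasindex for a tunnel-mode
* lookup from a pair of addresses:
* struct secasindex saidx;
* KEY_SETSECASIDX(proto, IPSEC_MODE_TUNNEL, 0, &src->sa, &dst->sa, &saidx);
*/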
/* key statistics */
struct _keystat {
u_long getspi_count; /* the average number of attempts to get a new SPI */
} keystat;
struct sadb_msghdr {
struct sadb_msg *msg;
struct sadb_ext *ext[SADB_EXT_MAX + 1];
int extoff[SADB_EXT_MAX + 1];
int extlen[SADB_EXT_MAX + 1];
};
static struct supported_ealgs {
int sadb_alg;
const struct enc_xform *xform;
} supported_ealgs[] = {
{ SADB_EALG_DESCBC, &enc_xform_des },
{ SADB_EALG_3DESCBC, &enc_xform_3des },
{ SADB_X_EALG_AES, &enc_xform_rijndael128 },
{ SADB_X_EALG_BLOWFISHCBC, &enc_xform_blf },
{ SADB_X_EALG_CAST128CBC, &enc_xform_cast5 },
{ SADB_EALG_NULL, &enc_xform_null },
{ SADB_X_EALG_CAMELLIACBC, &enc_xform_camellia },
{ SADB_X_EALG_AESCTR, &enc_xform_aes_icm },
{ SADB_X_EALG_AESGCM16, &enc_xform_aes_nist_gcm },
{ SADB_X_EALG_AESGMAC, &enc_xform_aes_nist_gmac },
};
static struct supported_aalgs {
int sadb_alg;
const struct auth_hash *xform;
} supported_aalgs[] = {
{ SADB_X_AALG_NULL, &auth_hash_null },
{ SADB_AALG_MD5HMAC, &auth_hash_hmac_md5 },
{ SADB_AALG_SHA1HMAC, &auth_hash_hmac_sha1 },
{ SADB_X_AALG_RIPEMD160HMAC, &auth_hash_hmac_ripemd_160 },
{ SADB_X_AALG_MD5, &auth_hash_key_md5 },
{ SADB_X_AALG_SHA, &auth_hash_key_sha1 },
{ SADB_X_AALG_SHA2_256, &auth_hash_hmac_sha2_256 },
{ SADB_X_AALG_SHA2_384, &auth_hash_hmac_sha2_384 },
{ SADB_X_AALG_SHA2_512, &auth_hash_hmac_sha2_512 },
{ SADB_X_AALG_AES128GMAC, &auth_hash_nist_gmac_aes_128 },
{ SADB_X_AALG_AES192GMAC, &auth_hash_nist_gmac_aes_192 },
{ SADB_X_AALG_AES256GMAC, &auth_hash_nist_gmac_aes_256 },
};
static struct supported_calgs {
int sadb_alg;
const struct comp_algo *xform;
} supported_calgs[] = {
{ SADB_X_CALG_DEFLATE, &comp_algo_deflate },
};
#ifndef IPSEC_DEBUG2
static struct callout key_timer;
#endif
static void key_unlink(struct secpolicy *);
static struct secpolicy *key_getsp(struct secpolicyindex *);
static struct secpolicy *key_getspbyid(u_int32_t);
static struct mbuf *key_gather_mbuf(struct mbuf *,
const struct sadb_msghdr *, int, int, ...);
static int key_spdadd(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static uint32_t key_getnewspid(void);
static int key_spddelete(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spddelete2(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spdget(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spdflush(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spddump(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static struct mbuf *key_setdumpsp(struct secpolicy *,
u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_sp2mbuf(struct secpolicy *);
static size_t key_getspreqmsglen(struct secpolicy *);
static int key_spdexpire(struct secpolicy *);
static struct secashead *key_newsah(struct secasindex *);
static void key_freesah(struct secashead **);
static void key_delsah(struct secashead *);
static struct secasvar *key_newsav(const struct sadb_msghdr *,
struct secasindex *, uint32_t, int *);
static void key_delsav(struct secasvar *);
static void key_unlinksav(struct secasvar *);
static struct secashead *key_getsah(struct secasindex *);
static int key_checkspidup(uint32_t);
static struct secasvar *key_getsavbyspi(uint32_t);
static int key_setnatt(struct secasvar *, const struct sadb_msghdr *);
static int key_setsaval(struct secasvar *, const struct sadb_msghdr *);
static int key_updatelifetimes(struct secasvar *, const struct sadb_msghdr *);
static int key_updateaddresses(struct socket *, struct mbuf *,
const struct sadb_msghdr *, struct secasvar *, struct secasindex *);
static struct mbuf *key_setdumpsa(struct secasvar *, u_int8_t,
u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_setsadbmsg(u_int8_t, u_int16_t, u_int8_t,
u_int32_t, pid_t, u_int16_t);
static struct mbuf *key_setsadbsa(struct secasvar *);
static struct mbuf *key_setsadbaddr(u_int16_t,
const struct sockaddr *, u_int8_t, u_int16_t);
static struct mbuf *key_setsadbxport(u_int16_t, u_int16_t);
static struct mbuf *key_setsadbxtype(u_int16_t);
static struct mbuf *key_setsadbxsa2(u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_setsadbxsareplay(u_int32_t);
static struct mbuf *key_setsadbxpolicy(u_int16_t, u_int8_t,
u_int32_t, u_int32_t);
static struct seckey *key_dup_keymsg(const struct sadb_key *, size_t,
struct malloc_type *);
static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src,
struct malloc_type *);
/* flags for key_cmpsaidx() */
#define CMP_HEAD 1 /* protocol, addresses. */
#define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */
#define CMP_REQID 3 /* additionally HEAD, reqid. */
#define CMP_EXACTLY 4 /* all elements. */
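/*
* key_cmpsaidx() compares two SA indexes with the strictness selected by one
* of the flags above; e.g. key_allocsa_policy() matches SAHs with
* CMP_MODE_REQID, which takes protocol, addresses, mode and reqid into
* account.
*/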
static int key_cmpsaidx(const struct secasindex *,
const struct secasindex *, int);
static int key_cmpspidx_exactly(struct secpolicyindex *,
struct secpolicyindex *);
static int key_cmpspidx_withmask(struct secpolicyindex *,
struct secpolicyindex *);
static int key_bbcmp(const void *, const void *, u_int);
static uint8_t key_satype2proto(uint8_t);
static uint8_t key_proto2satype(uint8_t);
static int key_getspi(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static uint32_t key_do_getnewspi(struct sadb_spirange *, struct secasindex *);
static int key_update(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_add(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_setident(struct secashead *, const struct sadb_msghdr *);
static struct mbuf *key_getmsgbuf_x1(struct mbuf *,
const struct sadb_msghdr *);
static int key_delete(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_delete_all(struct socket *, struct mbuf *,
const struct sadb_msghdr *, struct secasindex *);
static void key_delete_xform(const struct xformsw *);
static int key_get(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static void key_getcomb_setlifetime(struct sadb_comb *);
static struct mbuf *key_getcomb_ealg(void);
static struct mbuf *key_getcomb_ah(void);
static struct mbuf *key_getcomb_ipcomp(void);
static struct mbuf *key_getprop(const struct secasindex *);
static int key_acquire(const struct secasindex *, struct secpolicy *);
static uint32_t key_newacq(const struct secasindex *, int *);
static uint32_t key_getacq(const struct secasindex *, int *);
static int key_acqdone(const struct secasindex *, uint32_t);
static int key_acqreset(uint32_t);
static struct secspacq *key_newspacq(struct secpolicyindex *);
static struct secspacq *key_getspacq(struct secpolicyindex *);
static int key_acquire2(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_register(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_expire(struct secasvar *, int);
static int key_flush(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_dump(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_promisc(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_senderror(struct socket *, struct mbuf *, int);
static int key_validate_ext(const struct sadb_ext *, int);
static int key_align(struct mbuf *, struct sadb_msghdr *);
static struct mbuf *key_setlifetime(struct seclifetime *, uint16_t);
static struct mbuf *key_setkey(struct seckey *, uint16_t);
static int xform_init(struct secasvar *, u_short);
#define DBG_IPSEC_INITREF(t, p) do { \
refcount_init(&(p)->refcnt, 1); \
KEYDBG(KEY_STAMP, \
printf("%s: Initialize refcnt %s(%p) = %u\n", \
__func__, #t, (p), (p)->refcnt)); \
} while (0)
#define DBG_IPSEC_ADDREF(t, p) do { \
refcount_acquire(&(p)->refcnt); \
KEYDBG(KEY_STAMP, \
printf("%s: Acquire refcnt %s(%p) -> %u\n", \
__func__, #t, (p), (p)->refcnt)); \
} while (0)
#define DBG_IPSEC_DELREF(t, p) do { \
KEYDBG(KEY_STAMP, \
printf("%s: Release refcnt %s(%p) -> %u\n", \
__func__, #t, (p), (p)->refcnt - 1)); \
refcount_release(&(p)->refcnt); \
} while (0)
#define IPSEC_INITREF(t, p) refcount_init(&(p)->refcnt, 1)
#define IPSEC_ADDREF(t, p) refcount_acquire(&(p)->refcnt)
#define IPSEC_DELREF(t, p) refcount_release(&(p)->refcnt)
#define SP_INITREF(p) IPSEC_INITREF(SP, p)
#define SP_ADDREF(p) IPSEC_ADDREF(SP, p)
#define SP_DELREF(p) IPSEC_DELREF(SP, p)
#define SAH_INITREF(p) IPSEC_INITREF(SAH, p)
#define SAH_ADDREF(p) IPSEC_ADDREF(SAH, p)
#define SAH_DELREF(p) IPSEC_DELREF(SAH, p)
#define SAV_INITREF(p) IPSEC_INITREF(SAV, p)
#define SAV_ADDREF(p) IPSEC_ADDREF(SAV, p)
#define SAV_DELREF(p) IPSEC_DELREF(SAV, p)
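/*
* Reference counting protocol: *_ADDREF() acquires an additional reference,
* *_DELREF() drops one and returns non-zero only when the last reference is
* released. E.g. key_freesav() below:
* if (SAV_DELREF(sav) == 0)
*         return;
* key_delsav(sav);
*/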
/*
* Update the refcnt while holding the SPTREE lock.
*/
void
key_addref(struct secpolicy *sp)
{
SP_ADDREF(sp);
}
/*
* Return 0 when there are known to be no SPs for the specified
* direction. Otherwise return 1. This is used by IPsec code
* to optimize performance.
*/
int
key_havesp(u_int dir)
{
return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ?
TAILQ_FIRST(&V_sptree[dir]) != NULL : 1);
}
/* %%% IPsec policy management */
/*
* Return current SPDB generation.
*/
uint32_t
key_getspgen(void)
{
return (V_sp_genid);
}
void
key_bumpspgen(void)
{
V_sp_genid++;
}
static int
key_checksockaddrs(struct sockaddr *src, struct sockaddr *dst)
{
/* family match */
if (src->sa_family != dst->sa_family)
return (EINVAL);
/* sa_len match */
if (src->sa_len != dst->sa_len)
return (EINVAL);
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
#endif
#ifdef INET6
case AF_INET6:
if (src->sa_len != sizeof(struct sockaddr_in6))
return (EINVAL);
break;
#endif
default:
return (EAFNOSUPPORT);
}
return (0);
}
/*
* Allocate an SP for an OUTBOUND or INBOUND packet.
* Must call key_freesp() later.
* OUT: NULL: not found
* others: found and return the pointer.
*/
struct secpolicy *
key_allocsp(struct secpolicyindex *spidx, u_int dir)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
SPTREE_RLOCK();
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
if (key_cmpspidx_withmask(&sp->spidx, spidx)) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
if (sp != NULL) { /* found a SPD entry */
sp->lastused = time_second;
KEYDBG(IPSEC_STAMP,
printf("%s: return SP(%p)\n", __func__, sp));
KEYDBG(IPSEC_DATA, kdebug_secpolicy(sp));
} else {
KEYDBG(IPSEC_DATA,
printf("%s: lookup failed for ", __func__);
kdebug_secpolicyindex(spidx, NULL));
}
return (sp);
}
/*
* Allocate an SA entry for an *INBOUND* or *OUTBOUND* TCP packet that is,
* or should be, signed with the TCP-MD5 signature option.
* We don't use key_allocsa() for such lookups, because we don't know the SPI.
* Unlike the ESP and AH protocols, the SPI isn't transmitted in the TCP header
* of a signed packet. We use the SADB only as storage for the password.
* OUT: positive: corresponding SA for given saidx found.
* NULL: SA not found
*/
struct secasvar *
key_allocsa_tcpmd5(struct secasindex *saidx)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx->proto == IPPROTO_TCP,
("unexpected security protocol %u", saidx->proto));
IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TCPMD5,
("unexpected mode %u", saidx->mode));
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
KEYDBG(IPSEC_DUMP,
printf("%s: checking SAH\n", __func__);
kdebug_secash(sah, " "));
if (sah->saidx.proto != IPPROTO_TCP)
continue;
if (!key_sockaddrcmp(&saidx->dst.sa, &sah->saidx.dst.sa, 0) &&
!key_sockaddrcmp(&saidx->src.sa, &sah->saidx.src.sa, 0))
break;
}
if (sah != NULL) {
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL)
SAV_ADDREF(sav);
} else
sav = NULL;
SAHTREE_RUNLOCK();
if (sav != NULL) {
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
} else {
KEYDBG(IPSEC_STAMP,
printf("%s: SA not found\n", __func__));
KEYDBG(IPSEC_DATA, kdebug_secasindex(saidx, NULL));
}
return (sav);
}
/*
* Allocate an SA entry for an *OUTBOUND* packet.
* OUT: positive: corresponding SA for given saidx found.
* NULL: SA not found, but will be acquired, check *error
* for acquiring status.
*/
struct secasvar *
key_allocsa_policy(struct secpolicy *sp, const struct secasindex *saidx,
int *error)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT ||
saidx->mode == IPSEC_MODE_TUNNEL,
("unexpected policy %u", saidx->mode));
/*
* We check new SA in the IPsec request because a different
* SA may be involved each time this request is checked, either
* because new SAs are being configured, or this request is
* associated with an unconnected datagram socket, or this request
* is associated with a system default policy.
*/
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
KEYDBG(IPSEC_DUMP,
printf("%s: checking SAH\n", __func__);
kdebug_secash(sah, " "));
if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID))
break;
}
if (sah != NULL) {
/*
* Allocate the oldest SA available according to
* draft-jenkins-ipsec-rekeying-03.
*/
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL)
SAV_ADDREF(sav);
} else
sav = NULL;
SAHTREE_RUNLOCK();
if (sav != NULL) {
*error = 0;
KEYDBG(IPSEC_STAMP,
printf("%s: chosen SA(%p) for SP(%p)\n", __func__,
sav, sp));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
return (sav); /* return referenced SA */
}
/* there is no SA */
*error = key_acquire(saidx, sp);
if ((*error) != 0)
ipseclog((LOG_DEBUG,
"%s: error %d returned from key_acquire()\n",
__func__, *error));
KEYDBG(IPSEC_STAMP,
printf("%s: acquire SA for SP(%p), error %d\n",
__func__, sp, *error));
KEYDBG(IPSEC_DATA, kdebug_secasindex(saidx, NULL));
return (NULL);
}
/*
* Allocate a usable SA entry for an *INBOUND* packet.
* Must call key_freesav() later.
* OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state).
* NULL: not found, or error occurred.
*
* According to RFC 2401 an SA is uniquely identified by the triple of SPI,
* destination address, and security protocol. But according to RFC 4301,
* the SPI by itself suffices to specify an SA.
*
* Note, however, that we still need to keep the source address in the IPsec
* SA: both the IKE and PF_KEY specifications assume that we do, which puts
* us in a somewhat tricky situation.
*/
struct secasvar *
key_allocsa(union sockaddr_union *dst, uint8_t proto, uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
IPSEC_ASSERT(proto == IPPROTO_ESP || proto == IPPROTO_AH ||
proto == IPPROTO_IPCOMP, ("unexpected security protocol %u",
proto));
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi == spi)
break;
}
/*
* We use a single SPI namespace for all protocols, so it is
* impossible to have SPI duplicates in the SAVHASH.
*/
if (sav != NULL) {
if (sav->state != SADB_SASTATE_LARVAL &&
sav->sah->saidx.proto == proto &&
key_sockaddrcmp(&dst->sa,
&sav->sah->saidx.dst.sa, 0) == 0)
SAV_ADDREF(sav);
else
sav = NULL;
}
SAHTREE_RUNLOCK();
if (sav == NULL) {
KEYDBG(IPSEC_STAMP,
char buf[IPSEC_ADDRSTRLEN];
printf("%s: SA not found for spi %u proto %u dst %s\n",
__func__, ntohl(spi), proto, ipsec_address(dst, buf,
sizeof(buf))));
} else {
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
}
return (sav);
}
struct secasvar *
key_allocsa_tunnel(union sockaddr_union *src, union sockaddr_union *dst,
uint8_t proto)
{
SAHTREE_RLOCK_TRACKER;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(src != NULL, ("null src address"));
IPSEC_ASSERT(dst != NULL, ("null dst address"));
KEY_SETSECASIDX(proto, IPSEC_MODE_TUNNEL, 0, &src->sa,
&dst->sa, &saidx);
sav = NULL;
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(&saidx), addrhash) {
if (IPSEC_MODE_TUNNEL != sah->saidx.mode)
continue;
if (proto != sah->saidx.proto)
continue;
if (key_sockaddrcmp(&src->sa, &sah->saidx.src.sa, 0) != 0)
continue;
if (key_sockaddrcmp(&dst->sa, &sah->saidx.dst.sa, 0) != 0)
continue;
/* XXXAE: is key_preferred_oldsa reasonable here? */
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL) {
SAV_ADDREF(sav);
break;
}
}
SAHTREE_RUNLOCK();
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
if (sav != NULL)
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
return (sav);
}
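/*
* Note: as with key_allocsa(), the SA returned by key_allocsa_tunnel() is
* referenced, and the caller is expected to release it with key_freesav().
*/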
/*
* Must be called after calling key_allocsp().
*/
void
key_freesp(struct secpolicy **spp)
{
struct secpolicy *sp = *spp;
IPSEC_ASSERT(sp != NULL, ("null sp"));
if (SP_DELREF(sp) == 0)
return;
KEYDBG(IPSEC_STAMP,
printf("%s: last reference to SP(%p)\n", __func__, sp));
KEYDBG(IPSEC_DATA, kdebug_secpolicy(sp));
*spp = NULL;
while (sp->tcount > 0)
ipsec_delisr(sp->req[--sp->tcount]);
free(sp, M_IPSEC_SP);
}
static void
key_unlink(struct secpolicy *sp)
{
IPSEC_ASSERT(sp->spidx.dir == IPSEC_DIR_INBOUND ||
sp->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", sp->spidx.dir));
SPTREE_UNLOCK_ASSERT();
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
SPTREE_WLOCK();
if (sp->state != IPSEC_SPSTATE_ALIVE) {
/* SP is already unlinked */
SPTREE_WUNLOCK();
return;
}
sp->state = IPSEC_SPSTATE_DEAD;
TAILQ_REMOVE(&V_sptree[sp->spidx.dir], sp, chain);
LIST_REMOVE(sp, idhash);
V_sp_genid++;
SPTREE_WUNLOCK();
key_freesp(&sp);
}
/*
* Insert a secpolicy into the SP database. Lower priority values come first.
*/
static void
key_insertsp(struct secpolicy *newsp)
{
struct secpolicy *sp;
SPTREE_WLOCK_ASSERT();
TAILQ_FOREACH(sp, &V_sptree[newsp->spidx.dir], chain) {
if (newsp->priority < sp->priority) {
TAILQ_INSERT_BEFORE(sp, newsp, chain);
goto done;
}
}
TAILQ_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, chain);
done:
LIST_INSERT_HEAD(SPHASH_HASH(newsp->id), newsp, idhash);
newsp->state = IPSEC_SPSTATE_ALIVE;
V_sp_genid++;
}
/*
* Insert a bunch of VTI secpolicies into the SPDB.
* We keep VTI policies in a separate list for the following reasons:
* 1) they should be immune to attempts by users or daemons to delete
* them; the only way to delete such policies is to destroy or
* unconfigure the corresponding virtual interface.
* 2) such policies have a traffic selector that matches all traffic per
* address family.
* Since all VTI policies have the same priority, we don't care about
* their order.
*/
int
key_register_ifnet(struct secpolicy **spp, u_int count)
{
struct mbuf *m;
u_int i;
SPTREE_WLOCK();
/*
* First of all, try to acquire an id for each SP.
*/
for (i = 0; i < count; i++) {
IPSEC_ASSERT(spp[i]->spidx.dir == IPSEC_DIR_INBOUND ||
spp[i]->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", spp[i]->spidx.dir));
if ((spp[i]->id = key_getnewspid()) == 0) {
SPTREE_WUNLOCK();
return (EAGAIN);
}
}
for (i = 0; i < count; i++) {
TAILQ_INSERT_TAIL(&V_sptree_ifnet[spp[i]->spidx.dir],
spp[i], chain);
/*
* NOTE: despite the fact that we keep VTI SP in the
* separate list, SPHASH contains policies from both
* sources. Thus SADB_X_SPDGET will correctly return
* SP by id, because it uses SPHASH for lookups.
*/
LIST_INSERT_HEAD(SPHASH_HASH(spp[i]->id), spp[i], idhash);
spp[i]->state = IPSEC_SPSTATE_IFNET;
}
SPTREE_WUNLOCK();
/*
* Notify user processes about new SP.
*/
for (i = 0; i < count; i++) {
m = key_setdumpsp(spp[i], SADB_X_SPDADD, 0, 0);
if (m != NULL)
key_sendup_mbuf(NULL, m, KEY_SENDUP_ALL);
}
return (0);
}
void
key_unregister_ifnet(struct secpolicy **spp, u_int count)
{
struct mbuf *m;
u_int i;
SPTREE_WLOCK();
for (i = 0; i < count; i++) {
IPSEC_ASSERT(spp[i]->spidx.dir == IPSEC_DIR_INBOUND ||
spp[i]->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", spp[i]->spidx.dir));
if (spp[i]->state != IPSEC_SPSTATE_IFNET)
continue;
spp[i]->state = IPSEC_SPSTATE_DEAD;
TAILQ_REMOVE(&V_sptree_ifnet[spp[i]->spidx.dir],
spp[i], chain);
LIST_REMOVE(spp[i], idhash);
}
SPTREE_WUNLOCK();
for (i = 0; i < count; i++) {
m = key_setdumpsp(spp[i], SADB_X_SPDDELETE, 0, 0);
if (m != NULL)
key_sendup_mbuf(NULL, m, KEY_SENDUP_ALL);
}
}
/*
* Must be called after calling key_allocsa().
* This function is called by key_freesp() to free some SA allocated
* for a policy.
*/
void
key_freesav(struct secasvar **psav)
{
struct secasvar *sav = *psav;
IPSEC_ASSERT(sav != NULL, ("null sav"));
if (SAV_DELREF(sav) == 0)
return;
KEYDBG(IPSEC_STAMP,
printf("%s: last reference to SA(%p)\n", __func__, sav));
*psav = NULL;
key_delsav(sav);
}
/*
* Unlink SA from SAH and SPI hash under SAHTREE_WLOCK.
* Expect that SA has extra reference due to lookup.
* Release this reference, and also release the SAH reference after unlinking.
*/
static void
key_unlinksav(struct secasvar *sav)
{
struct secashead *sah;
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
SAHTREE_UNLOCK_ASSERT();
SAHTREE_WLOCK();
if (sav->state == SADB_SASTATE_DEAD) {
/* SA is already unlinked */
SAHTREE_WUNLOCK();
return;
}
/* Unlink from SAH */
if (sav->state == SADB_SASTATE_LARVAL)
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
else
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
/* Unlink from SPI hash */
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sah = sav->sah;
SAHTREE_WUNLOCK();
key_freesav(&sav);
/* Since we are unlinked, release reference to SAH */
key_freesah(&sah);
}
/* %%% SPD management */
/*
* search SPD
* OUT: NULL : not found
* others : found, pointer to a SP.
*/
static struct secpolicy *
key_getsp(struct secpolicyindex *spidx)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
SPTREE_RLOCK();
TAILQ_FOREACH(sp, &V_sptree[spidx->dir], chain) {
if (key_cmpspidx_exactly(spidx, &sp->spidx)) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
return sp;
}
/*
* get SP by id.
* OUT: NULL : not found
* others : found, pointer to referenced SP.
*/
static struct secpolicy *
key_getspbyid(uint32_t id)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
SPTREE_RLOCK();
LIST_FOREACH(sp, SPHASH_HASH(id), idhash) {
if (sp->id == id) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
return (sp);
}
struct secpolicy *
key_newsp(void)
{
struct secpolicy *sp;
sp = malloc(sizeof(*sp), M_IPSEC_SP, M_NOWAIT | M_ZERO);
if (sp != NULL)
SP_INITREF(sp);
return (sp);
}
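/*
* The SP returned by key_newsp() carries a single reference; drop it with
* key_freesp() when it is no longer needed (see, for example, the error
* paths in key_msg2sp()).
*/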
struct ipsecrequest *
ipsec_newisr(void)
{
return (malloc(sizeof(struct ipsecrequest), M_IPSEC_SR,
M_NOWAIT | M_ZERO));
}
void
ipsec_delisr(struct ipsecrequest *p)
{
free(p, M_IPSEC_SR);
}
/*
* create secpolicy structure from sadb_x_policy structure.
* NOTE: `state', `secpolicyindex' and `id' in the secpolicy structure
* are not set here and must be set properly later.
*/
struct secpolicy *
key_msg2sp(struct sadb_x_policy *xpl0, size_t len, int *error)
{
struct secpolicy *newsp;
IPSEC_ASSERT(xpl0 != NULL, ("null xpl0"));
IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len));
if (len != PFKEY_EXTLEN(xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__));
*error = EINVAL;
return NULL;
}
if ((newsp = key_newsp()) == NULL) {
*error = ENOBUFS;
return NULL;
}
newsp->spidx.dir = xpl0->sadb_x_policy_dir;
newsp->policy = xpl0->sadb_x_policy_type;
newsp->priority = xpl0->sadb_x_policy_priority;
newsp->tcount = 0;
/* check policy */
switch (xpl0->sadb_x_policy_type) {
case IPSEC_POLICY_DISCARD:
case IPSEC_POLICY_NONE:
case IPSEC_POLICY_ENTRUST:
case IPSEC_POLICY_BYPASS:
break;
case IPSEC_POLICY_IPSEC:
{
struct sadb_x_ipsecrequest *xisr;
struct ipsecrequest *isr;
int tlen;
/* validity check */
if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0);
xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1);
while (tlen > 0) {
/* length check */
if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr) ||
xisr->sadb_x_ipsecrequest_len > tlen) {
ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest "
"length.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
if (newsp->tcount >= IPSEC_MAXREQ) {
ipseclog((LOG_DEBUG,
"%s: too many ipsecrequests.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
/* allocate request buffer */
/* NB: data structure is zero'd */
isr = ipsec_newisr();
if (isr == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
key_freesp(&newsp);
*error = ENOBUFS;
return NULL;
}
newsp->req[newsp->tcount++] = isr;
/* set values */
switch (xisr->sadb_x_ipsecrequest_proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
case IPPROTO_IPCOMP:
break;
default:
ipseclog((LOG_DEBUG,
"%s: invalid proto type=%u\n", __func__,
xisr->sadb_x_ipsecrequest_proto));
key_freesp(&newsp);
*error = EPROTONOSUPPORT;
return NULL;
}
isr->saidx.proto =
(uint8_t)xisr->sadb_x_ipsecrequest_proto;
switch (xisr->sadb_x_ipsecrequest_mode) {
case IPSEC_MODE_TRANSPORT:
case IPSEC_MODE_TUNNEL:
break;
case IPSEC_MODE_ANY:
default:
ipseclog((LOG_DEBUG,
"%s: invalid mode=%u\n", __func__,
xisr->sadb_x_ipsecrequest_mode));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
isr->saidx.mode = xisr->sadb_x_ipsecrequest_mode;
switch (xisr->sadb_x_ipsecrequest_level) {
case IPSEC_LEVEL_DEFAULT:
case IPSEC_LEVEL_USE:
case IPSEC_LEVEL_REQUIRE:
break;
case IPSEC_LEVEL_UNIQUE:
/* validity check */
/*
* If the reqid violates its allowed range, the kernel
* updates it rather than refusing the request.
*/
if (xisr->sadb_x_ipsecrequest_reqid
> IPSEC_MANUAL_REQID_MAX) {
ipseclog((LOG_DEBUG,
"%s: reqid=%d range "
"violation, updated by kernel.\n",
__func__,
xisr->sadb_x_ipsecrequest_reqid));
xisr->sadb_x_ipsecrequest_reqid = 0;
}
/* allocate a new reqid if reqid is zero. */
if (xisr->sadb_x_ipsecrequest_reqid == 0) {
u_int32_t reqid;
if ((reqid = key_newreqid()) == 0) {
key_freesp(&newsp);
*error = ENOBUFS;
return NULL;
}
isr->saidx.reqid = reqid;
xisr->sadb_x_ipsecrequest_reqid = reqid;
} else {
/* set it for manual keying. */
isr->saidx.reqid =
xisr->sadb_x_ipsecrequest_reqid;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid level=%u\n",
__func__,
xisr->sadb_x_ipsecrequest_level));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
isr->level = xisr->sadb_x_ipsecrequest_level;
/* set IP addresses if present */
if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) {
struct sockaddr *paddr;
len = tlen - sizeof(*xisr);
paddr = (struct sockaddr *)(xisr + 1);
/* validity check */
if (len < sizeof(struct sockaddr) ||
len < 2 * paddr->sa_len ||
paddr->sa_len > sizeof(isr->saidx.src)) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
/*
* The request length should be large enough to hold
* the source and destination addresses.
*/
if (xisr->sadb_x_ipsecrequest_len <
sizeof(*xisr) + 2 * paddr->sa_len) {
ipseclog((LOG_DEBUG, "%s: invalid "
"ipsecrequest length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
bcopy(paddr, &isr->saidx.src, paddr->sa_len);
paddr = (struct sockaddr *)((caddr_t)paddr +
paddr->sa_len);
/* validity check */
if (paddr->sa_len !=
isr->saidx.src.sa.sa_len) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
/* AF family should match */
if (paddr->sa_family !=
isr->saidx.src.sa.sa_family) {
ipseclog((LOG_DEBUG, "%s: address "
"family doesn't match.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
bcopy(paddr, &isr->saidx.dst, paddr->sa_len);
} else {
/*
* Addresses for TUNNEL mode requests are
* mandatory.
*/
if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
ipseclog((LOG_DEBUG, "%s: missing "
"request addresses.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
}
tlen -= xisr->sadb_x_ipsecrequest_len;
/* validity check */
if (tlen < 0) {
ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr
+ xisr->sadb_x_ipsecrequest_len);
}
/* XXXAE: LARVAL SP */
if (newsp->tcount < 1) {
ipseclog((LOG_DEBUG, "%s: valid IPSEC transforms "
"not found.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
*error = 0;
return (newsp);
}
uint32_t
key_newreqid(void)
{
static uint32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
if (auto_reqid == ~0)
auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
else
auto_reqid++;
/* XXX should check for uniqueness */
return (auto_reqid);
}
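/*
* Automatically assigned reqids live above IPSEC_MANUAL_REQID_MAX and wrap
* back to IPSEC_MANUAL_REQID_MAX + 1 on overflow; values up to
* IPSEC_MANUAL_REQID_MAX are reserved for manual keying (see the
* IPSEC_LEVEL_UNIQUE handling in key_msg2sp()).
*/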
/*
* Copy a secpolicy struct into the indicated sadb_x_policy structure.
*/
static struct mbuf *
key_sp2mbuf(struct secpolicy *sp)
{
struct mbuf *m;
size_t tlen;
tlen = key_getspreqmsglen(sp);
m = m_get2(tlen, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, tlen);
m->m_len = tlen;
if (key_sp2msg(sp, m->m_data, &tlen) != 0) {
m_freem(m);
return (NULL);
}
return (m);
}
int
key_sp2msg(struct secpolicy *sp, void *request, size_t *len)
{
struct sadb_x_ipsecrequest *xisr;
struct sadb_x_policy *xpl;
struct ipsecrequest *isr;
size_t xlen, ilen;
caddr_t p;
int error, i;
IPSEC_ASSERT(sp != NULL, ("null policy"));
xlen = sizeof(*xpl);
if (*len < xlen)
return (EINVAL);
error = 0;
bzero(request, *len);
xpl = (struct sadb_x_policy *)request;
xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
xpl->sadb_x_policy_type = sp->policy;
xpl->sadb_x_policy_dir = sp->spidx.dir;
xpl->sadb_x_policy_id = sp->id;
xpl->sadb_x_policy_priority = sp->priority;
switch (sp->state) {
case IPSEC_SPSTATE_IFNET:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_IFNET;
break;
case IPSEC_SPSTATE_PCB:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_PCB;
break;
default:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_GLOBAL;
}
/* is this the policy for IPsec? */
if (sp->policy == IPSEC_POLICY_IPSEC) {
p = (caddr_t)xpl + sizeof(*xpl);
for (i = 0; i < sp->tcount; i++) {
isr = sp->req[i];
ilen = PFKEY_ALIGN8(sizeof(*xisr) +
isr->saidx.src.sa.sa_len +
isr->saidx.dst.sa.sa_len);
xlen += ilen;
if (xlen > *len) {
error = ENOBUFS;
/* Calculate needed size */
continue;
}
xisr = (struct sadb_x_ipsecrequest *)p;
xisr->sadb_x_ipsecrequest_len = ilen;
xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto;
xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode;
xisr->sadb_x_ipsecrequest_level = isr->level;
xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid;
p += sizeof(*xisr);
bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len);
p += isr->saidx.src.sa.sa_len;
bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len);
p += isr->saidx.dst.sa.sa_len;
}
}
xpl->sadb_x_policy_len = PFKEY_UNIT64(xlen);
if (error == 0)
*len = xlen;
else
*len = sizeof(*xpl);
return (error);
}
/* m will not be freed nor modified */
static struct mbuf *
key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp,
int ndeep, int nitem, ...)
{
va_list ap;
int idx;
int i;
struct mbuf *result = NULL, *n;
int len;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
va_start(ap, nitem);
for (i = 0; i < nitem; i++) {
idx = va_arg(ap, int);
if (idx < 0 || idx > SADB_EXT_MAX)
goto fail;
/* don't attempt to pull empty extension */
if (idx == SADB_EXT_RESERVED && mhp->msg == NULL)
continue;
if (idx != SADB_EXT_RESERVED &&
(mhp->ext[idx] == NULL || mhp->extlen[idx] == 0))
continue;
if (idx == SADB_EXT_RESERVED) {
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (!n)
goto fail;
n->m_len = len;
n->m_next = NULL;
m_copydata(m, 0, sizeof(struct sadb_msg),
mtod(n, caddr_t));
} else if (i < ndeep) {
len = mhp->extlen[idx];
n = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (n == NULL)
goto fail;
m_align(n, len);
n->m_len = len;
m_copydata(m, mhp->extoff[idx], mhp->extlen[idx],
mtod(n, caddr_t));
} else {
n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx],
M_NOWAIT);
}
if (n == NULL)
goto fail;
if (result)
m_cat(result, n);
else
result = n;
}
va_end(ap);
if ((result->m_flags & M_PKTHDR) != 0) {
result->m_pkthdr.len = 0;
for (n = result; n; n = n->m_next)
result->m_pkthdr.len += n->m_len;
}
return result;
fail:
m_freem(result);
va_end(ap);
return NULL;
}
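/*
* The variadic arguments to key_gather_mbuf() are SADB extension indexes;
* SADB_EXT_RESERVED stands for the base sadb_msg header. The first `ndeep'
* extensions are deep-copied into freshly allocated mbufs, the rest are
* taken with m_copym() and may share storage with the original chain.
* Example from key_spddelete():
* n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
*     SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
*/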
/*
* SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing
* Add an entry to the SP database when
* <base, address(SD), (lifetime(H),) policy>
* is received from the user(?), and send
* <base, address(SD), (lifetime(H),) policy>
* back to the socket which sent it.
*
* SPDADD sets a unique policy entry.
* SPDSETIDX is like SPDADD without the policy request part.
* SPDUPDATE replaces a unique policy entry.
*
* XXXAE: serialize this in PF_KEY to avoid races.
* m will always be freed.
*/
static int
key_spdadd(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicyindex spidx;
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0, *xpl;
struct sadb_lifetime *lft = NULL;
struct secpolicy *newsp;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_HARD)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* check the direction */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid SP direction.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* key_spdadd() accepts DISCARD, NONE and IPSEC. */
if (xpl0->sadb_x_policy_type != IPSEC_POLICY_DISCARD &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_NONE &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_IPSEC) {
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* policy requests are mandatory when action is ipsec. */
if (xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) {
ipseclog((LOG_DEBUG,
"%s: policy requests required.\n", __func__));
return key_senderror(so, m, EINVAL);
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0 ||
src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
/* make secindex */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* Checking there is SP already or not. */
newsp = key_getsp(&spidx);
if (newsp != NULL) {
if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
KEYDBG(KEY_STAMP,
printf("%s: unlink SP(%p) for SPDUPDATE\n",
__func__, newsp));
KEYDBG(KEY_DATA, kdebug_secpolicy(newsp));
key_unlink(newsp);
key_freesp(&newsp);
} else {
key_freesp(&newsp);
ipseclog((LOG_DEBUG, "%s: a SP entry exists already.",
__func__));
return (key_senderror(so, m, EEXIST));
}
}
/* allocate new SP entry */
if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) {
return key_senderror(so, m, error);
}
newsp->lastused = newsp->created = time_second;
newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0;
newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0;
bcopy(&spidx, &newsp->spidx, sizeof(spidx));
/* XXXAE: there is race between key_getsp() and key_insertsp() */
SPTREE_WLOCK();
if ((newsp->id = key_getnewspid()) == 0) {
SPTREE_WUNLOCK();
key_freesp(&newsp);
return key_senderror(so, m, ENOBUFS);
}
key_insertsp(newsp);
SPTREE_WUNLOCK();
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, newsp));
KEYDBG(KEY_DATA, kdebug_secpolicy(newsp));
{
struct mbuf *n, *mpolicy;
struct sadb_msg *newmsg;
int off;
/* create new sadb_msg to reply. */
if (lft) {
n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
} else {
n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
}
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(*newmsg)) {
n = m_pullup(n, sizeof(*newmsg));
if (!n)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
off = 0;
mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)),
sizeof(*xpl), &off);
if (mpolicy == NULL) {
/* n is already freed */
return key_senderror(so, m, ENOBUFS);
}
xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off);
if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) {
m_freem(n);
return key_senderror(so, m, EINVAL);
}
xpl->sadb_x_policy_id = newsp->id;
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* get new policy id.
* OUT:
* 0: failure.
* others: success.
*/
static uint32_t
key_getnewspid(void)
{
struct secpolicy *sp;
uint32_t newid = 0;
int count = V_key_spi_trycnt; /* XXX */
SPTREE_WLOCK_ASSERT();
while (count--) {
if (V_policy_id == ~0) /* overflowed */
newid = V_policy_id = 1;
else
newid = ++V_policy_id;
LIST_FOREACH(sp, SPHASH_HASH(newid), idhash) {
if (sp->id == newid)
break;
}
if (sp == NULL)
break;
}
if (count == 0 || newid == 0) {
ipseclog((LOG_DEBUG, "%s: failed to allocate policy id.\n",
__func__));
return (0);
}
return (newid);
}
/*
* SADB_SPDDELETE processing
* receive
* <base, address(SD), policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, address(SD), policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp)
{
struct secpolicyindex spidx;
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0;
struct secpolicy *sp;
IPSEC_ASSERT(so != NULL, ("null so"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* check the direction */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid SP direction.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* Only DISCARD, NONE and IPSEC are allowed */
if (xpl0->sadb_x_policy_type != IPSEC_POLICY_DISCARD &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_NONE &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_IPSEC) {
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0 ||
src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* make secindex */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* Is there SP in SPD ? */
if ((sp = key_getsp(&spidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* save policy id to buffer to be returned. */
xpl0->sadb_x_policy_id = sp->id;
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
key_unlink(sp);
key_freesp(&sp);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_SPDDELETE2 processing
* receive
* <base, policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete2(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp)
{
struct secpolicy *sp;
uint32_t id;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)
mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found for id %u.\n",
__func__, id));
return key_senderror(so, m, EINVAL);
}
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
key_unlink(sp);
if (sp->state != IPSEC_SPSTATE_DEAD) {
ipseclog((LOG_DEBUG, "%s: failed to delete SP with id %u.\n",
__func__, id));
key_freesp(&sp);
return (key_senderror(so, m, EACCES));
}
key_freesp(&sp);
{
struct mbuf *n, *nn;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)",
off, len));
n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY],
mhp->extlen[SADB_X_EXT_POLICY], M_NOWAIT);
if (!n->m_next) {
m_freem(n);
return key_senderror(so, m, ENOBUFS);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_X_SPDGET processing
* receive
* <base, policy(*)>
* from the user(?),
* and send,
* <base, address(SD), policy>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spdget(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicy *sp;
struct mbuf *n;
uint32_t id;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)
mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found for id %u.\n",
__func__, id));
return key_senderror(so, m, ENOENT);
}
n = key_setdumpsp(sp, SADB_X_SPDGET, mhp->msg->sadb_msg_seq,
mhp->msg->sadb_msg_pid);
key_freesp(&sp);
if (n != NULL) {
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
} else
return key_senderror(so, m, ENOBUFS);
}
/*
* SADB_X_SPDACQUIRE processing.
* Acquire policy and SA(s) for an *OUTBOUND* packet.
* send
* <base, policy(*)>
* to KMD, and expect to receive
* <base> with SADB_X_SPDACQUIRE if error occurred,
* or
* <base, policy>
* with SADB_X_SPDUPDATE from KMD by PF_KEY.
* policy(*) is without policy requests.
*
* 0 : succeed
* others: error number
*/
int
key_spdacquire(struct secpolicy *sp)
{
struct mbuf *result = NULL, *m;
struct secspacq *newspacq;
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
IPSEC_ASSERT(sp->req == NULL, ("policy exists"));
IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
("policy not IPSEC %u", sp->policy));
/* Get an entry to check whether we have already sent a message. */
newspacq = key_getspacq(&sp->spidx);
if (newspacq != NULL) {
if (V_key_blockacq_count < newspacq->count) {
/* reset counter and do send message. */
newspacq->count = 0;
} else {
/* increment counter and do nothing. */
newspacq->count++;
SPACQ_UNLOCK();
return (0);
}
SPACQ_UNLOCK();
} else {
/* make new entry for blocking to send SADB_ACQUIRE. */
newspacq = key_newspacq(&sp->spidx);
if (newspacq == NULL)
return ENOBUFS;
}
/* create new sadb_msg to reply. */
m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0);
if (!m)
return ENOBUFS;
result = m;
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
}
/*
* SADB_SPDFLUSH processing
* receive
* <base>
* from the user, and free all entries in secpctree.
* and send,
* <base>
* to the user.
* NOTE: all this does is mark the entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_spdflush(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicy_queue drainq;
struct sadb_msg *newmsg;
struct secpolicy *sp, *nextsp;
u_int dir;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg)))
return key_senderror(so, m, EINVAL);
TAILQ_INIT(&drainq);
SPTREE_WLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
TAILQ_CONCAT(&drainq, &V_sptree[dir], chain);
}
/*
* We need to set the state to DEAD for each policy to be sure
* that another thread won't try to unlink it.
* Also remove SP from sphash.
*/
TAILQ_FOREACH(sp, &drainq, chain) {
sp->state = IPSEC_SPSTATE_DEAD;
LIST_REMOVE(sp, idhash);
}
V_sp_genid++;
SPTREE_WUNLOCK();
sp = TAILQ_FIRST(&drainq);
while (sp != NULL) {
nextsp = TAILQ_NEXT(sp, chain);
key_freesp(&sp);
sp = nextsp;
}
if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
static uint8_t
key_satype2scopemask(uint8_t satype)
{
if (satype == IPSEC_POLICYSCOPE_ANY)
return (0xff);
return (satype);
}
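/*
* For SADB_SPDDUMP the sadb_msg_satype field is treated as a mask of policy
* scopes; IPSEC_POLICYSCOPE_ANY selects every scope, so key_spddump() walks
* both the global SPD and the ifnet (VTI) policies in that case.
*/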
/*
* SADB_SPDDUMP processing
* receive
* <base>
* from the user, and dump all SP leaves and send,
* <base> .....
* to the ikmpd.
*
* NOTE:
* sadb_msg_satype is treated as a mask of policy scopes.
* m will always be freed.
*/
static int
key_spddump(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
struct mbuf *n;
int cnt;
u_int dir, scope;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* search SPD entry and get buffer size. */
cnt = 0;
scope = key_satype2scopemask(mhp->msg->sadb_msg_satype);
SPTREE_RLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
if (scope & IPSEC_POLICYSCOPE_GLOBAL) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain)
cnt++;
}
if (scope & IPSEC_POLICYSCOPE_IFNET) {
TAILQ_FOREACH(sp, &V_sptree_ifnet[dir], chain)
cnt++;
}
}
if (cnt == 0) {
SPTREE_RUNLOCK();
return key_senderror(so, m, ENOENT);
}
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
if (scope & IPSEC_POLICYSCOPE_GLOBAL) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
--cnt;
n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
mhp->msg->sadb_msg_pid);
if (n != NULL)
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
if (scope & IPSEC_POLICYSCOPE_IFNET) {
TAILQ_FOREACH(sp, &V_sptree_ifnet[dir], chain) {
--cnt;
n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
mhp->msg->sadb_msg_pid);
if (n != NULL)
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
}
SPTREE_RUNLOCK();
m_freem(m);
return (0);
}
static struct mbuf *
key_setdumpsp(struct secpolicy *sp, u_int8_t type, u_int32_t seq,
u_int32_t pid)
{
struct mbuf *result = NULL, *m;
struct seclifetime lt;
m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt);
if (!m)
goto fail;
result = m;
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa, sp->spidx.prefs,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa, sp->spidx.prefd,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_sp2mbuf(sp);
if (!m)
goto fail;
m_cat(result, m);
if (sp->lifetime) {
lt.addtime = sp->created;
lt.usetime = sp->lastused;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
m_cat(result, m);
lt.addtime = sp->lifetime;
lt.usetime = sp->validtime;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
m_cat(result, m);
}
if ((result->m_flags & M_PKTHDR) == 0)
goto fail;
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
return NULL;
}
/*
* get PFKEY message length for security policy and request.
*/
static size_t
key_getspreqmsglen(struct secpolicy *sp)
{
size_t tlen, len;
int i;
tlen = sizeof(struct sadb_x_policy);
/* is this the policy for IPsec? */
if (sp->policy != IPSEC_POLICY_IPSEC)
return (tlen);
/* get length of ipsec requests */
for (i = 0; i < sp->tcount; i++) {
len = sizeof(struct sadb_x_ipsecrequest)
+ sp->req[i]->saidx.src.sa.sa_len
+ sp->req[i]->saidx.dst.sa.sa_len;
tlen += PFKEY_ALIGN8(len);
}
return (tlen);
}
/*
* SADB_SPDEXPIRE processing
* send
* <base, address(SD), lifetime(CH), policy>
* to KMD by PF_KEY.
*
* OUT: 0 : succeed
* others : error number
*/
static int
key_spdexpire(struct secpolicy *sp)
{
struct sadb_lifetime *lt;
struct mbuf *result = NULL, *m;
int len, error = -1;
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
/* set msg header */
m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create lifetime extension (current and hard) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL) {
error = ENOBUFS;
goto fail;
}
m_align(m, len);
m->m_len = len;
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->created;
lt->sadb_lifetime_usetime = sp->lastused;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->lifetime;
lt->sadb_lifetime_usetime = sp->validtime;
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa,
sp->spidx.prefs, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa,
sp->spidx.prefd, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set secpolicy */
m = key_sp2mbuf(sp);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
/* %%% SAD management */
/*
* Allocate and initialize a new SA head.
* OUT: NULL : failure due to lack of memory.
* others : pointer to new SA head.
*/
static struct secashead *
key_newsah(struct secasindex *saidx)
{
struct secashead *sah;
sah = malloc(sizeof(struct secashead), M_IPSEC_SAH,
M_NOWAIT | M_ZERO);
if (sah == NULL) {
PFKEYSTAT_INC(in_nomem);
return (NULL);
}
TAILQ_INIT(&sah->savtree_larval);
TAILQ_INIT(&sah->savtree_alive);
sah->saidx = *saidx;
sah->state = SADB_SASTATE_DEAD;
SAH_INITREF(sah);
KEYDBG(KEY_STAMP,
printf("%s: SAH(%p)\n", __func__, sah));
KEYDBG(KEY_DATA, kdebug_secash(sah, NULL));
return (sah);
}
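/*
* A freshly created SAH starts in the DEAD state; key_newsav() switches it
* to MATURE once the head has been linked into the SAD and the address hash
* under the SAHTREE write lock.
*/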
static void
key_freesah(struct secashead **psah)
{
struct secashead *sah = *psah;
if (SAH_DELREF(sah) == 0)
return;
KEYDBG(KEY_STAMP,
printf("%s: last reference to SAH(%p)\n", __func__, sah));
KEYDBG(KEY_DATA, kdebug_secash(sah, NULL));
*psah = NULL;
key_delsah(sah);
}
static void
key_delsah(struct secashead *sah)
{
IPSEC_ASSERT(sah != NULL, ("NULL sah"));
IPSEC_ASSERT(sah->state == SADB_SASTATE_DEAD,
("Attempt to free non DEAD SAH %p", sah));
IPSEC_ASSERT(TAILQ_EMPTY(&sah->savtree_larval),
("Attempt to free SAH %p with LARVAL SA", sah));
IPSEC_ASSERT(TAILQ_EMPTY(&sah->savtree_alive),
("Attempt to free SAH %p with ALIVE SA", sah));
free(sah, M_IPSEC_SAH);
}
/*
* Allocate a new SA for a key_add() or key_getspi() call,
* and copy the values of mhp into the new buffer.
* When the SADB message type is SADB_GETSPI, set the SA state to LARVAL.
* For SADB_ADD, create and initialize the SA in the MATURE state.
* OUT: NULL : fail
* others : pointer to new secasvar.
*/
static struct secasvar *
key_newsav(const struct sadb_msghdr *mhp, struct secasindex *saidx,
uint32_t spi, int *errp)
{
struct secashead *sah;
struct secasvar *sav;
int isnew;
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
IPSEC_ASSERT(mhp->msg->sadb_msg_type == SADB_GETSPI ||
mhp->msg->sadb_msg_type == SADB_ADD, ("wrong message type"));
sav = NULL;
sah = NULL;
/* check SPI value */
switch (saidx->proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
/*
* RFC 4302, 2.4. Security Parameters Index (SPI): SPI values
* 1-255 are reserved by IANA for future use and the value 0 is
* reserved for implementation-specific, local use.
*/
if (ntohl(spi) <= 255) {
ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n",
__func__, ntohl(spi)));
*errp = EINVAL;
goto done;
}
break;
}
sav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT | M_ZERO);
if (sav == NULL) {
*errp = ENOBUFS;
goto done;
}
sav->lock = malloc(sizeof(struct mtx), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->lock == NULL) {
*errp = ENOBUFS;
goto done;
}
mtx_init(sav->lock, "ipsec association", NULL, MTX_DEF);
sav->lft_c = uma_zalloc(V_key_lft_zone, M_NOWAIT);
if (sav->lft_c == NULL) {
*errp = ENOBUFS;
goto done;
}
counter_u64_zero(sav->lft_c_allocations);
counter_u64_zero(sav->lft_c_bytes);
sav->spi = spi;
sav->seq = mhp->msg->sadb_msg_seq;
sav->state = SADB_SASTATE_LARVAL;
sav->pid = (pid_t)mhp->msg->sadb_msg_pid;
SAV_INITREF(sav);
again:
sah = key_getsah(saidx);
if (sah == NULL) {
/* create a new SA index */
sah = key_newsah(saidx);
if (sah == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
*errp = ENOBUFS;
goto done;
}
isnew = 1;
} else
isnew = 0;
sav->sah = sah;
if (mhp->msg->sadb_msg_type == SADB_GETSPI) {
sav->created = time_second;
} else if (sav->state == SADB_SASTATE_LARVAL) {
/*
* Do not call key_setsaval() second time in case
* of `goto again`. We will have MATURE state.
*/
*errp = key_setsaval(sav, mhp);
if (*errp != 0)
goto done;
sav->state = SADB_SASTATE_MATURE;
}
SAHTREE_WLOCK();
/*
* Check that existing SAH wasn't unlinked.
* Since we didn't hold the SAHTREE lock, it is possible,
* that callout handler or key_flush() or key_delete() could
* unlink this SAH.
*/
if (isnew == 0 && sah->state == SADB_SASTATE_DEAD) {
SAHTREE_WUNLOCK();
key_freesah(&sah); /* reference from key_getsah() */
goto again;
}
if (isnew != 0) {
/*
* Add new SAH into SADB.
*
* XXXAE: we can serialize key_add and key_getspi calls, so
* several threads will not race here.
* Otherwise we should check under the SAHTREE lock that this
* SAH is not added twice.
*/
TAILQ_INSERT_HEAD(&V_sahtree, sah, chain);
/* Add new SAH into hash by addresses */
LIST_INSERT_HEAD(SAHADDRHASH_HASH(saidx), sah, addrhash);
/* Now we are linked in the chain */
sah->state = SADB_SASTATE_MATURE;
/*
* SAV references this new SAH.
* In case of existing SAH we reuse reference
* from key_getsah().
*/
SAH_ADDREF(sah);
}
/* Link SAV with SAH */
if (sav->state == SADB_SASTATE_MATURE)
TAILQ_INSERT_HEAD(&sah->savtree_alive, sav, chain);
else
TAILQ_INSERT_HEAD(&sah->savtree_larval, sav, chain);
/* Add SAV into SPI hash */
LIST_INSERT_HEAD(SAVHASH_HASH(sav->spi), sav, spihash);
SAHTREE_WUNLOCK();
*errp = 0; /* success */
done:
if (*errp != 0) {
if (sav != NULL) {
if (sav->lock != NULL) {
mtx_destroy(sav->lock);
free(sav->lock, M_IPSEC_MISC);
}
if (sav->lft_c != NULL)
uma_zfree(V_key_lft_zone, sav->lft_c);
free(sav, M_IPSEC_SA);
sav = NULL;
}
if (sah != NULL)
key_freesah(&sah);
if (*errp == ENOBUFS) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
}
}
return (sav);
}
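/*
* Error handling above: on any failure key_newsav() releases the
* partially constructed SA (its mutex and lifetime counters) and the
* SAH reference it may hold, stores the reason in *errp and returns
* NULL.  On success the new SA has already been linked into the SAH
* queues and the SPI hash under the SAHTREE write lock.
*/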
/*
* Release the resources held by an SA entry (NAT-T config, keys,
* replay state and lifetimes); the secasvar itself is not freed here.
*/
static void
key_cleansav(struct secasvar *sav)
{
if (sav->natt != NULL) {
free(sav->natt, M_IPSEC_MISC);
sav->natt = NULL;
}
if (sav->flags & SADB_X_EXT_F_CLONED)
return;
/*
* Clean up xform state. Note that zeroizing causes the
* keys to be cleared; otherwise we must do it ourselves.
*/
if (sav->tdb_xform != NULL) {
sav->tdb_xform->xf_zeroize(sav);
sav->tdb_xform = NULL;
} else {
if (sav->key_auth != NULL)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
if (sav->key_enc != NULL)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
}
if (sav->key_auth != NULL) {
if (sav->key_auth->key_data != NULL)
free(sav->key_auth->key_data, M_IPSEC_MISC);
free(sav->key_auth, M_IPSEC_MISC);
sav->key_auth = NULL;
}
if (sav->key_enc != NULL) {
if (sav->key_enc->key_data != NULL)
free(sav->key_enc->key_data, M_IPSEC_MISC);
free(sav->key_enc, M_IPSEC_MISC);
sav->key_enc = NULL;
}
if (sav->replay != NULL) {
if (sav->replay->bitmap != NULL)
free(sav->replay->bitmap, M_IPSEC_MISC);
free(sav->replay, M_IPSEC_MISC);
sav->replay = NULL;
}
if (sav->lft_h != NULL) {
free(sav->lft_h, M_IPSEC_MISC);
sav->lft_h = NULL;
}
if (sav->lft_s != NULL) {
free(sav->lft_s, M_IPSEC_MISC);
sav->lft_s = NULL;
}
}
/*
* free() an SA entry; it must be DEAD and unreferenced.
*/
static void
key_delsav(struct secasvar *sav)
{
IPSEC_ASSERT(sav != NULL, ("null sav"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_DEAD,
("attempt to free non DEAD SA %p", sav));
IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0",
sav->refcnt));
/*
* SA must be unlinked from the chain and hashtbl.
* If SA was cloned, we leave all fields untouched,
* except NAT-T config.
*/
key_cleansav(sav);
if ((sav->flags & SADB_X_EXT_F_CLONED) == 0) {
mtx_destroy(sav->lock);
free(sav->lock, M_IPSEC_MISC);
uma_zfree(V_key_lft_zone, sav->lft_c);
}
free(sav, M_IPSEC_SA);
}
/*
* Search for an SAH that matches the given saidx.
* OUT:
* NULL : not found
* others : found, referenced pointer to a SAH.
*/
static struct secashead *
key_getsah(struct secasindex *saidx)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID) != 0) {
SAH_ADDREF(sah);
break;
}
}
SAHTREE_RUNLOCK();
return (sah);
}
/*
* Check whether the given SPI is already in use.
* OUT:
* 0 : not found
* 1 : found SA with given SPI.
*/
static int
key_checkspidup(uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
/* Assume SPI is in network byte order */
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi == spi)
break;
}
SAHTREE_RUNLOCK();
return (sav != NULL);
}
/*
* Search SA by SPI.
* OUT:
* NULL : not found
* others : found, referenced pointer to a SA.
*/
static struct secasvar *
key_getsavbyspi(uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
/* Assume SPI is in network byte order */
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi != spi)
continue;
SAV_ADDREF(sav);
break;
}
SAHTREE_RUNLOCK();
return (sav);
}
static int
key_updatelifetimes(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
struct seclifetime *lft_h, *lft_s, *tmp;
/* Lifetime extension is optional, check that it is present. */
if (SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) {
/*
* In case of SADB_UPDATE we may need to change
* existing lifetimes.
*/
if (sav->state == SADB_SASTATE_MATURE) {
lft_h = lft_s = NULL;
goto reset;
}
return (0);
}
/* Both HARD and SOFT extensions must be present */
if ((SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return (EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_HARD) ||
SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_SOFT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return (EINVAL);
}
lft_h = key_dup_lifemsg((const struct sadb_lifetime *)
mhp->ext[SADB_EXT_LIFETIME_HARD], M_IPSEC_MISC);
if (lft_h == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
lft_s = key_dup_lifemsg((const struct sadb_lifetime *)
mhp->ext[SADB_EXT_LIFETIME_SOFT], M_IPSEC_MISC);
if (lft_s == NULL) {
PFKEYSTAT_INC(in_nomem);
free(lft_h, M_IPSEC_MISC);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
reset:
if (sav->state != SADB_SASTATE_LARVAL) {
/*
* key_update() holds reference to this SA,
* so it won't be deleted in meanwhile.
*/
SECASVAR_LOCK(sav);
tmp = sav->lft_h;
sav->lft_h = lft_h;
lft_h = tmp;
tmp = sav->lft_s;
sav->lft_s = lft_s;
lft_s = tmp;
SECASVAR_UNLOCK(sav);
if (lft_h != NULL)
free(lft_h, M_IPSEC_MISC);
if (lft_s != NULL)
free(lft_s, M_IPSEC_MISC);
return (0);
}
/* We can update lifetime without holding a lock */
IPSEC_ASSERT(sav->lft_h == NULL, ("lft_h is already initialized\n"));
IPSEC_ASSERT(sav->lft_s == NULL, ("lft_s is already initialized\n"));
sav->lft_h = lft_h;
sav->lft_s = lft_s;
return (0);
}
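/*
* Note on the update path above: for MATURE/DYING SAs the new HARD
* and SOFT lifetimes are swapped in under SECASVAR_LOCK and the old
* structures are freed only after the lock is dropped; for LARVAL
* SAs the lifetime pointers are still expected to be NULL, so they
* are set without locking.
*/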
/*
* Copy SA values from a PF_KEY message, except *SPI, SEQ, PID and TYPE*.
* The caller must update these if needed. Expects only LARVAL SAs.
* OUT: 0: success.
* !0: failure.
*/
static int
key_setsaval(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
const struct sadb_sa *sa0;
const struct sadb_key *key0;
uint32_t replay;
size_t len;
int error;
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_LARVAL,
("Attempt to update non LARVAL SA"));
/* XXX rewrite */
error = key_setident(sav->sah, mhp);
if (error != 0)
goto fail;
/* SA */
if (!SADB_CHECKHDR(mhp, SADB_EXT_SA)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_SA)) {
error = EINVAL;
goto fail;
}
sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
sav->alg_auth = sa0->sadb_sa_auth;
sav->alg_enc = sa0->sadb_sa_encrypt;
sav->flags = sa0->sadb_sa_flags;
if ((sav->flags & SADB_KEY_FLAGS_MAX) != sav->flags) {
ipseclog((LOG_DEBUG,
"%s: invalid sa_flags 0x%08x.\n", __func__,
sav->flags));
error = EINVAL;
goto fail;
}
/* Optional replay window */
replay = 0;
if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0)
replay = sa0->sadb_sa_replay;
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_SA_REPLAY)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA_REPLAY)) {
error = EINVAL;
goto fail;
}
replay = ((const struct sadb_x_sa_replay *)
mhp->ext[SADB_X_EXT_SA_REPLAY])->sadb_x_sa_replay_replay;
if (replay > UINT32_MAX - 32) {
ipseclog((LOG_DEBUG,
"%s: replay window too big.\n", __func__));
error = EINVAL;
goto fail;
}
replay = (replay + 7) >> 3;
}
sav->replay = malloc(sizeof(struct secreplay), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->replay == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
if (replay != 0) {
/* number of 32b blocks to be allocated */
uint32_t bitmap_size;
/* RFC 6479:
* - the allocated replay window size must be
* a power of two.
* - use an extra 32b block as a redundant window.
*/
bitmap_size = 1;
while (replay + 4 > bitmap_size)
bitmap_size <<= 1;
bitmap_size = bitmap_size / 4;
sav->replay->bitmap = malloc(
bitmap_size * sizeof(uint32_t), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->replay->bitmap == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
sav->replay->bitmap_size = bitmap_size;
sav->replay->wsize = replay;
}
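/*
* Worked example (illustrative only, numbers not taken from the
* code above): a peer requesting a 128-bit replay window sends
* sadb_x_sa_replay_replay = 128, so replay = (128 + 7) >> 3 = 16
* bytes.  The loop grows bitmap_size from 1 until it reaches at
* least replay + 4 = 20, i.e. to 32, and the division by 4 yields
* 8 32-bit words.  The bitmap therefore covers 256 bits, a power
* of two plus the extra redundant block required by RFC 6479,
* while sav->replay->wsize records the window size in bytes (16).
*/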
}
/* Authentication keys */
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_KEY_AUTH)) {
error = EINVAL;
goto fail;
}
error = 0;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH];
len = mhp->extlen[SADB_EXT_KEY_AUTH];
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_TCPSIGNATURE:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_auth != SADB_X_AALG_NULL)
error = EINVAL;
break;
case SADB_X_SATYPE_IPCOMP:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n",
__func__));
goto fail;
}
sav->key_auth = key_dup_keymsg(key0, len, M_IPSEC_MISC);
if (sav->key_auth == NULL ) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
error = ENOBUFS;
goto fail;
}
}
/* Encryption key */
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_KEY_ENCRYPT)) {
error = EINVAL;
goto fail;
}
error = 0;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT];
len = mhp->extlen[SADB_EXT_KEY_ENCRYPT];
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_ESP:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_enc != SADB_EALG_NULL) {
error = EINVAL;
break;
}
sav->key_enc = key_dup_keymsg(key0, len, M_IPSEC_MISC);
if (sav->key_enc == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
error = ENOBUFS;
goto fail;
}
break;
case SADB_X_SATYPE_IPCOMP:
if (len != PFKEY_ALIGN8(sizeof(struct sadb_key)))
error = EINVAL;
sav->key_enc = NULL; /*just in case*/
break;
case SADB_SATYPE_AH:
case SADB_X_SATYPE_TCPSIGNATURE:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n",
__func__));
goto fail;
}
}
/* set iv */
sav->ivlen = 0;
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
if (sav->flags & SADB_X_EXT_DERIV) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to AH SA.\n", __func__));
error = EINVAL;
goto fail;
}
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_AH);
break;
case SADB_SATYPE_ESP:
if ((sav->flags & (SADB_X_EXT_OLD | SADB_X_EXT_DERIV)) ==
(SADB_X_EXT_OLD | SADB_X_EXT_DERIV)) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to old-esp.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_ESP);
break;
case SADB_X_SATYPE_IPCOMP:
if (sav->alg_auth != SADB_AALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 &&
ntohl(sav->spi) >= 0x10000) {
ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n",
__func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_IPCOMP);
break;
case SADB_X_SATYPE_TCPSIGNATURE:
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_TCPSIGNATURE);
break;
default:
ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__));
error = EPROTONOSUPPORT;
goto fail;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n",
__func__, mhp->msg->sadb_msg_satype));
goto fail;
}
/* Handle NAT-T headers */
error = key_setnatt(sav, mhp);
if (error != 0)
goto fail;
/* Initialize lifetime for CURRENT */
sav->firstused = 0;
sav->created = time_second;
/* lifetimes for HARD and SOFT */
error = key_updatelifetimes(sav, mhp);
if (error == 0)
return (0);
fail:
key_cleansav(sav);
return (error);
}
/*
* subroutine for SADB_GET and SADB_DUMP.
*/
static struct mbuf *
key_setdumpsa(struct secasvar *sav, uint8_t type, uint8_t satype,
uint32_t seq, uint32_t pid)
{
struct seclifetime lft_c;
struct mbuf *result = NULL, *tres = NULL, *m;
int i, dumporder[] = {
SADB_EXT_SA, SADB_X_EXT_SA2, SADB_X_EXT_SA_REPLAY,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY,
SADB_EXT_KEY_AUTH, SADB_EXT_KEY_ENCRYPT,
SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST,
SADB_EXT_SENSITIVITY,
SADB_X_EXT_NAT_T_TYPE,
SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT,
SADB_X_EXT_NAT_T_OAI, SADB_X_EXT_NAT_T_OAR,
SADB_X_EXT_NAT_T_FRAG,
};
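/*
* The loop below walks dumporder[] from the last entry to the first
* and prepends each newly built extension in front of the ones
* accumulated so far (m_cat(m, tres)), so the finished chain ends
* up in dumporder[] order right after the base sadb_msg.
*/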
uint32_t replay_count;
m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt);
if (m == NULL)
goto fail;
result = m;
for (i = nitems(dumporder) - 1; i >= 0; i--) {
m = NULL;
switch (dumporder[i]) {
case SADB_EXT_SA:
m = key_setsadbsa(sav);
if (!m)
goto fail;
break;
case SADB_X_EXT_SA2:
SECASVAR_LOCK(sav);
replay_count = sav->replay ? sav->replay->count : 0;
SECASVAR_UNLOCK(sav);
m = key_setsadbxsa2(sav->sah->saidx.mode, replay_count,
sav->sah->saidx.reqid);
if (!m)
goto fail;
break;
case SADB_X_EXT_SA_REPLAY:
if (sav->replay == NULL ||
sav->replay->wsize <= UINT8_MAX)
continue;
m = key_setsadbxsareplay(sav->replay->wsize);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_SRC:
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_DST:
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_AUTH:
if (!sav->key_auth)
continue;
m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_ENCRYPT:
if (!sav->key_enc)
continue;
m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_CURRENT:
lft_c.addtime = sav->created;
lft_c.allocations = (uint32_t)counter_u64_fetch(
sav->lft_c_allocations);
lft_c.bytes = counter_u64_fetch(sav->lft_c_bytes);
lft_c.usetime = sav->firstused;
m = key_setlifetime(&lft_c, SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_HARD:
if (!sav->lft_h)
continue;
m = key_setlifetime(sav->lft_h,
SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_SOFT:
if (!sav->lft_s)
continue;
m = key_setlifetime(sav->lft_s,
SADB_EXT_LIFETIME_SOFT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_TYPE:
if (sav->natt == NULL)
continue;
m = key_setsadbxtype(UDP_ENCAP_ESPINUDP);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_DPORT:
if (sav->natt == NULL)
continue;
m = key_setsadbxport(sav->natt->dport,
SADB_X_EXT_NAT_T_DPORT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_SPORT:
if (sav->natt == NULL)
continue;
m = key_setsadbxport(sav->natt->sport,
SADB_X_EXT_NAT_T_SPORT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_OAI:
if (sav->natt == NULL ||
(sav->natt->flags & IPSEC_NATT_F_OAI) == 0)
continue;
m = key_setsadbaddr(SADB_X_EXT_NAT_T_OAI,
&sav->natt->oai.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_OAR:
if (sav->natt == NULL ||
(sav->natt->flags & IPSEC_NATT_F_OAR) == 0)
continue;
m = key_setsadbaddr(SADB_X_EXT_NAT_T_OAR,
&sav->natt->oar.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_FRAG:
/* We do not (yet) support those. */
continue;
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
/* XXX: should these be brought from the SPD? */
case SADB_EXT_SENSITIVITY:
default:
continue;
}
if (!m)
goto fail;
if (tres)
m_cat(m, tres);
tres = m;
}
m_cat(result, tres);
tres = NULL;
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
m_freem(tres);
return NULL;
}
/*
* set data into sadb_msg.
*/
static struct mbuf *
key_setsadbmsg(u_int8_t type, u_int16_t tlen, u_int8_t satype, u_int32_t seq,
pid_t pid, u_int16_t reserved)
{
struct mbuf *m;
struct sadb_msg *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
if (len > MCLBYTES)
return NULL;
MGETHDR(m, M_NOWAIT, MT_DATA);
if (m && len > MHLEN) {
if (!(MCLGET(m, M_NOWAIT))) {
m_freem(m);
m = NULL;
}
}
if (!m)
return NULL;
m->m_pkthdr.len = m->m_len = len;
m->m_next = NULL;
p = mtod(m, struct sadb_msg *);
bzero(p, len);
p->sadb_msg_version = PF_KEY_V2;
p->sadb_msg_type = type;
p->sadb_msg_errno = 0;
p->sadb_msg_satype = satype;
p->sadb_msg_len = PFKEY_UNIT64(tlen);
p->sadb_msg_reserved = reserved;
p->sadb_msg_seq = seq;
p->sadb_msg_pid = (u_int32_t)pid;
return m;
}
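/*
* Length-unit note (illustrative): PF_KEY expresses lengths in
* 8-byte units, so these helpers pair PFKEY_ALIGN8() (round a byte
* count up to a multiple of 8) with PFKEY_UNIT64() (convert bytes
* to 8-byte units).  For example, struct sadb_msg is 16 bytes, so
* above len = PFKEY_ALIGN8(16) = 16, and a complete message of,
* say, 136 bytes is advertised as sadb_msg_len =
* PFKEY_UNIT64(136) = 17.
*/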
/*
* copy secasvar data into sadb_sa.
*/
static struct mbuf *
key_setsadbsa(struct secasvar *sav)
{
struct mbuf *m;
struct sadb_sa *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_sa));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_sa *);
bzero(p, len);
p->sadb_sa_len = PFKEY_UNIT64(len);
p->sadb_sa_exttype = SADB_EXT_SA;
p->sadb_sa_spi = sav->spi;
p->sadb_sa_replay = sav->replay ?
(sav->replay->wsize > UINT8_MAX ? UINT8_MAX :
sav->replay->wsize): 0;
p->sadb_sa_state = sav->state;
p->sadb_sa_auth = sav->alg_auth;
p->sadb_sa_encrypt = sav->alg_enc;
p->sadb_sa_flags = sav->flags & SADB_KEY_FLAGS_MAX;
return (m);
}
/*
* set data into sadb_address.
*/
static struct mbuf *
key_setsadbaddr(u_int16_t exttype, const struct sockaddr *saddr,
u_int8_t prefixlen, u_int16_t ul_proto)
{
struct mbuf *m;
struct sadb_address *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_address)) +
PFKEY_ALIGN8(saddr->sa_len);
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_address *);
bzero(p, len);
p->sadb_address_len = PFKEY_UNIT64(len);
p->sadb_address_exttype = exttype;
p->sadb_address_proto = ul_proto;
if (prefixlen == FULLMASK) {
switch (saddr->sa_family) {
case AF_INET:
prefixlen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
prefixlen = sizeof(struct in6_addr) << 3;
break;
default:
; /*XXX*/
}
}
p->sadb_address_prefixlen = prefixlen;
p->sadb_address_reserved = 0;
bcopy(saddr,
mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)),
saddr->sa_len);
return m;
}
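/*
* Usage note (illustrative): passing prefixlen == FULLMASK requests
* a host mask, which the code above expands to the full address
* length in bits, so for example
* key_setsadbaddr(SADB_EXT_ADDRESS_SRC, sa, FULLMASK, IPSEC_ULPROTO_ANY)
* yields sadb_address_prefixlen == 32 for an AF_INET sockaddr and
* 128 for AF_INET6.
*/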
/*
* set data into sadb_x_sa2.
*/
static struct mbuf *
key_setsadbxsa2(u_int8_t mode, u_int32_t seq, u_int32_t reqid)
{
struct mbuf *m;
struct sadb_x_sa2 *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_sa2 *);
bzero(p, len);
p->sadb_x_sa2_len = PFKEY_UNIT64(len);
p->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
p->sadb_x_sa2_mode = mode;
p->sadb_x_sa2_reserved1 = 0;
p->sadb_x_sa2_reserved2 = 0;
p->sadb_x_sa2_sequence = seq;
p->sadb_x_sa2_reqid = reqid;
return m;
}
/*
* Set data into sadb_x_sa_replay.
*/
static struct mbuf *
key_setsadbxsareplay(u_int32_t replay)
{
struct mbuf *m;
struct sadb_x_sa_replay *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa_replay));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_sa_replay *);
bzero(p, len);
p->sadb_x_sa_replay_len = PFKEY_UNIT64(len);
p->sadb_x_sa_replay_exttype = SADB_X_EXT_SA_REPLAY;
p->sadb_x_sa_replay_replay = (replay << 3);
return m;
}
/*
* Set a type in sadb_x_nat_t_type.
*/
static struct mbuf *
key_setsadbxtype(u_int16_t type)
{
struct mbuf *m;
size_t len;
struct sadb_x_nat_t_type *p;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_type));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_nat_t_type *);
bzero(p, len);
p->sadb_x_nat_t_type_len = PFKEY_UNIT64(len);
p->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
p->sadb_x_nat_t_type_type = type;
return (m);
}
/*
* Set a port in sadb_x_nat_t_port.
* In contrast to default RFC 2367 behaviour, port is in network byte order.
*/
static struct mbuf *
key_setsadbxport(u_int16_t port, u_int16_t type)
{
struct mbuf *m;
size_t len;
struct sadb_x_nat_t_port *p;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_port));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_nat_t_port *);
bzero(p, len);
p->sadb_x_nat_t_port_len = PFKEY_UNIT64(len);
p->sadb_x_nat_t_port_exttype = type;
p->sadb_x_nat_t_port_port = port;
return (m);
}
/*
* Get port from sockaddr. Port is in network byte order.
*/
uint16_t
key_portfromsaddr(struct sockaddr *sa)
{
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
return ((struct sockaddr_in *)sa)->sin_port;
#endif
#ifdef INET6
case AF_INET6:
return ((struct sockaddr_in6 *)sa)->sin6_port;
#endif
}
return (0);
}
/*
* Set port in struct sockaddr. Port is in network byte order.
*/
void
key_porttosaddr(struct sockaddr *sa, uint16_t port)
{
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
((struct sockaddr_in *)sa)->sin_port = port;
break;
#endif
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)sa)->sin6_port = port;
break;
#endif
default:
ipseclog((LOG_DEBUG, "%s: unexpected address family %d.\n",
__func__, sa->sa_family));
break;
}
}
/*
* set data into sadb_x_policy
*/
static struct mbuf *
key_setsadbxpolicy(u_int16_t type, u_int8_t dir, u_int32_t id, u_int32_t priority)
{
struct mbuf *m;
struct sadb_x_policy *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_policy *);
bzero(p, len);
p->sadb_x_policy_len = PFKEY_UNIT64(len);
p->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
p->sadb_x_policy_type = type;
p->sadb_x_policy_dir = dir;
p->sadb_x_policy_id = id;
p->sadb_x_policy_priority = priority;
return m;
}
/* %%% utilities */
/* Take a key message (sadb_key) from the socket and turn it into one
* of the kernel's key structures (seckey).
*
* IN: pointer to the source sadb_key, the key data length and malloc type
* OUT: NULL no more memory
*/
struct seckey *
key_dup_keymsg(const struct sadb_key *src, size_t len,
struct malloc_type *type)
{
struct seckey *dst;
dst = malloc(sizeof(*dst), type, M_NOWAIT);
if (dst != NULL) {
dst->bits = src->sadb_key_bits;
dst->key_data = malloc(len, type, M_NOWAIT);
if (dst->key_data != NULL) {
bcopy((const char *)(src + 1), dst->key_data, len);
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
free(dst, type);
dst = NULL;
}
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
}
return (dst);
}
/* Take a lifetime message (sadb_lifetime) passed in on a socket and
* turn it into one of the kernel's lifetime structures (seclifetime).
*
* IN: pointer to the source sadb_lifetime and the malloc type
* OUT: NULL, no more memory
*/
static struct seclifetime *
key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type)
{
struct seclifetime *dst;
dst = malloc(sizeof(*dst), type, M_NOWAIT);
if (dst == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (NULL);
}
dst->allocations = src->sadb_lifetime_allocations;
dst->bytes = src->sadb_lifetime_bytes;
dst->addtime = src->sadb_lifetime_addtime;
dst->usetime = src->sadb_lifetime_usetime;
return (dst);
}
/*
* Compare two secasindex structures.
* The flag selects how strict the comparison is: CMP_EXACTLY compares
* everything, including ports, while the other modes skip mode and/or
* reqid and never compare ports.
* IN:
* saidx0: source, it can be in SAD.
* saidx1: object.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpsaidx(const struct secasindex *saidx0, const struct secasindex *saidx1,
int flag)
{
/* sanity */
if (saidx0 == NULL && saidx1 == NULL)
return 1;
if (saidx0 == NULL || saidx1 == NULL)
return 0;
if (saidx0->proto != saidx1->proto)
return 0;
if (flag == CMP_EXACTLY) {
if (saidx0->mode != saidx1->mode)
return 0;
if (saidx0->reqid != saidx1->reqid)
return 0;
if (bcmp(&saidx0->src, &saidx1->src,
saidx0->src.sa.sa_len) != 0 ||
bcmp(&saidx0->dst, &saidx1->dst,
saidx0->dst.sa.sa_len) != 0)
return 0;
} else {
/* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */
if (flag == CMP_MODE_REQID || flag == CMP_REQID) {
/*
* If reqid of SPD is non-zero, unique SA is required.
* The result must be of same reqid in this case.
*/
if (saidx1->reqid != 0 &&
saidx0->reqid != saidx1->reqid)
return 0;
}
if (flag == CMP_MODE_REQID) {
if (saidx0->mode != IPSEC_MODE_ANY
&& saidx0->mode != saidx1->mode)
return 0;
}
if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, 0) != 0)
return 0;
if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, 0) != 0)
return 0;
}
return 1;
}
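/*
* Comparison modes implemented above (summary derived from the code
* rather than from a formal spec):
*
*   CMP_EXACTLY     proto, mode, reqid and a full binary compare of
*                   both sockaddrs (ports included) must all match.
*   CMP_MODE_REQID  proto and addresses (ports ignored); reqid must
*                   match only when saidx1->reqid != 0; mode must
*                   match unless saidx0->mode is IPSEC_MODE_ANY.
*   CMP_REQID       like CMP_MODE_REQID, but the mode is ignored.
*   CMP_HEAD        only proto and addresses are compared.
*/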
/*
* Compare two secpolicyindex structures exactly.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from PFKEY message.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_exactly(struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->prefs != spidx1->prefs
|| spidx0->prefd != spidx1->prefd
|| spidx0->ul_proto != spidx1->ul_proto)
return 0;
return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 &&
key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0;
}
/*
* Compare two secpolicyindex structures, applying the prefix masks.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from IP header.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_withmask(struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family ||
spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family ||
spidx0->src.sa.sa_len != spidx1->src.sa.sa_len ||
spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len)
return 0;
/* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */
if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY
&& spidx0->ul_proto != spidx1->ul_proto)
return 0;
switch (spidx0->src.sa.sa_family) {
case AF_INET:
if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->src.sin.sin_port != spidx1->src.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->src.sin.sin_addr,
&spidx1->src.sin.sin_addr, spidx0->prefs))
return 0;
break;
case AF_INET6:
if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->src.sin6.sin6_scope_id &&
spidx1->src.sin6.sin6_scope_id &&
spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->src.sin6.sin6_addr,
&spidx1->src.sin6.sin6_addr, spidx0->prefs))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0)
return 0;
break;
}
switch (spidx0->dst.sa.sa_family) {
case AF_INET:
if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->dst.sin.sin_addr,
&spidx1->dst.sin.sin_addr, spidx0->prefd))
return 0;
break;
case AF_INET6:
if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->dst.sin6.sin6_scope_id &&
spidx1->dst.sin6.sin6_scope_id &&
spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr,
&spidx1->dst.sin6.sin6_addr, spidx0->prefd))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0)
return 0;
break;
}
/* XXX Do we check other fields, e.g. flowinfo? */
return 1;
}
#ifdef satosin
#undef satosin
#endif
#define satosin(s) ((const struct sockaddr_in *)s)
#ifdef satosin6
#undef satosin6
#endif
#define satosin6(s) ((const struct sockaddr_in6 *)s)
/* returns 0 on match */
int
key_sockaddrcmp(const struct sockaddr *sa1, const struct sockaddr *sa2,
int port)
{
if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
return 1;
switch (sa1->sa_family) {
#ifdef INET
case AF_INET:
if (sa1->sa_len != sizeof(struct sockaddr_in))
return 1;
if (satosin(sa1)->sin_addr.s_addr !=
satosin(sa2)->sin_addr.s_addr) {
return 1;
}
if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port)
return 1;
break;
#endif
#ifdef INET6
case AF_INET6:
if (sa1->sa_len != sizeof(struct sockaddr_in6))
return 1; /*EINVAL*/
if (satosin6(sa1)->sin6_scope_id !=
satosin6(sa2)->sin6_scope_id) {
return 1;
}
if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr,
&satosin6(sa2)->sin6_addr)) {
return 1;
}
if (port &&
satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) {
return 1;
}
break;
#endif
default:
if (bcmp(sa1, sa2, sa1->sa_len) != 0)
return 1;
break;
}
return 0;
}
/* returns 0 on match */
int
key_sockaddrcmp_withmask(const struct sockaddr *sa1,
const struct sockaddr *sa2, size_t mask)
{
if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
return (1);
switch (sa1->sa_family) {
#ifdef INET
case AF_INET:
return (!key_bbcmp(&satosin(sa1)->sin_addr,
&satosin(sa2)->sin_addr, mask));
#endif
#ifdef INET6
case AF_INET6:
if (satosin6(sa1)->sin6_scope_id !=
satosin6(sa2)->sin6_scope_id)
return (1);
return (!key_bbcmp(&satosin6(sa1)->sin6_addr,
&satosin6(sa2)->sin6_addr, mask));
#endif
}
return (1);
}
#undef satosin
#undef satosin6
/*
* compare two buffers with mask.
* IN:
* addr1: source
* addr2: object
* bits: Number of bits to compare
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_bbcmp(const void *a1, const void *a2, u_int bits)
{
const unsigned char *p1 = a1;
const unsigned char *p2 = a2;
/* XXX: This could be considerably faster if we compared a word
* at a time, but that is complicated on little-endian machines. */
/* Handle null pointers */
if (p1 == NULL || p2 == NULL)
return (p1 == p2);
while (bits >= 8) {
if (*p1++ != *p2++)
return 0;
bits -= 8;
}
if (bits > 0) {
u_int8_t mask = ~((1<<(8-bits))-1);
if ((*p1 & mask) != (*p2 & mask))
return 0;
}
return 1; /* Match! */
}
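/*
* Worked example (illustrative only): key_bbcmp(a1, a2, 20) compares
* the first two bytes in full and then masks the third byte with
* ~((1 << (8 - 4)) - 1) == 0xf0, so only its top four bits must
* match.  The IPv4 addresses 10.1.16.5 and 10.1.31.200 therefore
* compare equal under a 20-bit prefix (both lie in 10.1.16.0/20),
* while 10.1.32.1 would not.
*/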
static void
key_flush_spd(time_t now)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy_list drainq;
struct secpolicy *sp, *nextsp;
u_int dir;
LIST_INIT(&drainq);
SPTREE_RLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
if (sp->lifetime == 0 && sp->validtime == 0)
continue;
if ((sp->lifetime &&
now - sp->created > sp->lifetime) ||
(sp->validtime &&
now - sp->lastused > sp->validtime)) {
/* Hold extra reference to send SPDEXPIRE */
SP_ADDREF(sp);
LIST_INSERT_HEAD(&drainq, sp, drainq);
}
}
}
SPTREE_RUNLOCK();
if (LIST_EMPTY(&drainq))
return;
SPTREE_WLOCK();
sp = LIST_FIRST(&drainq);
while (sp != NULL) {
nextsp = LIST_NEXT(sp, drainq);
/* Check that SP is still linked */
if (sp->state != IPSEC_SPSTATE_ALIVE) {
LIST_REMOVE(sp, drainq);
key_freesp(&sp); /* release extra reference */
sp = nextsp;
continue;
}
TAILQ_REMOVE(&V_sptree[sp->spidx.dir], sp, chain);
LIST_REMOVE(sp, idhash);
sp->state = IPSEC_SPSTATE_DEAD;
sp = nextsp;
}
V_sp_genid++;
SPTREE_WUNLOCK();
sp = LIST_FIRST(&drainq);
while (sp != NULL) {
nextsp = LIST_NEXT(sp, drainq);
key_spdexpire(sp);
key_freesp(&sp); /* release extra reference */
key_freesp(&sp); /* release last reference */
sp = nextsp;
}
}
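/*
* Both key_flush_spd() above and key_flush_sad() below follow the
* same three-phase pattern: scan the tree under the read lock and
* collect candidates on a local drain queue while taking an extra
* reference; re-check and unlink each candidate under the write
* lock (entries may have been changed by another thread in the
* meantime); and finally send the EXPIRE/SPDEXPIRE notifications
* and drop the references with no tree lock held.
*/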
static void
key_flush_sad(time_t now)
{
SAHTREE_RLOCK_TRACKER;
struct secashead_list emptyq;
struct secasvar_list drainq, hexpireq, sexpireq, freeq;
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
LIST_INIT(&drainq);
LIST_INIT(&hexpireq);
LIST_INIT(&sexpireq);
LIST_INIT(&emptyq);
SAHTREE_RLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
/* Check for empty SAH */
if (TAILQ_EMPTY(&sah->savtree_larval) &&
TAILQ_EMPTY(&sah->savtree_alive)) {
SAH_ADDREF(sah);
LIST_INSERT_HEAD(&emptyq, sah, drainq);
continue;
}
/* Add all stale LARVAL SAs into drainq */
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
if (now - sav->created < V_key_larval_lifetime)
continue;
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&drainq, sav, drainq);
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
/* lifetimes aren't specified */
if (sav->lft_h == NULL)
continue;
SECASVAR_LOCK(sav);
/*
* Check again with lock held, because it may
* be updated by SADB_UPDATE.
*/
if (sav->lft_h == NULL) {
SECASVAR_UNLOCK(sav);
continue;
}
/*
* RFC 2367:
* HARD lifetimes MUST take precedence over SOFT
* lifetimes, meaning if the HARD and SOFT lifetimes
* are the same, the HARD lifetime will appear on the
* EXPIRE message.
*/
/* check HARD lifetime */
if ((sav->lft_h->addtime != 0 &&
now - sav->created > sav->lft_h->addtime) ||
(sav->lft_h->usetime != 0 && sav->firstused &&
now - sav->firstused > sav->lft_h->usetime) ||
(sav->lft_h->bytes != 0 && counter_u64_fetch(
sav->lft_c_bytes) > sav->lft_h->bytes)) {
SECASVAR_UNLOCK(sav);
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&hexpireq, sav, drainq);
continue;
}
/* check SOFT lifetime (only for MATURE SAs) */
if (sav->state == SADB_SASTATE_MATURE && (
(sav->lft_s->addtime != 0 &&
now - sav->created > sav->lft_s->addtime) ||
(sav->lft_s->usetime != 0 && sav->firstused &&
now - sav->firstused > sav->lft_s->usetime) ||
(sav->lft_s->bytes != 0 && counter_u64_fetch(
sav->lft_c_bytes) > sav->lft_s->bytes))) {
SECASVAR_UNLOCK(sav);
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&sexpireq, sav, drainq);
continue;
}
SECASVAR_UNLOCK(sav);
}
}
SAHTREE_RUNLOCK();
if (LIST_EMPTY(&emptyq) && LIST_EMPTY(&drainq) &&
LIST_EMPTY(&hexpireq) && LIST_EMPTY(&sexpireq))
return;
LIST_INIT(&freeq);
SAHTREE_WLOCK();
/* Unlink stale LARVAL SAs */
sav = LIST_FIRST(&drainq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is still LARVAL */
if (sav->state != SADB_SASTATE_LARVAL) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sav = nextsav;
}
/* Unlink all SAs with expired HARD lifetime */
sav = LIST_FIRST(&hexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is not unlinked */
if (sav->state == SADB_SASTATE_DEAD) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sav = nextsav;
}
/* Mark all SAs with expired SOFT lifetime as DYING */
sav = LIST_FIRST(&sexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is not unlinked */
if (sav->state == SADB_SASTATE_DEAD) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
/*
* NOTE: this doesn't change SA order in the chain.
*/
sav->state = SADB_SASTATE_DYING;
sav = nextsav;
}
/* Unlink empty SAHs */
sah = LIST_FIRST(&emptyq);
while (sah != NULL) {
nextsah = LIST_NEXT(sah, drainq);
/* Check that SAH is still empty and not unlinked */
if (sah->state == SADB_SASTATE_DEAD ||
!TAILQ_EMPTY(&sah->savtree_larval) ||
!TAILQ_EMPTY(&sah->savtree_alive)) {
LIST_REMOVE(sah, drainq);
key_freesah(&sah); /* release extra reference */
sah = nextsah;
continue;
}
TAILQ_REMOVE(&V_sahtree, sah, chain);
LIST_REMOVE(sah, addrhash);
sah->state = SADB_SASTATE_DEAD;
sah = nextsah;
}
SAHTREE_WUNLOCK();
/* Send EXPIRE messages */
sav = LIST_FIRST(&hexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_expire(sav, 1);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release extra reference */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
sav = LIST_FIRST(&sexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_expire(sav, 0);
key_freesav(&sav); /* release extra reference */
sav = nextsav;
}
/* Free stale LARVAL SAs */
sav = LIST_FIRST(&drainq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release extra reference */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
/* Free SAs that were unlinked/changed by someone else */
sav = LIST_FIRST(&freeq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_freesav(&sav); /* release extra reference */
sav = nextsav;
}
/* Free empty SAH */
sah = LIST_FIRST(&emptyq);
while (sah != NULL) {
nextsah = LIST_NEXT(sah, drainq);
key_freesah(&sah); /* release extra reference */
key_freesah(&sah); /* release last reference */
sah = nextsah;
}
}
static void
key_flush_acq(time_t now)
{
struct secacq *acq, *nextacq;
/* ACQ tree */
ACQ_LOCK();
acq = LIST_FIRST(&V_acqtree);
while (acq != NULL) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime) {
LIST_REMOVE(acq, chain);
LIST_REMOVE(acq, addrhash);
LIST_REMOVE(acq, seqhash);
free(acq, M_IPSEC_SAQ);
}
acq = nextacq;
}
ACQ_UNLOCK();
}
static void
key_flush_spacq(time_t now)
{
struct secspacq *acq, *nextacq;
/* SP ACQ tree */
SPACQ_LOCK();
for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime
&& __LIST_CHAINED(acq)) {
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
}
}
SPACQ_UNLOCK();
}
/*
* Timer handler.
* Scan the SPD and SAD, check the status of each entry, and
* remove or expire entries as needed.
* XXX: the year 2038 problem may remain.
*/
static void
key_timehandler(void *arg)
{
VNET_ITERATOR_DECL(vnet_iter);
time_t now = time_second;
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
key_flush_spd(now);
key_flush_sad(now);
key_flush_acq(now);
key_flush_spacq(now);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
#ifndef IPSEC_DEBUG2
/* reschedule ourselves for the next tick */
callout_schedule(&key_timer, hz);
#endif /* IPSEC_DEBUG2 */
}
u_long
key_random()
{
u_long value;
key_randomfill(&value, sizeof(value));
return value;
}
void
key_randomfill(void *p, size_t l)
{
size_t n;
u_long v;
static int warn = 1;
n = 0;
n = (size_t)read_random(p, (u_int)l);
/* last resort */
while (n < l) {
v = random();
bcopy(&v, (u_int8_t *)p + n,
l - n < sizeof(v) ? l - n : sizeof(v));
n += sizeof(v);
if (warn) {
printf("WARNING: pseudo-random number generator "
"used for IPsec processing\n");
warn = 0;
}
}
}
/*
* map SADB_SATYPE_* to IPPROTO_*.
* SADB_SATYPE_UNSPEC is mapped to IPSEC_PROTO_ANY.
* OUT:
* 0: invalid satype.
*/
static uint8_t
key_satype2proto(uint8_t satype)
{
switch (satype) {
case SADB_SATYPE_UNSPEC:
return IPSEC_PROTO_ANY;
case SADB_SATYPE_AH:
return IPPROTO_AH;
case SADB_SATYPE_ESP:
return IPPROTO_ESP;
case SADB_X_SATYPE_IPCOMP:
return IPPROTO_IPCOMP;
case SADB_X_SATYPE_TCPSIGNATURE:
return IPPROTO_TCP;
default:
return 0;
}
/* NOTREACHED */
}
/*
* map IPPROTO_* to SADB_SATYPE_*
* OUT:
* 0: invalid protocol type.
*/
static uint8_t
key_proto2satype(uint8_t proto)
{
switch (proto) {
case IPPROTO_AH:
return SADB_SATYPE_AH;
case IPPROTO_ESP:
return SADB_SATYPE_ESP;
case IPPROTO_IPCOMP:
return SADB_X_SATYPE_IPCOMP;
case IPPROTO_TCP:
return SADB_X_SATYPE_TCPSIGNATURE;
default:
return 0;
}
/* NOTREACHED */
}
/* %%% PF_KEY */
/*
* SADB_GETSPI processing: receive
* <base, (SA2), src address, dst address, (SPI range)>
* from the IKMPd, assign a unique SPI value, hang the new SA on the
* tree in the LARVAL state, and send
* <base, SA(*), address(SD)>
* back to the IKMPd.
*
* IN: mhp: pointer to the pointer to each header.
* OUT: 0 on success, otherwise an error code; the reply (or an error
* message) is sent back to the IKMPd.
*/
static int
key_getspi(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct secasvar *sav;
uint32_t reqid, spi;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST)
#ifdef PFKEY_STRICT_CHECKS
|| SADB_CHECKHDR(mhp, SADB_EXT_SPIRANGE)
#endif
) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
error = EINVAL;
goto fail;
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)
#ifdef PFKEY_STRICT_CHECKS
|| SADB_CHECKLEN(mhp, SADB_EXT_SPIRANGE)
#endif
) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
error = EINVAL;
goto fail;
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
error = EINVAL;
goto fail;
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
error = EINVAL;
goto fail;
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
error = EINVAL;
goto fail;
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* SPI allocation */
spi = key_do_getnewspi(
(struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], &saidx);
if (spi == 0) {
/*
* Requested SPI or SPI range is not available or
* already used.
*/
error = EEXIST;
goto fail;
}
sav = key_newsav(mhp, &saidx, spi, &error);
if (sav == NULL)
goto fail;
if (sav->seq != 0) {
/*
* RFC2367:
* If the SADB_GETSPI message is in response to a
* kernel-generated SADB_ACQUIRE, the sadb_msg_seq
* MUST be the same as the SADB_ACQUIRE message.
*
* XXXAE: However it doesn't define how to check this, what to do
* if it doesn't match, or what to do if it does match.
*
* We could compare the saidx used in SADB_ACQUIRE with the saidx
* used in SADB_GETSPI, but that could break existing software.
* For now just warn if it doesn't match.
*
* XXXAE: anyway, it looks useless.
*/
key_acqdone(&saidx, sav->seq);
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
{
struct mbuf *n, *nn;
struct sadb_sa *m_sa;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) +
PFKEY_ALIGN8(sizeof(struct sadb_sa));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n) {
error = ENOBUFS;
goto fail;
}
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off);
m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa));
m_sa->sadb_sa_exttype = SADB_EXT_SA;
m_sa->sadb_sa_spi = spi; /* SPI is already in network byte order */
off += PFKEY_ALIGN8(sizeof(struct sadb_sa));
IPSEC_ASSERT(off == len,
("length inconsistency (off %u len %u)", off, len));
n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST);
if (!n->m_next) {
m_freem(n);
error = ENOBUFS;
goto fail;
}
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_seq = sav->seq;
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
fail:
return (key_senderror(so, m, error));
}
/*
* Allocate a new SPI; called by key_getspi().
* OUT:
* 0: failure.
* others: success, SPI in network byte order.
*/
static uint32_t
key_do_getnewspi(struct sadb_spirange *spirange, struct secasindex *saidx)
{
uint32_t min, max, newspi, t;
int count = V_key_spi_trycnt;
/* set spi range to allocate */
if (spirange != NULL) {
min = spirange->sadb_spirange_min;
max = spirange->sadb_spirange_max;
} else {
min = V_key_spi_minval;
max = V_key_spi_maxval;
}
/* IPCOMP needs 2-byte SPI */
if (saidx->proto == IPPROTO_IPCOMP) {
if (min >= 0x10000)
min = 0xffff;
if (max >= 0x10000)
max = 0xffff;
if (min > max) {
t = min; min = max; max = t;
}
}
if (min == max) {
if (!key_checkspidup(htonl(min))) {
ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n",
__func__, min));
return 0;
}
count--; /* taking one cost. */
newspi = min;
} else {
/* init SPI */
newspi = 0;
/* an SPI range was requested; pick a random value within it */
while (count--) {
/* generate a pseudo-random SPI value within the range. */
newspi = min + (key_random() % (max - min + 1));
if (!key_checkspidup(htonl(newspi)))
break;
}
if (count == 0 || newspi == 0) {
ipseclog((LOG_DEBUG,
"%s: failed to allocate SPI.\n", __func__));
return 0;
}
}
/* statistics */
keystat.getspi_count =
(keystat.getspi_count + V_key_spi_trycnt - count) / 2;
return (htonl(newspi));
}
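/*
* Example (illustrative only): with no SADB_EXT_SPIRANGE supplied,
* the allocator draws up to V_key_spi_trycnt random values from
* [V_key_spi_minval, V_key_spi_maxval] and keeps the first one for
* which key_checkspidup(htonl(newspi)) finds no existing SA.  The
* result is returned already converted to network byte order, so
* key_getspi() can store it directly into sav->spi and into the
* sadb_sa_spi field of its reply.
*/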
/*
* Find TCP-MD5 SA with corresponding secasindex.
* If not found, return NULL and fill SPI with usable value if needed.
*/
static struct secasvar *
key_getsav_tcpmd5(struct secasindex *saidx, uint32_t *spi)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx->proto == IPPROTO_TCP, ("wrong proto"));
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (sah->saidx.proto != IPPROTO_TCP)
continue;
if (!key_sockaddrcmp(&saidx->dst.sa, &sah->saidx.dst.sa, 0) &&
!key_sockaddrcmp(&saidx->src.sa, &sah->saidx.src.sa, 0))
break;
}
if (sah != NULL) {
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL) {
SAV_ADDREF(sav);
SAHTREE_RUNLOCK();
return (sav);
}
}
if (spi == NULL) {
/* No SPI required */
SAHTREE_RUNLOCK();
return (NULL);
}
/* Check that SPI is unique */
LIST_FOREACH(sav, SAVHASH_HASH(*spi), spihash) {
if (sav->spi == *spi)
break;
}
if (sav == NULL) {
SAHTREE_RUNLOCK();
/* SPI is already unique */
return (NULL);
}
SAHTREE_RUNLOCK();
/* XXX: not optimal */
*spi = key_do_getnewspi(NULL, saidx);
return (NULL);
}
static int
key_updateaddresses(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp, struct secasvar *sav,
struct secasindex *saidx)
{
struct sockaddr *newaddr;
struct secashead *sah;
struct secasvar *newsav, *tmp;
struct mbuf *n;
int error, isnew;
/* Check that we need to change SAH */
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC)) {
newaddr = (struct sockaddr *)(
((struct sadb_address *)
mhp->ext[SADB_X_EXT_NEW_ADDRESS_SRC]) + 1);
bcopy(newaddr, &saidx->src, newaddr->sa_len);
key_porttosaddr(&saidx->src.sa, 0);
}
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST)) {
newaddr = (struct sockaddr *)(
((struct sadb_address *)
mhp->ext[SADB_X_EXT_NEW_ADDRESS_DST]) + 1);
bcopy(newaddr, &saidx->dst, newaddr->sa_len);
key_porttosaddr(&saidx->dst.sa, 0);
}
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST)) {
error = key_checksockaddrs(&saidx->src.sa, &saidx->dst.sa);
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid new sockaddr.\n",
__func__));
return (error);
}
sah = key_getsah(saidx);
if (sah == NULL) {
/* create a new SA index */
sah = key_newsah(saidx);
if (sah == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
return (ENOBUFS);
}
isnew = 2; /* SAH is new */
} else
isnew = 1; /* existing SAH is referenced */
} else {
/*
* src and dst addresses are still the same.
* Do we want to change NAT-T config?
*/
if (sav->sah->saidx.proto != IPPROTO_ESP ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_SPORT) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_DPORT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return (EINVAL);
}
/* We hold reference to SA, thus SAH will be referenced too. */
sah = sav->sah;
isnew = 0;
}
newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA,
M_NOWAIT | M_ZERO);
if (newsav == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
error = ENOBUFS;
goto fail;
}
/* Clone SA's content into newsav */
SAV_INITREF(newsav);
bcopy(sav, newsav, offsetof(struct secasvar, chain));
/*
* We create a new NAT-T config if it is needed.
* The old NAT-T config will be freed by key_cleansav() when
* the last reference to the SA is released.
*/
newsav->natt = NULL;
newsav->sah = sah;
newsav->state = SADB_SASTATE_MATURE;
error = key_setnatt(newsav, mhp);
if (error != 0)
goto fail;
SAHTREE_WLOCK();
/* Check that SA is still alive */
if (sav->state == SADB_SASTATE_DEAD) {
/* SA was unlinked */
SAHTREE_WUNLOCK();
error = ESRCH;
goto fail;
}
/* Unlink SA from SAH and SPI hash */
IPSEC_ASSERT((sav->flags & SADB_X_EXT_F_CLONED) == 0,
("SA is already cloned"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_MATURE ||
sav->state == SADB_SASTATE_DYING,
("Wrong SA state %u\n", sav->state));
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
/*
* Link the new SA with the SAH. Keep SAs ordered by
* creation time (newest first).
*/
TAILQ_FOREACH(tmp, &sah->savtree_alive, chain) {
if (newsav->created > tmp->created) {
TAILQ_INSERT_BEFORE(tmp, newsav, chain);
break;
}
}
if (tmp == NULL)
TAILQ_INSERT_TAIL(&sah->savtree_alive, newsav, chain);
/* Add new SA into SPI hash. */
LIST_INSERT_HEAD(SAVHASH_HASH(newsav->spi), newsav, spihash);
/* Add new SAH into SADB. */
if (isnew == 2) {
TAILQ_INSERT_HEAD(&V_sahtree, sah, chain);
LIST_INSERT_HEAD(SAHADDRHASH_HASH(saidx), sah, addrhash);
sah->state = SADB_SASTATE_MATURE;
SAH_ADDREF(sah); /* newsav references new SAH */
}
/*
* isnew == 1 -> @sah was referenced by key_getsah().
* isnew == 0 -> we reuse the same @sah that was used by @sav,
* and transfer its reference to @newsav.
*/
SECASVAR_LOCK(sav);
/* XXX: replace cntr with pointer? */
newsav->cntr = sav->cntr;
sav->flags |= SADB_X_EXT_F_CLONED;
SECASVAR_UNLOCK(sav);
SAHTREE_WUNLOCK();
KEYDBG(KEY_STAMP,
printf("%s: SA(%p) cloned into SA(%p)\n",
__func__, sav, newsav));
KEYDBG(KEY_DATA, kdebug_secasv(newsav));
key_freesav(&sav); /* release last reference */
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
m_freem(m);
key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
return (0);
fail:
if (isnew != 0)
key_freesah(&sah);
if (newsav != NULL) {
if (newsav->natt != NULL)
free(newsav->natt, M_IPSEC_MISC);
free(newsav, M_IPSEC_SA);
}
return (error);
}
/*
* SADB_UPDATE processing
* receive
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd, update a secasvar entry whose status is
* SADB_SASTATE_LARVAL, and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_update(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint32_t reqid;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/*
* Only SADB_SASTATE_MATURE SAs may be submitted in an
* SADB_UPDATE message.
*/
if (sa0->sadb_sa_state != SADB_SASTATE_MATURE) {
ipseclog((LOG_DEBUG, "%s: invalid state.\n", __func__));
#ifdef PFKEY_STRICT_CHECKS
return key_senderror(so, m, EINVAL);
#endif
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found for SPI %u\n",
__func__, ntohl(sa0->sadb_sa_spi)));
return key_senderror(so, m, EINVAL);
}
/*
* Check that SADB_UPDATE was issued by the same process that did
* SADB_GETSPI or SADB_ADD.
*/
if (sav->pid != mhp->msg->sadb_msg_pid) {
ipseclog((LOG_DEBUG,
"%s: pid mismatched (SPI %u, pid %u vs. %u)\n", __func__,
ntohl(sav->spi), sav->pid, mhp->msg->sadb_msg_pid));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/* saidx should match with SA. */
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_MODE_REQID) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u",
__func__, ntohl(sav->spi)));
key_freesav(&sav);
return key_senderror(so, m, ESRCH);
}
if (sav->state == SADB_SASTATE_LARVAL) {
if ((mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT)) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/*
* We can set any values except src, dst and SPI.
*/
error = key_setsaval(sav, mhp);
if (error != 0) {
key_freesav(&sav);
return (key_senderror(so, m, error));
}
/* Change SA state to MATURE */
SAHTREE_WLOCK();
if (sav->state != SADB_SASTATE_LARVAL) {
/* SA was deleted or another thread made it MATURE. */
SAHTREE_WUNLOCK();
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
/*
* NOTE: we keep SAs in savtree_alive ordered by creation
* time. When an SA's state changes from LARVAL to MATURE,
* we update its creation time in key_setsaval() and move
* it to the head of savtree_alive.
*/
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
TAILQ_INSERT_HEAD(&sav->sah->savtree_alive, sav, chain);
sav->state = SADB_SASTATE_MATURE;
SAHTREE_WUNLOCK();
} else {
/*
* For DYING and MATURE SAs we can change only the state
* and lifetimes. Report EINVAL if anything else is being
* changed.
*/
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT) ||
!SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH)) {
key_freesav(&sav);
return (key_senderror(so, m, EINVAL));
}
error = key_updatelifetimes(sav, mhp);
if (error != 0) {
key_freesav(&sav);
return (key_senderror(so, m, error));
}
/*
* This is a FreeBSD extension to RFC 2367.
* IKEd can specify SADB_X_EXT_NEW_ADDRESS_SRC and/or
* SADB_X_EXT_NEW_ADDRESS_DST when it wants to change the
* SA addresses (for example, to implement the MOBIKE protocol
* described in RFC 4555). We also allow the NAT-T
* configuration to be changed.
*/
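/*
* Illustrative example (not defined by RFC 2367): an IKE daemon moving
* an SA to a new address pair would send an SADB_UPDATE along the lines
* of
* <base, SA, address(SD), SADB_X_EXT_NEW_ADDRESS_SRC,
* SADB_X_EXT_NEW_ADDRESS_DST(, SADB_X_EXT_NAT_T_TYPE,
* SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT)>
* which is handled by key_updateaddresses() below.
*/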
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) ||
sav->natt != NULL) {
error = key_updateaddresses(so, m, mhp, sav, &saidx);
key_freesav(&sav);
if (error != 0)
return (key_senderror(so, m, error));
return (0);
}
/* Check that SA is still alive */
SAHTREE_WLOCK();
if (sav->state == SADB_SASTATE_DEAD) {
/* SA was unlinked */
SAHTREE_WUNLOCK();
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
/*
* NOTE: there is possible state moving from DYING to MATURE,
* but this doesn't change created time, so we won't reorder
* this SA.
*/
sav->state = SADB_SASTATE_MATURE;
SAHTREE_WUNLOCK();
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
key_freesav(&sav);
{
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_ADD processing
* add an entry to the SA database when we receive
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd,
* and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* IGNORE identity and sensitivity messages.
*
* m will always be freed.
*/
static int
key_add(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint32_t reqid, spi;
uint8_t mode, proto;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && (
SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT) ||
SADB_CHECKLEN(mhp, SADB_EXT_KEY_ENCRYPT))) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && (
SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH) ||
SADB_CHECKLEN(mhp, SADB_EXT_KEY_AUTH))) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
/*
* Only SADB_SASTATE_MATURE SAs may be submitted in an
* SADB_ADD message.
*/
if (sa0->sadb_sa_state != SADB_SASTATE_MATURE) {
ipseclog((LOG_DEBUG, "%s: invalid state.\n", __func__));
#ifdef PFKEY_STRICT_CHECKS
return key_senderror(so, m, EINVAL);
#endif
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
spi = sa0->sadb_sa_spi;
/*
* For TCP-MD5 SAs we don't use the SPI; check uniqueness using
* the secasindex.
* XXXAE: IPComp also seems not to use the SPI.
*/
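/*
* Sketch of the TCP-MD5 case: key_getsav_tcpmd5() looks the SA up by
* secasindex and, when no SA exists, may hand back a kernel-chosen SPI
* through *spi; a returned spi of 0 means no SPI could be allocated.
*/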
if (proto == IPPROTO_TCP) {
sav = key_getsav_tcpmd5(&saidx, &spi);
if (sav == NULL && spi == 0) {
/* Failed to allocate SPI */
ipseclog((LOG_DEBUG, "%s: SA already exists.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
/* XXX: the SPI that we report back may have a different value */
} else {
/* We can create new SA only if SPI is different. */
sav = key_getsavbyspi(spi);
}
if (sav != NULL) {
key_freesav(&sav);
ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__));
return key_senderror(so, m, EEXIST);
}
sav = key_newsav(mhp, &saidx, spi, &error);
if (sav == NULL)
return key_senderror(so, m, error);
KEYDBG(KEY_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
/*
* If SADB_ADD was in response to SADB_ACQUIRE, we need to schedule
* ACQ for deletion.
*/
if (sav->seq != 0)
key_acqdone(&saidx, sav->seq);
{
/*
* Don't call key_freesav() on error here, as we would like to
* keep the SA in the database.
*/
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* NAT-T support.
* IKEd may request the use of ESP in UDP encapsulation when it detects the
* presence of NAT. It uses NAT-T extension headers for such SAs to specify
* parameters needed for encapsulation and decapsulation. These PF_KEY
* extension headers are not standardized, so this comment addresses our
* implementation.
* SADB_X_EXT_NAT_T_TYPE specifies the type of encapsulation; we support only
* UDP_ENCAP_ESPINUDP as described in RFC3948.
* SADB_X_EXT_NAT_T_SPORT/DPORT specify the source and destination ports for
* the UDP header. We use these ports in the UDP encapsulation procedure and
* can also check them in the UDP decapsulation procedure.
* SADB_X_EXT_NAT_T_OA[IR] specify the original address of the initiator or
* responder. These addresses can be used in transport mode to adjust
* checksums after decapsulation and decryption. Since the IP addresses
* originally used by the peer usually differ from what we see (NAT is
* present), the TCP/UDP pseudo-header checksum and the IP header checksum
* were calculated using the original addresses. After decapsulation and
* decryption we need to adjust the checksum to obtain a correct datagram.
*
* We expect presence of NAT-T extension headers only in SADB_ADD and
* SADB_UPDATE messages. We report NAT-T extension headers in replies
* to SADB_ADD, SADB_UPDATE, SADB_GET, and SADB_DUMP messages.
*/
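/*
* Checksum adjustment sketch (informative): for each translated address,
* key_setnatt() below folds, with in_addword(),
*
*	cksum += ~(address from the SA's saidx) + (original address)
*
* into sav->natt->cksum. This is the per-address delta needed to
* substitute the peer's original address for the address in the saidx
* when fixing up TCP/UDP checksums after decapsulation, in the style of
* RFC 1624 incremental checksum updates.
*/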
static int
key_setnatt(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
struct sadb_x_nat_t_port *port;
struct sadb_x_nat_t_type *type;
struct sadb_address *oai, *oar;
struct sockaddr *sa;
uint32_t addr;
uint16_t cksum;
IPSEC_ASSERT(sav->natt == NULL, ("natt is already initialized"));
/*
* Ignore NAT-T headers if sproto isn't ESP.
*/
if (sav->sah->saidx.proto != IPPROTO_ESP)
return (0);
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) &&
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_SPORT) &&
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_DPORT)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_TYPE) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_SPORT) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_DPORT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
} else
return (0);
type = (struct sadb_x_nat_t_type *)mhp->ext[SADB_X_EXT_NAT_T_TYPE];
if (type->sadb_x_nat_t_type_type != UDP_ENCAP_ESPINUDP) {
ipseclog((LOG_DEBUG, "%s: unsupported NAT-T type %u.\n",
__func__, type->sadb_x_nat_t_type_type));
return (EINVAL);
}
/*
* Allocate storage for NAT-T config.
* On error it will be released by key_cleansav().
*/
sav->natt = malloc(sizeof(struct secnatt), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->natt == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
port = (struct sadb_x_nat_t_port *)mhp->ext[SADB_X_EXT_NAT_T_SPORT];
if (port->sadb_x_nat_t_port_port == 0) {
ipseclog((LOG_DEBUG, "%s: invalid NAT-T sport specified.\n",
__func__));
return (EINVAL);
}
sav->natt->sport = port->sadb_x_nat_t_port_port;
port = (struct sadb_x_nat_t_port *)mhp->ext[SADB_X_EXT_NAT_T_DPORT];
if (port->sadb_x_nat_t_port_port == 0) {
ipseclog((LOG_DEBUG, "%s: invalid NAT-T dport specified.\n",
__func__));
return (EINVAL);
}
sav->natt->dport = port->sadb_x_nat_t_port_port;
/*
* SADB_X_EXT_NAT_T_OAI and SADB_X_EXT_NAT_T_OAR are optional
* and needed only for transport mode IPsec.
* Usually NAT translates only one address, but it is possible
* that both addresses are translated.
* NOTE: the value of SADB_X_EXT_NAT_T_OAI is equal to SADB_X_EXT_NAT_T_OA.
*/
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_OAI)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_OAI)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
oai = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI];
} else
oai = NULL;
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_OAR)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_OAR)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
oar = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR];
} else
oar = NULL;
/* Initialize addresses only for transport mode */
if (sav->sah->saidx.mode != IPSEC_MODE_TUNNEL) {
cksum = 0;
if (oai != NULL) {
/* Currently we support only AF_INET */
sa = (struct sockaddr *)(oai + 1);
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in)) {
ipseclog((LOG_DEBUG,
"%s: wrong NAT-OAi header.\n",
__func__));
return (EINVAL);
}
/* Ignore the address if it is unchanged */
if (((struct sockaddr_in *)sa)->sin_addr.s_addr !=
sav->sah->saidx.src.sin.sin_addr.s_addr) {
bcopy(sa, &sav->natt->oai.sa, sa->sa_len);
sav->natt->flags |= IPSEC_NATT_F_OAI;
/* Calculate checksum delta */
addr = sav->sah->saidx.src.sin.sin_addr.s_addr;
cksum = in_addword(cksum, ~addr >> 16);
cksum = in_addword(cksum, ~addr & 0xffff);
addr = sav->natt->oai.sin.sin_addr.s_addr;
cksum = in_addword(cksum, addr >> 16);
cksum = in_addword(cksum, addr & 0xffff);
}
}
if (oar != NULL) {
/* Currently we support only AF_INET */
sa = (struct sockaddr *)(oar + 1);
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in)) {
ipseclog((LOG_DEBUG,
"%s: wrong NAT-OAr header.\n",
__func__));
return (EINVAL);
}
/* Ignore the address if it is unchanged */
if (((struct sockaddr_in *)sa)->sin_addr.s_addr !=
sav->sah->saidx.dst.sin.sin_addr.s_addr) {
bcopy(sa, &sav->natt->oar.sa, sa->sa_len);
sav->natt->flags |= IPSEC_NATT_F_OAR;
/* Calculate checksum delta */
addr = sav->sah->saidx.dst.sin.sin_addr.s_addr;
cksum = in_addword(cksum, ~addr >> 16);
cksum = in_addword(cksum, ~addr & 0xffff);
addr = sav->natt->oar.sin.sin_addr.s_addr;
cksum = in_addword(cksum, addr >> 16);
cksum = in_addword(cksum, addr & 0xffff);
}
}
sav->natt->cksum = cksum;
}
return (0);
}
static int
key_setident(struct secashead *sah, const struct sadb_msghdr *mhp)
{
const struct sadb_ident *idsrc, *iddst;
- int idsrclen, iddstlen;
IPSEC_ASSERT(sah != NULL, ("null secashead"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* don't make buffer if not there */
if (SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_SRC) &&
SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_DST)) {
sah->idents = NULL;
sah->identd = NULL;
return (0);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_DST)) {
ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__));
return (EINVAL);
}
idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC];
iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST];
- idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC];
- iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST];
/* validity check */
if (idsrc->sadb_ident_type != iddst->sadb_ident_type) {
ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__));
return EINVAL;
}
switch (idsrc->sadb_ident_type) {
case SADB_IDENTTYPE_PREFIX:
case SADB_IDENTTYPE_FQDN:
case SADB_IDENTTYPE_USERFQDN:
default:
/* XXX do nothing */
sah->idents = NULL;
sah->identd = NULL;
return 0;
}
/* make structure */
sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->idents == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->identd == NULL) {
free(sah->idents, M_IPSEC_MISC);
sah->idents = NULL;
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->idents->type = idsrc->sadb_ident_type;
sah->idents->id = idsrc->sadb_ident_id;
sah->identd->type = iddst->sadb_ident_type;
sah->identd->id = iddst->sadb_ident_id;
return 0;
}
/*
* m will not be freed on return.
* It is the caller's responsibility to free the result.
*
* Called from SADB_ADD and SADB_UPDATE. Reply will contain headers
* from the request in defined order.
*/
static struct mbuf *
key_getmsgbuf_x1(struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct mbuf *n;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 16, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_X_EXT_SA2,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST,
SADB_X_EXT_NAT_T_TYPE, SADB_X_EXT_NAT_T_SPORT,
SADB_X_EXT_NAT_T_DPORT, SADB_X_EXT_NAT_T_OAI,
SADB_X_EXT_NAT_T_OAR, SADB_X_EXT_NEW_ADDRESS_SRC,
SADB_X_EXT_NEW_ADDRESS_DST);
if (!n)
return NULL;
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return NULL;
}
mtod(n, struct sadb_msg *)->sadb_msg_errno = 0;
mtod(n, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(n->m_pkthdr.len);
return n;
}
/*
* SADB_DELETE processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and set SADB_SASTATE_DEAD,
* and send,
* <base, SA(*), address(SD)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_delete(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct secasvar *sav;
struct sadb_sa *sa0;
uint8_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return (key_senderror(so, m, EINVAL));
}
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
if (SADB_CHECKHDR(mhp, SADB_EXT_SA)) {
/*
* Caller wants us to delete all non-LARVAL SAs
* that match the src/dst. This is used during
* IKE INITIAL-CONTACT.
* XXXAE: this looks like an extension to RFC 2367.
*/
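/*
* For illustration, such a request carries only
* <base, address(SD)>
* with no SADB_EXT_SA header, so every matching non-LARVAL SA for the
* address pair is flushed by key_delete_all().
*/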
ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__));
return (key_delete_all(so, m, mhp, &saidx));
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return (key_senderror(so, m, EINVAL));
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
if (proto == IPPROTO_TCP)
sav = key_getsav_tcpmd5(&saidx, NULL);
else
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found for SPI %u.\n",
__func__, ntohl(sa0->sadb_sa_spi)));
return (key_senderror(so, m, ESRCH));
}
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_HEAD) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u.\n",
__func__, ntohl(sav->spi)));
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
key_unlinksav(sav);
key_freesav(&sav);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* delete all SAs for src/dst. Called from key_delete().
*/
static int
key_delete_all(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp, struct secasindex *saidx)
{
struct secasvar_queue drainq;
struct secashead *sah;
struct secasvar *sav, *nextsav;
TAILQ_INIT(&drainq);
SAHTREE_WLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, saidx, CMP_HEAD) == 0)
continue;
/* Move all ALIVE SAs into drainq */
TAILQ_CONCAT(&drainq, &sah->savtree_alive, chain);
}
/* Unlink all queued SAs from SPI hash */
TAILQ_FOREACH(sav, &drainq, chain) {
sav->state = SADB_SASTATE_DEAD;
LIST_REMOVE(sav, spihash);
}
SAHTREE_WUNLOCK();
/* Now we can release reference for all SAs in drainq */
sav = TAILQ_FIRST(&drainq);
while (sav != NULL) {
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
nextsav = TAILQ_NEXT(sav, chain);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* Delete all alive SAs for the corresponding xform.
* Larval SAs have not yet initialized tdb_xform, so it is safe to leave
* them in place when the xform disappears.
*/
static void
key_delete_xform(const struct xformsw *xsp)
{
struct secasvar_queue drainq;
struct secashead *sah;
struct secasvar *sav, *nextsav;
TAILQ_INIT(&drainq);
SAHTREE_WLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav == NULL)
continue;
if (sav->tdb_xform != xsp)
continue;
/*
* All SAs in the chain are assumed to be related to a single
* xform.
*/
TAILQ_CONCAT(&drainq, &sah->savtree_alive, chain);
}
/* Unlink all queued SAs from SPI hash */
TAILQ_FOREACH(sav, &drainq, chain) {
sav->state = SADB_SASTATE_DEAD;
LIST_REMOVE(sav, spihash);
}
SAHTREE_WUNLOCK();
/* Now we can release reference for all SAs in drainq */
sav = TAILQ_FIRST(&drainq);
while (sav != NULL) {
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
nextsav = TAILQ_NEXT(sav, chain);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
}
/*
* SADB_GET processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and get an SA to respond with,
* and send,
* <base, SA, (lifetime(HSC),) address(SD), (address(P),) key(AE),
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_get(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint8_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
if (proto == IPPROTO_TCP)
sav = key_getsav_tcpmd5(&saidx, NULL);
else
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
return key_senderror(so, m, ESRCH);
}
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_HEAD) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u.\n",
__func__, ntohl(sa0->sadb_sa_spi)));
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
{
struct mbuf *n;
uint8_t satype;
/* map proto to satype */
if ((satype = key_proto2satype(sav->sah->saidx.proto)) == 0) {
ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n",
__func__));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/* create new sadb_msg to reply. */
n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq,
mhp->msg->sadb_msg_pid);
key_freesav(&sav);
if (!n)
return key_senderror(so, m, ENOBUFS);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
/* XXX make it sysctl-configurable? */
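/*
* Resulting combination lifetimes (as called from key_getcomb_*(),
* which zero the sadb_comb first): hard addtime 86400s, soft usetime
* 28800s, one allocation each, no byte limits. Note that soft_addtime
* and hard_usetime below scale the previous (zeroed) contents and thus
* end up as 0.
*/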
static void
key_getcomb_setlifetime(struct sadb_comb *comb)
{
comb->sadb_comb_soft_allocations = 1;
comb->sadb_comb_hard_allocations = 1;
comb->sadb_comb_soft_bytes = 0;
comb->sadb_comb_hard_bytes = 0;
comb->sadb_comb_hard_addtime = 86400; /* 1 day */
comb->sadb_comb_soft_addtime = comb->sadb_comb_soft_addtime * 80 / 100;
comb->sadb_comb_soft_usetime = 28800; /* 8 hours */
comb->sadb_comb_hard_usetime = comb->sadb_comb_hard_usetime * 80 / 100;
}
/*
* XXX reorder combinations by preference
* XXX no idea if the user wants ESP authentication or not
*/
static struct mbuf *
key_getcomb_ealg(void)
{
struct sadb_comb *comb;
const struct enc_xform *algo;
struct mbuf *result = NULL, *m, *n;
int encmin;
int i, off, o;
int totlen;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_EALG_MAX; i++) {
algo = enc_algorithm_lookup(i);
if (algo == NULL)
continue;
/* discard algorithms with key size smaller than system min */
if (_BITS(algo->maxkey) < V_ipsec_esp_keymin)
continue;
if (_BITS(algo->minkey) < V_ipsec_esp_keymin)
encmin = V_ipsec_esp_keymin;
else
encmin = _BITS(algo->minkey);
if (V_ipsec_esp_auth)
m = key_getcomb_ah();
else {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
bzero(mtod(m, caddr_t), m->m_len);
}
}
if (!m)
goto fail;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l));
for (off = 0; off < totlen; off += l) {
n = m_pulldown(m, off, l, &o);
if (!n) {
/* m is already freed */
goto fail;
}
comb = (struct sadb_comb *)(mtod(n, caddr_t) + o);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
comb->sadb_comb_encrypt_minbits = encmin;
comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey);
}
if (!result)
result = m;
else
m_cat(result, m);
}
return result;
fail:
if (result)
m_freem(result);
return NULL;
}
static void
key_getsizes_ah(const struct auth_hash *ah, int alg, u_int16_t* min,
u_int16_t* max)
{
*min = *max = ah->hashsize;
if (ah->keysize == 0) {
/*
* Transform takes arbitrary key size but algorithm
* key size is restricted. Enforce this here.
*/
switch (alg) {
case SADB_X_AALG_MD5: *min = *max = 16; break;
case SADB_X_AALG_SHA: *min = *max = 20; break;
case SADB_X_AALG_NULL: *min = 1; *max = 256; break;
case SADB_X_AALG_SHA2_256: *min = *max = 32; break;
case SADB_X_AALG_SHA2_384: *min = *max = 48; break;
case SADB_X_AALG_SHA2_512: *min = *max = 64; break;
default:
DPRINTF(("%s: unknown AH algorithm %u\n",
__func__, alg));
break;
}
}
}
/*
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ah()
{
const struct auth_hash *algo;
struct sadb_comb *comb;
struct mbuf *m;
u_int16_t minkeysize, maxkeysize;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_AALG_MAX; i++) {
#if 1
/* we prefer HMAC algorithms, not old algorithms */
if (i != SADB_AALG_SHA1HMAC &&
i != SADB_AALG_MD5HMAC &&
i != SADB_X_AALG_SHA2_256 &&
i != SADB_X_AALG_SHA2_384 &&
i != SADB_X_AALG_SHA2_512)
continue;
#endif
algo = auth_algorithm_lookup(i);
if (!algo)
continue;
key_getsizes_ah(algo, i, &minkeysize, &maxkeysize);
/* discard algorithms with key size smaller than system min */
if (_BITS(minkeysize) < V_ipsec_ah_keymin)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_auth = i;
comb->sadb_comb_auth_minbits = _BITS(minkeysize);
comb->sadb_comb_auth_maxbits = _BITS(maxkeysize);
}
return m;
}
/*
* Not really official behavior; discussed on pf_key@inner.net in Sep 2000.
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ipcomp()
{
const struct comp_algo *algo;
struct sadb_comb *comb;
struct mbuf *m;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_X_CALG_MAX; i++) {
algo = comp_algorithm_lookup(i);
if (!algo)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
/* what should we set into sadb_comb_*_{min,max}bits? */
}
return m;
}
/*
* XXX no way to pass mode (transport/tunnel) to userland
* XXX replay checking?
* XXX sysctl interface to ipsec_{ah,esp}_keymin
*/
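/*
* Layout sketch of the extension built here:
*
*	struct sadb_prop	(sadb_prop_exttype = SADB_EXT_PROPOSAL)
*	struct sadb_comb	(one per acceptable algorithm, produced by
*	struct sadb_comb	 key_getcomb_ealg()/_ah()/_ipcomp())
*	...
*
* sadb_prop_len covers the whole chain and each sadb_comb carries the
* default lifetimes from key_getcomb_setlifetime().
*/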
static struct mbuf *
key_getprop(const struct secasindex *saidx)
{
struct sadb_prop *prop;
struct mbuf *m, *n;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop));
int totlen;
switch (saidx->proto) {
case IPPROTO_ESP:
m = key_getcomb_ealg();
break;
case IPPROTO_AH:
m = key_getcomb_ah();
break;
case IPPROTO_IPCOMP:
m = key_getcomb_ipcomp();
break;
default:
return NULL;
}
if (!m)
return NULL;
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
prop = mtod(m, struct sadb_prop *);
bzero(prop, sizeof(*prop));
prop->sadb_prop_len = PFKEY_UNIT64(totlen);
prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
prop->sadb_prop_replay = 32; /* XXX */
return m;
}
/*
* SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2().
* send
* <base, SA, address(SD), (address(P)), x_policy,
* (identity(SD),) (sensitivity,) proposal>
* to KMD, and expect to receive
* <base> with SADB_ACQUIRE if error occurred,
* or
* <base, src address, dst address, (SPI range)> with SADB_GETSPI
* from KMD by PF_KEY.
*
* XXX x_policy is outside of RFC2367 (KAME extension).
* XXX sensitivity is not supported.
* XXX for ipcomp, RFC2367 does not define how to fill in proposal.
* see comment for key_getcomb_ipcomp().
*
* OUT:
* 0 : succeed
* others: error number
*/
static int
key_acquire(const struct secasindex *saidx, struct secpolicy *sp)
{
union sockaddr_union addr;
struct mbuf *result, *m;
uint32_t seq;
int error;
uint16_t ul_proto;
uint8_t mask, satype;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
satype = key_proto2satype(saidx->proto);
IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto));
error = -1;
result = NULL;
ul_proto = IPSEC_ULPROTO_ANY;
/* Get the seq number to decide whether or not to send a message. */
seq = key_getacq(saidx, &error);
if (seq == 0)
return (error);
m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/*
* Set sadb_address extensions for the saidx addresses.
*
* Note that if sp is supplied, then we're being called from
* key_allocsa_policy() and should supply port and protocol
* information.
* XXXAE: why only TCP and UDP? ICMP and SCTP look applicable too.
* XXXAE: probably we can handle this in the ipsec[46]_allocsa().
* XXXAE: it looks like we should save this info in the ACQ entry.
*/
if (sp != NULL && (sp->spidx.ul_proto == IPPROTO_TCP ||
sp->spidx.ul_proto == IPPROTO_UDP))
ul_proto = sp->spidx.ul_proto;
addr = saidx->src;
mask = FULLMASK;
if (ul_proto != IPSEC_ULPROTO_ANY) {
switch (sp->spidx.src.sa.sa_family) {
case AF_INET:
if (sp->spidx.src.sin.sin_port != IPSEC_PORT_ANY) {
addr.sin.sin_port = sp->spidx.src.sin.sin_port;
mask = sp->spidx.prefs;
}
break;
case AF_INET6:
if (sp->spidx.src.sin6.sin6_port != IPSEC_PORT_ANY) {
addr.sin6.sin6_port =
sp->spidx.src.sin6.sin6_port;
mask = sp->spidx.prefs;
}
break;
default:
break;
}
}
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &addr.sa, mask, ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
addr = saidx->dst;
mask = FULLMASK;
if (ul_proto != IPSEC_ULPROTO_ANY) {
switch (sp->spidx.dst.sa.sa_family) {
case AF_INET:
if (sp->spidx.dst.sin.sin_port != IPSEC_PORT_ANY) {
addr.sin.sin_port = sp->spidx.dst.sin.sin_port;
mask = sp->spidx.prefd;
}
break;
case AF_INET6:
if (sp->spidx.dst.sin6.sin6_port != IPSEC_PORT_ANY) {
addr.sin6.sin6_port =
sp->spidx.dst.sin6.sin6_port;
mask = sp->spidx.prefd;
}
break;
default:
break;
}
}
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &addr.sa, mask, ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* XXX proxy address (optional) */
/* set sadb_x_policy */
if (sp != NULL) {
m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id,
sp->priority);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
}
/* XXX identity (optional) */
#if 0
if (idexttype && fqdn) {
/* create identity extension (FQDN) */
struct sadb_ident *id;
int fqdnlen;
fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_FQDN;
bcopy(fqdn, id + 1, fqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen);
}
if (idexttype) {
/* create identity extension (USERFQDN) */
struct sadb_ident *id;
int userfqdnlen;
if (userfqdn) {
/* +1 for terminating-NUL */
userfqdnlen = strlen(userfqdn) + 1;
} else
userfqdnlen = 0;
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN;
/* XXX is it correct? */
if (curproc && curproc->p_cred)
id->sadb_ident_id = curproc->p_cred->p_ruid;
if (userfqdn && userfqdnlen)
bcopy(userfqdn, id + 1, userfqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen);
}
#endif
/* XXX sensitivity (optional) */
/* create proposal/combination extension */
m = key_getprop(saidx);
#if 0
/*
* Spec conformant: always attach the proposal/combination extension.
* The problem is that we have no way to attach it for IPComp,
* due to the way sadb_comb is declared in RFC 2367.
*/
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
#else
/*
* outside of spec; make proposal/combination extension optional.
*/
if (m)
m_cat(result, m);
#endif
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secasindex(saidx, NULL));
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
static uint32_t
key_newacq(const struct secasindex *saidx, int *perror)
{
struct secacq *acq;
uint32_t seq;
acq = malloc(sizeof(*acq), M_IPSEC_SAQ, M_NOWAIT | M_ZERO);
if (acq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
*perror = ENOBUFS;
return (0);
}
/* copy secindex */
bcopy(saidx, &acq->saidx, sizeof(acq->saidx));
acq->created = time_second;
acq->count = 0;
/* add to acqtree */
ACQ_LOCK();
seq = acq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq);
LIST_INSERT_HEAD(&V_acqtree, acq, chain);
LIST_INSERT_HEAD(ACQADDRHASH_HASH(saidx), acq, addrhash);
LIST_INSERT_HEAD(ACQSEQHASH_HASH(seq), acq, seqhash);
ACQ_UNLOCK();
*perror = 0;
return (seq);
}
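/*
* Return the sequence number to use in an SADB_ACQUIRE for this saidx,
* or 0 if the message should be suppressed. An existing ACQ entry
* absorbs up to V_key_blockacq_count requests before another message is
* sent; when no entry matches, a new one is allocated via key_newacq().
*/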
static uint32_t
key_getacq(const struct secasindex *saidx, int *perror)
{
struct secacq *acq;
uint32_t seq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&acq->saidx, saidx, CMP_EXACTLY)) {
if (acq->count > V_key_blockacq_count) {
/*
* Reset counter and send message.
* Also reset created time to keep ACQ for
* this saidx.
*/
acq->created = time_second;
acq->count = 0;
seq = acq->seq;
} else {
/*
* Increment the counter and do nothing. We send
* an SADB_ACQUIRE message only once per
* V_key_blockacq_count packets.
*/
acq->count++;
seq = 0;
}
break;
}
}
ACQ_UNLOCK();
if (acq != NULL) {
*perror = 0;
return (seq);
}
/* allocate new entry */
return (key_newacq(saidx, perror));
}
static int
key_acqreset(uint32_t seq)
{
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQSEQHASH_HASH(seq), seqhash) {
if (acq->seq == seq) {
acq->count = 0;
acq->created = time_second;
break;
}
}
ACQ_UNLOCK();
if (acq == NULL)
return (ESRCH);
return (0);
}
/*
* Mark ACQ entry as stale to remove it in key_flush_acq().
* Called after successful SADB_GETSPI message.
*/
static int
key_acqdone(const struct secasindex *saidx, uint32_t seq)
{
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQSEQHASH_HASH(seq), seqhash) {
if (acq->seq == seq)
break;
}
if (acq != NULL) {
if (key_cmpsaidx(&acq->saidx, saidx, CMP_EXACTLY) == 0) {
ipseclog((LOG_DEBUG,
"%s: Mismatched saidx for ACQ %u", __func__, seq));
acq = NULL;
} else {
acq->created = 0;
}
} else {
ipseclog((LOG_DEBUG,
"%s: ACQ %u is not found.", __func__, seq));
}
ACQ_UNLOCK();
if (acq == NULL)
return (ESRCH);
return (0);
}
static struct secspacq *
key_newspacq(struct secpolicyindex *spidx)
{
struct secspacq *acq;
/* get new entry */
acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
if (acq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return NULL;
}
/* copy secindex */
bcopy(spidx, &acq->spidx, sizeof(acq->spidx));
acq->created = time_second;
acq->count = 0;
/* add to spacqtree */
SPACQ_LOCK();
LIST_INSERT_HEAD(&V_spacqtree, acq, chain);
SPACQ_UNLOCK();
return acq;
}
static struct secspacq *
key_getspacq(struct secpolicyindex *spidx)
{
struct secspacq *acq;
SPACQ_LOCK();
LIST_FOREACH(acq, &V_spacqtree, chain) {
if (key_cmpspidx_exactly(spidx, &acq->spidx)) {
/* NB: return holding spacq_lock */
return acq;
}
}
SPACQ_UNLOCK();
return NULL;
}
/*
* SADB_ACQUIRE processing.
* In the first situation, we receive
* <base>
* from the ikmpd and clear the sequence number of its secasvar entry.
*
* In the second situation, we receive
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* from a userland process and return
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* to the socket.
*
* m will always be freed.
*/
static int
key_acquire2(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SAHTREE_RLOCK_TRACKER;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
uint32_t reqid;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/*
* Error message from the KMd.
* We assume that if an error occurred in IKEd, the length of the PFKEY
* message is equal to the size of the sadb_msg structure.
* We do not raise an error even if one occurred in this function.
*/
if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) {
/* check sequence number */
if (mhp->msg->sadb_msg_seq == 0 ||
mhp->msg->sadb_msg_errno == 0) {
ipseclog((LOG_DEBUG, "%s: must specify sequence "
"number and errno.\n", __func__));
} else {
/*
* IKEd reported that an error occurred.
* XXXAE: what does it expect from the kernel?
* Probably we should send SADB_ACQUIRE again?
* If so, reset the ACQ's state.
* XXXAE: it looks useless.
*/
key_acqreset(mhp->msg->sadb_msg_seq);
}
m_freem(m);
return (0);
}
/*
* This message is from user land.
*/
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_EXT_PROPOSAL)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_EXT_PROPOSAL)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* get a SA index */
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(&saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID))
break;
}
SAHTREE_RUNLOCK();
if (sah != NULL) {
ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__));
return key_senderror(so, m, EEXIST);
}
error = key_acquire(&saidx, NULL);
if (error != 0) {
ipseclog((LOG_DEBUG,
"%s: error %d returned from key_acquire()\n",
__func__, error));
return key_senderror(so, m, error);
}
m_freem(m);
return (0);
}
/*
* SADB_REGISTER processing.
* If SATYPE_UNSPEC has been passed as satype, only return sadb_supported.
* receive
* <base>
* from the ikmpd, and register a socket to send PF_KEY messages,
* and send
* <base, supported>
* to KMD by PF_KEY.
* When the socket is detached, its regnode must be freed.
*
* m will always be freed.
*/
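/*
* Reply layout sketch (built in the setmsg block below):
*
*	struct sadb_msg
*	struct sadb_supported	(SADB_EXT_SUPPORTED_AUTH)
*	struct sadb_alg ...	(one per auth_algorithm_lookup() hit)
*	struct sadb_supported	(SADB_EXT_SUPPORTED_ENCRYPT)
*	struct sadb_alg ...	(one per enc_algorithm_lookup() hit)
*
* Either supported section is omitted when no algorithms of that kind
* are available.
*/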
static int
key_register(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secreg *reg, *newreg = NULL;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* check for invalid register message */
if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0]))
return key_senderror(so, m, EINVAL);
/* When SATYPE_UNSPEC is specified, only return sadb_supported. */
if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC)
goto setmsg;
/* check whether existing or not */
REGTREE_LOCK();
LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) {
if (reg->so == so) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: socket exists already.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
}
/* create regnode */
newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO);
if (newreg == NULL) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
newreg->so = so;
((struct keycb *)sotorawcb(so))->kp_registered++;
/* add regnode to regtree. */
LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain);
REGTREE_UNLOCK();
setmsg:
{
struct mbuf *n;
struct sadb_msg *newmsg;
struct sadb_supported *sup;
u_int len, alen, elen;
int off;
int i;
struct sadb_alg *alg;
/* create new sadb_msg to reply. */
alen = 0;
for (i = 1; i <= SADB_AALG_MAX; i++) {
if (auth_algorithm_lookup(i))
alen += sizeof(struct sadb_alg);
}
if (alen)
alen += sizeof(struct sadb_supported);
elen = 0;
for (i = 1; i <= SADB_EALG_MAX; i++) {
if (enc_algorithm_lookup(i))
elen += sizeof(struct sadb_alg);
}
if (elen)
elen += sizeof(struct sadb_supported);
len = sizeof(struct sadb_msg) + alen + elen;
if (len > MCLBYTES)
return key_senderror(so, m, ENOBUFS);
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n != NULL && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_pkthdr.len = n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(len);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
/* for authentication algorithm */
if (alen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(alen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_AALG_MAX; i++) {
const struct auth_hash *aalgo;
u_int16_t minkeysize, maxkeysize;
aalgo = auth_algorithm_lookup(i);
if (!aalgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = 0;
key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize);
alg->sadb_alg_minbits = _BITS(minkeysize);
alg->sadb_alg_maxbits = _BITS(maxkeysize);
off += PFKEY_ALIGN8(sizeof(*alg));
}
}
/* for encryption algorithm */
if (elen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(elen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_EALG_MAX; i++) {
const struct enc_xform *ealgo;
ealgo = enc_algorithm_lookup(i);
if (!ealgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = ealgo->ivsize;
alg->sadb_alg_minbits = _BITS(ealgo->minkey);
alg->sadb_alg_maxbits = _BITS(ealgo->maxkey);
off += PFKEY_ALIGN8(sizeof(struct sadb_alg));
}
}
IPSEC_ASSERT(off == len,
("length assumption failed (off %u len %u)", off, len));
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED);
}
}
/*
* Free the secreg entries registered for a socket.
* XXX: the regnode of a socket that issued SADB_REGISTER must be freed
* when the socket is detached.
*/
void
key_freereg(struct socket *so)
{
struct secreg *reg;
int i;
IPSEC_ASSERT(so != NULL, ("NULL so"));
/*
* Check whether the registration exists or not.
* Check all SA types, because one socket may be registered
* for multiple SA types.
*/
REGTREE_LOCK();
for (i = 0; i <= SADB_SATYPE_MAX; i++) {
LIST_FOREACH(reg, &V_regtree[i], chain) {
if (reg->so == so && __LIST_CHAINED(reg)) {
LIST_REMOVE(reg, chain);
free(reg, M_IPSEC_SAR);
break;
}
}
}
REGTREE_UNLOCK();
}
/*
* SADB_EXPIRE processing
* send
* <base, SA, SA2, lifetime(C and one of HS), address(SD)>
* to KMD by PF_KEY.
* NOTE: either the soft or the hard lifetime extension is sent,
* depending on which limit expired; see the layout sketch below.
*
* OUT: 0 : succeed
* others : error number
*/
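/*
* Message layout sketch for the chain built below:
*	<base, SA, SA2, (X_SA_REPLAY,) lifetime(CURRENT),
*	 lifetime(SOFT or HARD), address(S), address(D)>
* The X_SA_REPLAY extension is added only when the replay window does
* not fit into the 8-bit sadb_sa_replay field.
*/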
static int
key_expire(struct secasvar *sav, int hard)
{
struct mbuf *result = NULL, *m;
struct sadb_lifetime *lt;
uint32_t replay_count;
int error, len;
uint8_t satype;
IPSEC_ASSERT (sav != NULL, ("null sav"));
IPSEC_ASSERT (sav->sah != NULL, ("null sa header"));
KEYDBG(KEY_STAMP,
printf("%s: SA(%p) expired %s lifetime\n", __func__,
sav, hard ? "hard": "soft"));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
/* set msg header */
satype = key_proto2satype(sav->sah->saidx.proto);
IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype));
m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create SA extension */
m = key_setsadbsa(sav);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* create SA2 extension */
SECASVAR_LOCK(sav);
replay_count = sav->replay ? sav->replay->count : 0;
SECASVAR_UNLOCK(sav);
m = key_setsadbxsa2(sav->sah->saidx.mode, replay_count,
sav->sah->saidx.reqid);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if (sav->replay && sav->replay->wsize > UINT8_MAX) {
m = key_setsadbxsareplay(sav->replay->wsize);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
}
/* create lifetime extension (current and soft) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL) {
error = ENOBUFS;
goto fail;
}
m_align(m, len);
m->m_len = len;
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations =
(uint32_t)counter_u64_fetch(sav->lft_c_allocations);
lt->sadb_lifetime_bytes =
counter_u64_fetch(sav->lft_c_bytes);
lt->sadb_lifetime_addtime = sav->created;
lt->sadb_lifetime_usetime = sav->firstused;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
if (hard) {
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
lt->sadb_lifetime_allocations = sav->lft_h->allocations;
lt->sadb_lifetime_bytes = sav->lft_h->bytes;
lt->sadb_lifetime_addtime = sav->lft_h->addtime;
lt->sadb_lifetime_usetime = sav->lft_h->usetime;
} else {
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
lt->sadb_lifetime_allocations = sav->lft_s->allocations;
lt->sadb_lifetime_bytes = sav->lft_s->bytes;
lt->sadb_lifetime_addtime = sav->lft_s->addtime;
lt->sadb_lifetime_usetime = sav->lft_s->usetime;
}
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/*
* XXX-BZ Handle NAT-T extensions here.
* XXXAE: it doesn't seem quite useful. IKEs should not depend on
* this information, we report only significant SA fields.
*/
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
static void
key_freesah_flushed(struct secashead_queue *flushq)
{
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
sah = TAILQ_FIRST(flushq);
while (sah != NULL) {
sav = TAILQ_FIRST(&sah->savtree_larval);
while (sav != NULL) {
nextsav = TAILQ_NEXT(sav, chain);
TAILQ_REMOVE(&sah->savtree_larval, sav, chain);
key_freesav(&sav); /* release last reference */
key_freesah(&sah); /* release reference from SAV */
sav = nextsav;
}
sav = TAILQ_FIRST(&sah->savtree_alive);
while (sav != NULL) {
nextsav = TAILQ_NEXT(sav, chain);
TAILQ_REMOVE(&sah->savtree_alive, sav, chain);
key_freesav(&sav); /* release last reference */
key_freesah(&sah); /* release reference from SAV */
sav = nextsav;
}
nextsah = TAILQ_NEXT(sah, chain);
key_freesah(&sah); /* release last reference */
sah = nextsah;
}
}
/*
* SADB_FLUSH processing
* receive
* <base>
* from the ikmpd, and free all entries in secastree.
* and send,
* <base>
* to the ikmpd.
* NOTE: all we do here is mark entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_flush(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secashead_queue flushq;
struct sadb_msg *newmsg;
struct secashead *sah, *nextsah;
struct secasvar *sav;
uint8_t proto;
int i;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
KEYDBG(KEY_STAMP,
printf("%s: proto %u\n", __func__, proto));
TAILQ_INIT(&flushq);
if (proto == IPSEC_PROTO_ANY) {
/* no SATYPE specified, i.e. flushing all SA. */
SAHTREE_WLOCK();
/* Move all SAHs into flushq */
TAILQ_CONCAT(&flushq, &V_sahtree, chain);
/* Flush all buckets in SPI hash */
for (i = 0; i < V_savhash_mask + 1; i++)
LIST_INIT(&V_savhashtbl[i]);
/* Flush all buckets in SAHADDRHASH */
for (i = 0; i < V_sahaddrhash_mask + 1; i++)
LIST_INIT(&V_sahaddrhashtbl[i]);
/* Mark all SAHs as unlinked */
TAILQ_FOREACH(sah, &flushq, chain) {
sah->state = SADB_SASTATE_DEAD;
/*
* The callout handler does its job using an RLOCK and
* drain queues. In case this function is called just
* before it acquires the WLOCK, we need to mark the SAs
* as unlinked to prevent a second unlink.
*/
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
sav->state = SADB_SASTATE_DEAD;
}
}
SAHTREE_WUNLOCK();
} else {
SAHTREE_WLOCK();
sah = TAILQ_FIRST(&V_sahtree);
while (sah != NULL) {
IPSEC_ASSERT(sah->state != SADB_SASTATE_DEAD,
("DEAD SAH %p in SADB_FLUSH", sah));
nextsah = TAILQ_NEXT(sah, chain);
if (sah->saidx.proto != proto) {
sah = nextsah;
continue;
}
sah->state = SADB_SASTATE_DEAD;
TAILQ_REMOVE(&V_sahtree, sah, chain);
LIST_REMOVE(sah, addrhash);
/* Unlink all SAs from SPI hash */
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
}
/* Add SAH into flushq */
TAILQ_INSERT_HEAD(&flushq, sah, chain);
sah = nextsah;
}
SAHTREE_WUNLOCK();
}
key_freesah_flushed(&flushq);
/* Free all queued SAs and SAHs */
if (m->m_len < sizeof(struct sadb_msg) ||
sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg);
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
/*
* SADB_DUMP processing
* dump all entries in the SAD, including those in DEAD state.
* receive
* <base>
* from the ikmpd, and dump all secasvar leaves
* and send,
* <base> .....
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_dump(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
- struct sadb_msg *newmsg;
struct mbuf *n;
uint32_t cnt;
uint8_t proto, satype;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* count sav entries to be sent to the userland. */
cnt = 0;
SAHTREE_RLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC &&
proto != sah->saidx.proto)
continue;
TAILQ_FOREACH(sav, &sah->savtree_larval, chain)
cnt++;
TAILQ_FOREACH(sav, &sah->savtree_alive, chain)
cnt++;
}
if (cnt == 0) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOENT);
}
/* send this to the userland, one at a time. */
- newmsg = NULL;
TAILQ_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC &&
proto != sah->saidx.proto)
continue;
/* map proto to satype */
if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
SAHTREE_RUNLOCK();
ipseclog((LOG_DEBUG, "%s: there was invalid proto in "
"SAD.\n", __func__));
return key_senderror(so, m, EINVAL);
}
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
n = key_setdumpsa(sav, SADB_DUMP, satype,
--cnt, mhp->msg->sadb_msg_pid);
if (n == NULL) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOBUFS);
}
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
n = key_setdumpsa(sav, SADB_DUMP, satype,
--cnt, mhp->msg->sadb_msg_pid);
if (n == NULL) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOBUFS);
}
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
SAHTREE_RUNLOCK();
m_freem(m);
return (0);
}
/*
* SADB_X_PROMISC processing
*
* m will always be freed.
*/
static int
key_promisc(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
int olen;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
if (olen < sizeof(struct sadb_msg)) {
#if 1
return key_senderror(so, m, EINVAL);
#else
m_freem(m);
return 0;
#endif
} else if (olen == sizeof(struct sadb_msg)) {
/* enable/disable promisc mode */
struct keycb *kp;
if ((kp = (struct keycb *)sotorawcb(so)) == NULL)
return key_senderror(so, m, EINVAL);
mhp->msg->sadb_msg_errno = 0;
switch (mhp->msg->sadb_msg_satype) {
case 0:
case 1:
kp->kp_promisc = mhp->msg->sadb_msg_satype;
break;
default:
return key_senderror(so, m, EINVAL);
}
/* send the original message back to everyone */
mhp->msg->sadb_msg_errno = 0;
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
} else {
/* send packet as is */
m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg)));
/* TODO: if sadb_msg_seq is specified, send to specific pid */
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
}
static int (*key_typesw[])(struct socket *, struct mbuf *,
const struct sadb_msghdr *) = {
NULL, /* SADB_RESERVED */
key_getspi, /* SADB_GETSPI */
key_update, /* SADB_UPDATE */
key_add, /* SADB_ADD */
key_delete, /* SADB_DELETE */
key_get, /* SADB_GET */
key_acquire2, /* SADB_ACQUIRE */
key_register, /* SADB_REGISTER */
NULL, /* SADB_EXPIRE */
key_flush, /* SADB_FLUSH */
key_dump, /* SADB_DUMP */
key_promisc, /* SADB_X_PROMISC */
NULL, /* SADB_X_PCHANGE */
key_spdadd, /* SADB_X_SPDUPDATE */
key_spdadd, /* SADB_X_SPDADD */
key_spddelete, /* SADB_X_SPDDELETE */
key_spdget, /* SADB_X_SPDGET */
NULL, /* SADB_X_SPDACQUIRE */
key_spddump, /* SADB_X_SPDDUMP */
key_spdflush, /* SADB_X_SPDFLUSH */
key_spdadd, /* SADB_X_SPDSETIDX */
NULL, /* SADB_X_SPDEXPIRE */
key_spddelete2, /* SADB_X_SPDDELETE2 */
};
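The key_typesw table above maps each SADB message type to its handler; key_parse() later indexes it only after checking that the type is in range and that the slot is non-NULL. A minimal standalone sketch of that dispatch pattern (userland C, illustrative names only, not part of this file or of the diff):

/* Standalone sketch of the dispatch-table pattern used by key_parse(). */
#include <stddef.h>
#include <stdio.h>
#include <errno.h>

#define NITEMS(x)   (sizeof(x) / sizeof((x)[0]))

typedef int (*msg_handler_t)(const void *payload);

static int
handle_add(const void *payload)
{
    (void)payload;
    puts("ADD");
    return (0);
}

static int
handle_flush(const void *payload)
{
    (void)payload;
    puts("FLUSH");
    return (0);
}

/* Index is the message type; NULL means "type known but unhandled". */
static msg_handler_t handlers[] = {
    NULL,           /* 0: reserved */
    handle_add,     /* 1 */
    NULL,           /* 2: not implemented */
    handle_flush,   /* 3 */
};

static int
dispatch(unsigned int type, const void *payload)
{
    /* Same guard as key_parse(): reject unknown or unhandled types. */
    if (type >= NITEMS(handlers) || handlers[type] == NULL)
        return (EINVAL);
    return (handlers[type](payload));
}

int
main(void)
{
    printf("type 1 -> %d\n", dispatch(1, NULL));    /* 0 */
    printf("type 2 -> %d\n", dispatch(2, NULL));    /* EINVAL */
    printf("type 9 -> %d\n", dispatch(9, NULL));    /* EINVAL */
    return (0);
}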
/*
* Parse an sadb_msg buffer and process the PF_KEYv2 message,
* building response data when needed.  The message is handled
* directly as an mbuf.
* IN:
* m : received mbuf, pulled up to at least the base header.
* It is rewritten in place to form the response.
* so : pointer to socket.
* OUT:
* error code; any response is sent to the user process.
*/
int
key_parse(struct mbuf *m, struct socket *so)
{
struct sadb_msg *msg;
struct sadb_msghdr mh;
u_int orglen;
int error;
int target;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
if (m->m_len < sizeof(struct sadb_msg)) {
m = m_pullup(m, sizeof(struct sadb_msg));
if (!m)
return ENOBUFS;
}
msg = mtod(m, struct sadb_msg *);
orglen = PFKEY_UNUNIT64(msg->sadb_msg_len);
target = KEY_SENDUP_ONE;
if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != orglen) {
ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__));
PFKEYSTAT_INC(out_invlen);
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_version != PF_KEY_V2) {
ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n",
__func__, msg->sadb_msg_version));
PFKEYSTAT_INC(out_invver);
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_type > SADB_MAX) {
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invmsgtype);
error = EINVAL;
goto senderror;
}
/* for old-fashioned code - should be nuked */
if (m->m_pkthdr.len > MCLBYTES) {
m_freem(m);
return ENOBUFS;
}
if (m->m_next) {
struct mbuf *n;
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && m->m_pkthdr.len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_free(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t));
n->m_pkthdr.len = n->m_len = m->m_pkthdr.len;
n->m_next = NULL;
m_freem(m);
m = n;
}
/* align the mbuf chain so that extensions are in contiguous region. */
error = key_align(m, &mh);
if (error)
return error;
msg = mh.msg;
/* We use satype as scope mask for spddump */
if (msg->sadb_msg_type == SADB_X_SPDDUMP) {
switch (msg->sadb_msg_satype) {
case IPSEC_POLICYSCOPE_ANY:
case IPSEC_POLICYSCOPE_GLOBAL:
case IPSEC_POLICYSCOPE_IFNET:
case IPSEC_POLICYSCOPE_PCB:
break;
default:
ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
} else {
switch (msg->sadb_msg_satype) { /* check SA type */
case SADB_SATYPE_UNSPEC:
switch (msg->sadb_msg_type) {
case SADB_GETSPI:
case SADB_UPDATE:
case SADB_ADD:
case SADB_DELETE:
case SADB_GET:
case SADB_ACQUIRE:
case SADB_EXPIRE:
ipseclog((LOG_DEBUG, "%s: must specify satype "
"when msg type=%u.\n", __func__,
msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_IPCOMP:
case SADB_X_SATYPE_TCPSIGNATURE:
switch (msg->sadb_msg_type) {
case SADB_X_SPDADD:
case SADB_X_SPDDELETE:
case SADB_X_SPDGET:
case SADB_X_SPDFLUSH:
case SADB_X_SPDSETIDX:
case SADB_X_SPDUPDATE:
case SADB_X_SPDDELETE2:
ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_RSVP:
case SADB_SATYPE_OSPFV2:
case SADB_SATYPE_RIPV2:
case SADB_SATYPE_MIP:
ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n",
__func__, msg->sadb_msg_satype));
PFKEYSTAT_INC(out_invsatype);
error = EOPNOTSUPP;
goto senderror;
case 1: /* XXX: What does it do? */
if (msg->sadb_msg_type == SADB_X_PROMISC)
break;
/*FALLTHROUGH*/
default:
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_satype));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
}
/* check field of upper layer protocol and address family */
if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL
&& mh.ext[SADB_EXT_ADDRESS_DST] != NULL) {
struct sadb_address *src0, *dst0;
u_int plen;
src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]);
/* check upper layer protocol */
if (src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: upper layer protocol "
"mismatched.\n", __func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
/* check family */
if (PFKEY_ADDR_SADDR(src0)->sa_family !=
PFKEY_ADDR_SADDR(dst0)->sa_family) {
ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
PFKEY_ADDR_SADDR(dst0)->sa_len) {
ipseclog((LOG_DEBUG, "%s: address struct size "
"mismatched.\n", __func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in)) {
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
break;
case AF_INET6:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in6)) {
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: unsupported address family\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EAFNOSUPPORT;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
plen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
plen = sizeof(struct in6_addr) << 3;
break;
default:
plen = 0; /*fool gcc*/
break;
}
/* check max prefix length */
if (src0->sadb_address_prefixlen > plen ||
dst0->sadb_address_prefixlen > plen) {
ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
/*
* prefixlen == 0 is valid: it simply matches all addresses.
*/
}
if (msg->sadb_msg_type >= nitems(key_typesw) ||
key_typesw[msg->sadb_msg_type] == NULL) {
PFKEYSTAT_INC(out_invmsgtype);
error = EINVAL;
goto senderror;
}
return (*key_typesw[msg->sadb_msg_type])(so, m, &mh);
senderror:
msg->sadb_msg_errno = error;
return key_sendup_mbuf(so, m, target);
}
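One of the checks above caps sadb_address_prefixlen at the address width in bits, derived as sizeof(struct in_addr) << 3 or sizeof(struct in6_addr) << 3. A standalone sketch of the same bound, compilable on its own (the helper name is made up for illustration):

/* Standalone sketch of the per-family prefix-length bound in key_parse(). */
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdio.h>

static int
check_prefixlen(int family, unsigned int prefixlen)
{
    unsigned int plen;

    switch (family) {
    case AF_INET:
        plen = sizeof(struct in_addr) << 3;     /* 32 bits */
        break;
    case AF_INET6:
        plen = sizeof(struct in6_addr) << 3;    /* 128 bits */
        break;
    default:
        return (EAFNOSUPPORT);
    }
    /* prefixlen == 0 is allowed: it matches every address. */
    if (prefixlen > plen)
        return (EINVAL);
    return (0);
}

int
main(void)
{
    printf("%d %d %d\n",
        check_prefixlen(AF_INET, 24),       /* 0 */
        check_prefixlen(AF_INET, 33),       /* EINVAL */
        check_prefixlen(AF_INET6, 128));    /* 0 */
    return (0);
}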
static int
key_senderror(struct socket *so, struct mbuf *m, int code)
{
struct sadb_msg *msg;
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
msg = mtod(m, struct sadb_msg *);
msg->sadb_msg_errno = code;
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
/*
* set a pointer to each extension header found in the message buffer.
* m will be freed on error.
* XXX larger-than-MCLBYTES extension?
*/
static int
key_align(struct mbuf *m, struct sadb_msghdr *mhp)
{
struct mbuf *n;
struct sadb_ext *ext;
size_t off, end;
int extlen;
int toff;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
/* initialize */
bzero(mhp, sizeof(*mhp));
mhp->msg = mtod(m, struct sadb_msg *);
mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */
end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
extlen = end; /*just in case extlen is not updated*/
for (off = sizeof(struct sadb_msg); off < end; off += extlen) {
n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
/* set pointer */
switch (ext->sadb_ext_type) {
case SADB_EXT_SA:
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_LIFETIME_CURRENT:
case SADB_EXT_LIFETIME_HARD:
case SADB_EXT_LIFETIME_SOFT:
case SADB_EXT_KEY_AUTH:
case SADB_EXT_KEY_ENCRYPT:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
case SADB_EXT_SENSITIVITY:
case SADB_EXT_PROPOSAL:
case SADB_EXT_SUPPORTED_AUTH:
case SADB_EXT_SUPPORTED_ENCRYPT:
case SADB_EXT_SPIRANGE:
case SADB_X_EXT_POLICY:
case SADB_X_EXT_SA2:
case SADB_X_EXT_NAT_T_TYPE:
case SADB_X_EXT_NAT_T_SPORT:
case SADB_X_EXT_NAT_T_DPORT:
case SADB_X_EXT_NAT_T_OAI:
case SADB_X_EXT_NAT_T_OAR:
case SADB_X_EXT_NAT_T_FRAG:
case SADB_X_EXT_SA_REPLAY:
case SADB_X_EXT_NEW_ADDRESS_SRC:
case SADB_X_EXT_NEW_ADDRESS_DST:
/* duplicate check */
/*
* XXX Can there be duplicate payloads of either
* KEY_AUTH or KEY_ENCRYPT?
*/
if (mhp->ext[ext->sadb_ext_type] != NULL) {
ipseclog((LOG_DEBUG, "%s: duplicate ext_type "
"%u\n", __func__, ext->sadb_ext_type));
m_freem(m);
PFKEYSTAT_INC(out_dupext);
return EINVAL;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n",
__func__, ext->sadb_ext_type));
m_freem(m);
PFKEYSTAT_INC(out_invexttype);
return EINVAL;
}
extlen = PFKEY_UNUNIT64(ext->sadb_ext_len);
if (key_validate_ext(ext, extlen)) {
m_freem(m);
PFKEYSTAT_INC(out_invlen);
return EINVAL;
}
n = m_pulldown(m, off, extlen, &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
mhp->ext[ext->sadb_ext_type] = ext;
mhp->extoff[ext->sadb_ext_type] = off;
mhp->extlen[ext->sadb_ext_type] = extlen;
}
if (off != end) {
m_freem(m);
PFKEYSTAT_INC(out_invlen);
return EINVAL;
}
return 0;
}
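key_align() walks the extensions that follow the base header: each extension carries its length in 8-byte units, an extension type seen twice is rejected, and the walk must land exactly on the advertised end. A standalone sketch of the same walk over a flat buffer, with a simplified two-field header standing in for struct sadb_ext (illustrative names only):

/* Standalone sketch of the extension walk performed by key_align(). */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#define UNUNIT64(x) ((size_t)(x) << 3)  /* 8-byte units -> bytes */
#define MAX_EXT     16

struct ext_hdr {        /* simplified stand-in for struct sadb_ext */
    uint16_t len;       /* length in 8-byte units, header included */
    uint16_t type;
};

static int
walk_exts(const uint8_t *buf, size_t buflen, const uint8_t *ext[MAX_EXT])
{
    struct ext_hdr eh;
    size_t off, extlen;

    memset(ext, 0, sizeof(const uint8_t *) * MAX_EXT);
    for (off = 0; off < buflen; off += extlen) {
        if (buflen - off < sizeof(eh))
            return (EINVAL);
        memcpy(&eh, buf + off, sizeof(eh));
        extlen = UNUNIT64(eh.len);
        if (extlen < sizeof(eh) || extlen > buflen - off)
            return (EINVAL);    /* truncated or overrunning extension */
        if (eh.type >= MAX_EXT || ext[eh.type] != NULL)
            return (EINVAL);    /* unknown or duplicate type */
        ext[eh.type] = buf + off;
    }
    /* Mirrors the "off != end" check in key_align(). */
    return (off == buflen ? 0 : EINVAL);
}

int
main(void)
{
    uint8_t buf[16] = { 0 };
    const uint8_t *ext[MAX_EXT];
    struct ext_hdr e1 = { 1, 1 }, e2 = { 1, 2 };    /* two 8-byte extensions */

    memcpy(buf, &e1, sizeof(e1));
    memcpy(buf + 8, &e2, sizeof(e2));
    printf("walk -> %d\n", walk_exts(buf, sizeof(buf), ext));   /* 0 */
    return (0);
}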
static int
key_validate_ext(const struct sadb_ext *ext, int len)
{
const struct sockaddr *sa;
enum { NONE, ADDR } checktype = NONE;
int baselen = 0;
const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len);
if (len != PFKEY_UNUNIT64(ext->sadb_ext_len))
return EINVAL;
/* if it does not match minimum/maximum length, bail */
if (ext->sadb_ext_type >= nitems(minsize) ||
ext->sadb_ext_type >= nitems(maxsize))
return EINVAL;
if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type])
return EINVAL;
if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type])
return EINVAL;
/* more checks based on sadb_ext_type XXX need more */
switch (ext->sadb_ext_type) {
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
case SADB_X_EXT_NAT_T_OAI:
case SADB_X_EXT_NAT_T_OAR:
case SADB_X_EXT_NEW_ADDRESS_SRC:
case SADB_X_EXT_NEW_ADDRESS_DST:
baselen = PFKEY_ALIGN8(sizeof(struct sadb_address));
checktype = ADDR;
break;
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
if (((const struct sadb_ident *)ext)->sadb_ident_type ==
SADB_X_IDENTTYPE_ADDR) {
baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident));
checktype = ADDR;
} else
checktype = NONE;
break;
default:
checktype = NONE;
break;
}
switch (checktype) {
case NONE:
break;
case ADDR:
sa = (const struct sockaddr *)(((const u_int8_t*)ext)+baselen);
if (len < baselen + sal)
return EINVAL;
if (baselen + PFKEY_ALIGN8(sa->sa_len) != len)
return EINVAL;
break;
}
return 0;
}
void
key_init(void)
{
int i;
for (i = 0; i < IPSEC_DIR_MAX; i++) {
TAILQ_INIT(&V_sptree[i]);
TAILQ_INIT(&V_sptree_ifnet[i]);
}
V_key_lft_zone = uma_zcreate("IPsec SA lft_c",
sizeof(uint64_t) * 2, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZONE_PCPU);
TAILQ_INIT(&V_sahtree);
V_sphashtbl = hashinit(SPHASH_NHASH, M_IPSEC_SP, &V_sphash_mask);
V_savhashtbl = hashinit(SAVHASH_NHASH, M_IPSEC_SA, &V_savhash_mask);
V_sahaddrhashtbl = hashinit(SAHHASH_NHASH, M_IPSEC_SAH,
&V_sahaddrhash_mask);
V_acqaddrhashtbl = hashinit(ACQHASH_NHASH, M_IPSEC_SAQ,
&V_acqaddrhash_mask);
V_acqseqhashtbl = hashinit(ACQHASH_NHASH, M_IPSEC_SAQ,
&V_acqseqhash_mask);
for (i = 0; i <= SADB_SATYPE_MAX; i++)
LIST_INIT(&V_regtree[i]);
LIST_INIT(&V_acqtree);
LIST_INIT(&V_spacqtree);
if (!IS_DEFAULT_VNET(curvnet))
return;
XFORMS_LOCK_INIT();
SPTREE_LOCK_INIT();
REGTREE_LOCK_INIT();
SAHTREE_LOCK_INIT();
ACQ_LOCK_INIT();
SPACQ_LOCK_INIT();
#ifndef IPSEC_DEBUG2
callout_init(&key_timer, 1);
callout_reset(&key_timer, hz, key_timehandler, NULL);
#endif /*IPSEC_DEBUG2*/
/* initialize key statistics */
keystat.getspi_count = 1;
if (bootverbose)
printf("IPsec: Initialized Security Association Processing.\n");
}
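key_init() sizes its lookup tables with hashinit(), which hands back a power-of-two bucket array and reports size - 1 through the mask pointer, so bucket selection is a single bitwise AND. A standalone sketch of that pattern with a toy table (not the kernel API, and hashinit's own rounding rules are not reproduced here):

/* Standalone sketch of the power-of-two hash table + mask pattern. */
#include <sys/types.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

struct toy_hashtbl {
    void    **buckets;
    u_long  mask;       /* nbuckets - 1, nbuckets a power of two */
};

static int
toy_hashinit(struct toy_hashtbl *t, u_long nelements)
{
    u_long n;

    /* Pick a power-of-two bucket count (hashinit(9) does its own rounding). */
    for (n = 1; n < nelements; n <<= 1)
        ;
    t->buckets = calloc(n, sizeof(*t->buckets));
    if (t->buckets == NULL)
        return (-1);
    t->mask = n - 1;
    return (0);
}

static u_long
toy_bucket(const struct toy_hashtbl *t, uint32_t hashval)
{
    /* Because the size is a power of two, "hash & mask" replaces "%". */
    return (hashval & t->mask);
}

int
main(void)
{
    struct toy_hashtbl t;

    if (toy_hashinit(&t, 100) != 0)
        return (1);
    printf("mask=%lu bucket(12345)=%lu\n", t.mask, toy_bucket(&t, 12345));
    free(t.buckets);
    return (0);
}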
#ifdef VIMAGE
void
key_destroy(void)
{
struct secashead_queue sahdrainq;
struct secpolicy_queue drainq;
struct secpolicy *sp, *nextsp;
struct secacq *acq, *nextacq;
struct secspacq *spacq, *nextspacq;
struct secashead *sah;
struct secasvar *sav;
struct secreg *reg;
int i;
/*
* XXX: can we just call free() on each object instead of
* walking the lists the safe way and releasing references?
*/
TAILQ_INIT(&drainq);
SPTREE_WLOCK();
for (i = 0; i < IPSEC_DIR_MAX; i++) {
TAILQ_CONCAT(&drainq, &V_sptree[i], chain);
TAILQ_CONCAT(&drainq, &V_sptree_ifnet[i], chain);
}
for (i = 0; i < V_sphash_mask + 1; i++)
LIST_INIT(&V_sphashtbl[i]);
SPTREE_WUNLOCK();
sp = TAILQ_FIRST(&drainq);
while (sp != NULL) {
nextsp = TAILQ_NEXT(sp, chain);
key_freesp(&sp);
sp = nextsp;
}
TAILQ_INIT(&sahdrainq);
SAHTREE_WLOCK();
TAILQ_CONCAT(&sahdrainq, &V_sahtree, chain);
for (i = 0; i < V_savhash_mask + 1; i++)
LIST_INIT(&V_savhashtbl[i]);
for (i = 0; i < V_sahaddrhash_mask + 1; i++)
LIST_INIT(&V_sahaddrhashtbl[i]);
TAILQ_FOREACH(sah, &sahdrainq, chain) {
sah->state = SADB_SASTATE_DEAD;
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
sav->state = SADB_SASTATE_DEAD;
}
}
SAHTREE_WUNLOCK();
key_freesah_flushed(&sahdrainq);
hashdestroy(V_sphashtbl, M_IPSEC_SP, V_sphash_mask);
hashdestroy(V_savhashtbl, M_IPSEC_SA, V_savhash_mask);
hashdestroy(V_sahaddrhashtbl, M_IPSEC_SAH, V_sahaddrhash_mask);
REGTREE_LOCK();
for (i = 0; i <= SADB_SATYPE_MAX; i++) {
LIST_FOREACH(reg, &V_regtree[i], chain) {
if (__LIST_CHAINED(reg)) {
LIST_REMOVE(reg, chain);
free(reg, M_IPSEC_SAR);
break;
}
}
}
REGTREE_UNLOCK();
ACQ_LOCK();
acq = LIST_FIRST(&V_acqtree);
while (acq != NULL) {
nextacq = LIST_NEXT(acq, chain);
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
acq = nextacq;
}
for (i = 0; i < V_acqaddrhash_mask + 1; i++)
LIST_INIT(&V_acqaddrhashtbl[i]);
for (i = 0; i < V_acqseqhash_mask + 1; i++)
LIST_INIT(&V_acqseqhashtbl[i]);
ACQ_UNLOCK();
SPACQ_LOCK();
for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL;
spacq = nextspacq) {
nextspacq = LIST_NEXT(spacq, chain);
if (__LIST_CHAINED(spacq)) {
LIST_REMOVE(spacq, chain);
free(spacq, M_IPSEC_SAQ);
}
}
SPACQ_UNLOCK();
hashdestroy(V_acqaddrhashtbl, M_IPSEC_SAQ, V_acqaddrhash_mask);
hashdestroy(V_acqseqhashtbl, M_IPSEC_SAQ, V_acqseqhash_mask);
uma_zdestroy(V_key_lft_zone);
if (!IS_DEFAULT_VNET(curvnet))
return;
#ifndef IPSEC_DEBUG2
callout_drain(&key_timer);
#endif
XFORMS_LOCK_DESTROY();
SPTREE_LOCK_DESTROY();
REGTREE_LOCK_DESTROY();
SAHTREE_LOCK_DESTROY();
ACQ_LOCK_DESTROY();
SPACQ_LOCK_DESTROY();
}
#endif
/* record data transfer on SA, and update timestamps */
void
key_sa_recordxfer(struct secasvar *sav, struct mbuf *m)
{
IPSEC_ASSERT(sav != NULL, ("Null secasvar"));
IPSEC_ASSERT(m != NULL, ("Null mbuf"));
/*
* XXX Currently the accounted byte count differs between
* inbound and outbound processing.
*/
counter_u64_add(sav->lft_c_bytes, m->m_pkthdr.len);
/*
* We use the number of packets as the unit of
* allocations. We increment the variable
* whenever {esp,ah}_{in,out}put is called.
*/
counter_u64_add(sav->lft_c_allocations, 1);
/*
* NOTE: We record CURRENT usetime by using wall clock,
* in seconds. HARD and SOFT lifetime are measured by the time
* difference (again in seconds) from usetime.
*
* usetime
* v expire expire
* -----+-----+--------+---> t
* <--------------> HARD
* <-----> SOFT
*/
if (sav->firstused == 0)
sav->firstused = time_second;
}
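The comment above describes how firstused anchors the use-based lifetimes: SOFT and HARD are offsets, in seconds, from the moment the SA is first used. A standalone sketch of that bookkeeping with illustrative field names (not the kernel structures):

/* Standalone sketch of use-lifetime accounting anchored at first use. */
#include <stdint.h>
#include <time.h>
#include <stdio.h>

struct toy_sa {
    time_t   firstused;     /* wall-clock second of first use, 0 = never */
    uint64_t soft_usetime;  /* seconds after firstused until SOFT expire */
    uint64_t hard_usetime;  /* seconds after firstused until HARD expire */
};

enum sa_state { SA_FRESH, SA_SOFT_EXPIRED, SA_HARD_EXPIRED };

static void
toy_record_use(struct toy_sa *sa, time_t now)
{
    if (sa->firstused == 0)
        sa->firstused = now;    /* mirrors "sav->firstused = time_second" */
}

static enum sa_state
toy_check_lifetime(const struct toy_sa *sa, time_t now)
{
    if (sa->firstused == 0)
        return (SA_FRESH);
    if (sa->hard_usetime != 0 &&
        (uint64_t)(now - sa->firstused) >= sa->hard_usetime)
        return (SA_HARD_EXPIRED);
    if (sa->soft_usetime != 0 &&
        (uint64_t)(now - sa->firstused) >= sa->soft_usetime)
        return (SA_SOFT_EXPIRED);
    return (SA_FRESH);
}

int
main(void)
{
    struct toy_sa sa = { .soft_usetime = 60, .hard_usetime = 120 };

    toy_record_use(&sa, 1000);
    printf("%d %d %d\n",
        toy_check_lifetime(&sa, 1030),      /* SA_FRESH */
        toy_check_lifetime(&sa, 1070),      /* SA_SOFT_EXPIRED */
        toy_check_lifetime(&sa, 1200));     /* SA_HARD_EXPIRED */
    return (0);
}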
/*
* Take one of the kernel's security keys and convert it into a PF_KEY
* structure within an mbuf, suitable for sending up to a waiting
* application in user land.
*
* IN:
* src: A pointer to a kernel security key.
* exttype: Which type of key this is. Refer to the PF_KEY data structures.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setkey(struct seckey *src, uint16_t exttype)
{
struct mbuf *m;
struct sadb_key *p;
int len;
if (src == NULL)
return NULL;
len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return NULL;
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_key *);
bzero(p, len);
p->sadb_key_len = PFKEY_UNIT64(len);
p->sadb_key_exttype = exttype;
p->sadb_key_bits = src->bits;
bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src));
return m;
}
/*
* Take one of the kernel's lifetime data structures and convert it
* into a PF_KEY structure within an mbuf, suitable for sending up to
* a waiting application in user land.
*
* IN:
* src: A pointer to a kernel lifetime structure.
* exttype: Which type of lifetime this is. Refer to the PF_KEY
* data structures for more information.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setlifetime(struct seclifetime *src, uint16_t exttype)
{
struct mbuf *m = NULL;
struct sadb_lifetime *p;
int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime));
if (src == NULL)
return NULL;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return m;
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_lifetime *);
bzero(p, len);
p->sadb_lifetime_len = PFKEY_UNIT64(len);
p->sadb_lifetime_exttype = exttype;
p->sadb_lifetime_allocations = src->allocations;
p->sadb_lifetime_bytes = src->bytes;
p->sadb_lifetime_addtime = src->addtime;
p->sadb_lifetime_usetime = src->usetime;
return m;
}
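key_setkey() and key_setlifetime() follow the same recipe: round the extension size up to an 8-byte boundary, record the length in 8-byte units, zero the buffer (padding included), then fill in the fields. A standalone sketch of that encoding; the macros below only mimic what PFKEY_ALIGN8/PFKEY_UNIT64 are used for here, and the struct is a simplified stand-in:

/* Standalone sketch of PF_KEY extension length/alignment encoding. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define ALIGN8(a)   (((a) + 7) & ~(size_t)7)    /* round up to 8 bytes */
#define UNIT64(a)   ((a) >> 3)                  /* bytes -> 8-byte units */

struct toy_key_ext {        /* simplified stand-in for struct sadb_key */
    uint16_t len;           /* total length in 8-byte units */
    uint16_t exttype;
    uint16_t bits;          /* key length in bits */
    uint16_t reserved;
    /* key bytes follow, padded out to an 8-byte boundary */
};

static size_t
build_key_ext(uint8_t *buf, size_t buflen, uint16_t exttype,
    const uint8_t *key, size_t keybytes)
{
    size_t len = ALIGN8(sizeof(struct toy_key_ext) + keybytes);
    struct toy_key_ext ext;

    if (len > buflen)
        return (0);
    memset(buf, 0, len);            /* zeroes the trailing padding too */
    ext.len = (uint16_t)UNIT64(len);
    ext.exttype = exttype;
    ext.bits = (uint16_t)(keybytes * 8);
    ext.reserved = 0;
    memcpy(buf, &ext, sizeof(ext));
    memcpy(buf + sizeof(ext), key, keybytes);
    return (len);
}

int
main(void)
{
    uint8_t key[20] = { 0 }, buf[64];
    struct toy_key_ext hdr;
    size_t len;

    /* 8-byte header + 20 key bytes = 28, rounded up to 32 bytes = 4 units. */
    len = build_key_ext(buf, sizeof(buf), 8 /* illustrative exttype */, key,
        sizeof(key));
    memcpy(&hdr, buf, sizeof(hdr));
    printf("len=%zu units=%u bits=%u\n", len, hdr.len, hdr.bits);
    return (0);
}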
const struct enc_xform *
enc_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_ealgs); i++)
if (alg == supported_ealgs[i].sadb_alg)
return (supported_ealgs[i].xform);
return (NULL);
}
const struct auth_hash *
auth_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_aalgs); i++)
if (alg == supported_aalgs[i].sadb_alg)
return (supported_aalgs[i].xform);
return (NULL);
}
const struct comp_algo *
comp_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_calgs); i++)
if (alg == supported_calgs[i].sadb_alg)
return (supported_calgs[i].xform);
return (NULL);
}
/*
* Register a transform.
*/
static int
xform_register(struct xformsw* xsp)
{
struct xformsw *entry;
XFORMS_LOCK();
LIST_FOREACH(entry, &xforms, chain) {
if (entry->xf_type == xsp->xf_type) {
XFORMS_UNLOCK();
return (EEXIST);
}
}
LIST_INSERT_HEAD(&xforms, xsp, chain);
XFORMS_UNLOCK();
return (0);
}
void
xform_attach(void *data)
{
struct xformsw *xsp = (struct xformsw *)data;
if (xform_register(xsp) != 0)
printf("%s: failed to register %s xform\n", __func__,
xsp->xf_name);
}
void
xform_detach(void *data)
{
struct xformsw *xsp = (struct xformsw *)data;
XFORMS_LOCK();
LIST_REMOVE(xsp, chain);
XFORMS_UNLOCK();
/* Delete all SAs related to this xform. */
key_delete_xform(xsp);
}
/*
* Initialize transform support in an sav.
*/
static int
xform_init(struct secasvar *sav, u_short xftype)
{
struct xformsw *entry;
int ret;
IPSEC_ASSERT(sav->tdb_xform == NULL,
("tdb_xform is already initialized"));
ret = EINVAL;
XFORMS_LOCK();
LIST_FOREACH(entry, &xforms, chain) {
if (entry->xf_type == xftype) {
ret = (*entry->xf_init)(sav, entry);
break;
}
}
XFORMS_UNLOCK();
return (ret);
}
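xform_register(), xform_attach(), and xform_init() form a small registry: handlers live on a linked list keyed by xf_type, a duplicate key fails with EEXIST, and initialization walks the list for a match. A standalone sketch of the same pattern with <sys/queue.h> (locking omitted; the names are illustrative, not the kernel's):

/* Standalone sketch of the register/lookup pattern used for xforms. */
#include <sys/queue.h>
#include <errno.h>
#include <stdio.h>

struct toy_xform {
    int type;
    const char *name;
    LIST_ENTRY(toy_xform) chain;
};

static LIST_HEAD(, toy_xform) toy_xforms = LIST_HEAD_INITIALIZER(toy_xforms);

static int
toy_register(struct toy_xform *x)
{
    struct toy_xform *e;

    LIST_FOREACH(e, &toy_xforms, chain)
        if (e->type == x->type)
            return (EEXIST);        /* already registered */
    LIST_INSERT_HEAD(&toy_xforms, x, chain);
    return (0);
}

static struct toy_xform *
toy_lookup(int type)
{
    struct toy_xform *e;

    LIST_FOREACH(e, &toy_xforms, chain)
        if (e->type == type)
            return (e);
    return (NULL);
}

int
main(void)
{
    static struct toy_xform ah = { .type = 2, .name = "toy AH" };
    static struct toy_xform esp = { .type = 3, .name = "toy ESP" };

    printf("%d %d %d\n", toy_register(&ah), toy_register(&esp),
        toy_register(&ah));                 /* 0 0 EEXIST */
    printf("%s\n", toy_lookup(3)->name);    /* toy ESP */
    return (0);
}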
Index: head/sys/netipsec/xform_ah.c
===================================================================
--- head/sys/netipsec/xform_ah.c (revision 327172)
+++ head/sys/netipsec/xform_ah.c (revision 327173)
@@ -1,1154 +1,1149 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist.
*
* Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 1999 Niklas Hallqvist.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
/*
* Return header size in bytes. The old protocol did not support
* the replay counter; the new protocol always includes the counter.
*/
#define HDRSIZE(sav) \
(((sav)->flags & SADB_X_EXT_OLD) ? \
sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t))
/*
* Return authenticator size in bytes, based on a field in the
* algorithm descriptor.
*/
#define AUTHSIZE(sav) ((sav->flags & SADB_X_EXT_OLD) ? 16 : \
xform_ah_authsize((sav)->tdb_authalgxform))
VNET_DEFINE(int, ah_enable) = 1; /* control flow of packets with AH */
VNET_DEFINE(int, ah_cleartos) = 1; /* clear ip_tos when doing AH calc */
VNET_PCPUSTAT_DEFINE(struct ahstat, ahstat);
VNET_PCPUSTAT_SYSINIT(ahstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(ahstat);
#endif /* VIMAGE */
#ifdef INET
SYSCTL_DECL(_net_inet_ah);
SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_enable), 0, "");
SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_cleartos,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_ah, IPSECCTL_STATS, stats, struct ahstat,
ahstat, "AH statistics (struct ahstat, netipsec/ah_var.h)");
#endif
static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */
static int ah_input_cb(struct cryptop*);
static int ah_output_cb(struct cryptop*);
int
xform_ah_authsize(const struct auth_hash *esph)
{
int alen;
if (esph == NULL)
return 0;
switch (esph->type) {
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
alen = esph->hashsize / 2; /* RFC4868 2.3 */
break;
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
alen = esph->hashsize;
break;
default:
alen = AH_HMAC_HASHLEN;
break;
}
return alen;
}
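xform_ah_authsize() encodes the ICV sizing rules: the HMAC-SHA2 family truncates to half the hash output per RFC 4868, AES-GMAC keeps its full 16-byte tag, and everything else falls back to the traditional 96-bit truncation. A standalone sketch of the same mapping with the byte counts spelled out (illustrative names, not the kernel enum):

/* Standalone sketch of AH ICV (authenticator) sizing. */
#include <stdio.h>

enum toy_auth {
    TOY_HMAC_SHA1,      /* legacy 96-bit truncation */
    TOY_HMAC_SHA2_256,  /* RFC 4868: half of 32-byte output */
    TOY_HMAC_SHA2_384,  /* RFC 4868: half of 48-byte output */
    TOY_HMAC_SHA2_512,  /* RFC 4868: half of 64-byte output */
    TOY_AES_GMAC,       /* full 16-byte tag */
};

static int
toy_icv_len(enum toy_auth alg)
{
    switch (alg) {
    case TOY_HMAC_SHA2_256: return (32 / 2);    /* 16 bytes */
    case TOY_HMAC_SHA2_384: return (48 / 2);    /* 24 bytes */
    case TOY_HMAC_SHA2_512: return (64 / 2);    /* 32 bytes */
    case TOY_AES_GMAC:      return (16);
    default:                return (12);        /* 96 bits */
    }
}

int
main(void)
{
    printf("sha2-256 icv=%d bytes, sha1 icv=%d bytes\n",
        toy_icv_len(TOY_HMAC_SHA2_256), toy_icv_len(TOY_HMAC_SHA1));
    return (0);
}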
size_t
ah_hdrsiz(struct secasvar *sav)
{
size_t size;
if (sav != NULL) {
int authsize;
IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform"));
/*XXX not right for null algorithm--does it matter??*/
authsize = AUTHSIZE(sav);
size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav);
} else {
/* default guess */
size = sizeof (struct ah) + sizeof (u_int32_t) + 16;
}
return size;
}
/*
* NB: public for use by esp_init.
*/
int
ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria)
{
const struct auth_hash *thash;
int keylen;
thash = auth_algorithm_lookup(sav->alg_auth);
if (thash == NULL) {
DPRINTF(("%s: unsupported authentication algorithm %u\n",
__func__, sav->alg_auth));
return EINVAL;
}
/*
* Verify the replay state block allocation is consistent with
* the protocol type. We check here so we can make assumptions
* later during protocol processing.
*/
/* NB: replay state is setup elsewhere (sigh) */
if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) {
DPRINTF(("%s: replay state block inconsistency, "
"%s algorithm %s replay state\n", __func__,
(sav->flags & SADB_X_EXT_OLD) ? "old" : "new",
sav->replay == NULL ? "without" : "with"));
return EINVAL;
}
if (sav->key_auth == NULL) {
DPRINTF(("%s: no authentication key for %s algorithm\n",
__func__, thash->name));
return EINVAL;
}
keylen = _KEYLEN(sav->key_auth);
if (keylen > thash->keysize && thash->keysize != 0) {
DPRINTF(("%s: invalid keylength %d, algorithm %s requires "
"keysize less than %d\n", __func__,
keylen, thash->name, thash->keysize));
return EINVAL;
}
sav->tdb_xform = xsp;
sav->tdb_authalgxform = thash;
/* Initialize crypto session. */
bzero(cria, sizeof (*cria));
cria->cri_alg = sav->tdb_authalgxform->type;
cria->cri_klen = _KEYBITS(sav->key_auth);
cria->cri_key = sav->key_auth->key_data;
cria->cri_mlen = AUTHSIZE(sav);
return 0;
}
/*
* ah_init() is called when an SPI is being set up.
*/
static int
ah_init(struct secasvar *sav, struct xformsw *xsp)
{
struct cryptoini cria;
int error;
error = ah_init0(sav, xsp, &cria);
return error ? error :
crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support);
}
/*
* Paranoia.
*
* NB: public for use by esp_zeroize (XXX).
*/
int
ah_zeroize(struct secasvar *sav)
{
int err;
if (sav->key_auth)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
sav->tdb_authalgxform = NULL;
sav->tdb_xform = NULL;
return err;
}
/*
* Massage IPv4/IPv6 headers for AH processing.
*/
static int
ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out)
{
struct mbuf *m = *m0;
unsigned char *ptr;
int off, count;
#ifdef INET
struct ip *ip;
#endif /* INET */
#ifdef INET6
struct ip6_ext *ip6e;
struct ip6_hdr ip6;
int alloc, len, ad;
#endif /* INET6 */
switch (proto) {
#ifdef INET
case AF_INET:
/*
* This is the least painful way of dealing with IPv4 header
* and option processing -- just make sure they're in
* contiguous memory.
*/
*m0 = m = m_pullup(m, skip);
if (m == NULL) {
DPRINTF(("%s: m_pullup failed\n", __func__));
return ENOBUFS;
}
/* Fix the IP header */
ip = mtod(m, struct ip *);
if (V_ah_cleartos)
ip->ip_tos = 0;
ip->ip_ttl = 0;
ip->ip_sum = 0;
if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK)
ip->ip_off &= htons(IP_DF);
else
ip->ip_off = htons(0);
ptr = mtod(m, unsigned char *) + sizeof(struct ip);
/* IPv4 option processing */
for (off = sizeof(struct ip); off < skip;) {
if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP ||
off + 1 < skip)
;
else {
DPRINTF(("%s: illegal IPv4 option length for "
"option %d\n", __func__, ptr[off]));
m_freem(m);
return EINVAL;
}
switch (ptr[off]) {
case IPOPT_EOL:
off = skip; /* End the loop. */
break;
case IPOPT_NOP:
off++;
break;
case IPOPT_SECURITY: /* 0x82 */
case 0x85: /* Extended security. */
case 0x86: /* Commercial security. */
case 0x94: /* Router alert */
case 0x95: /* RFC1770 */
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
off += ptr[off + 1];
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/*
* On output, if we have either of the
* source routing options, we should
* swap the destination address of the
* IP header with the last address
* specified in the option, as that is
* what the destination's IP header
* will look like.
*/
if (out)
bcopy(ptr + off + ptr[off + 1] -
sizeof(struct in_addr),
&(ip->ip_dst), sizeof(struct in_addr));
/* Fall through */
default:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/* Zeroize all other options. */
count = ptr[off + 1];
bcopy(ipseczeroes, ptr, count);
off += count;
break;
}
/* Sanity check. */
if (off > skip) {
DPRINTF(("%s: malformed IPv4 options header\n",
__func__));
m_freem(m);
return EINVAL;
}
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6: /* Ugly... */
/* Copy and "cook" the IPv6 header. */
m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6);
/* We don't do IPv6 Jumbograms. */
if (ip6.ip6_plen == 0) {
DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__));
m_freem(m);
return EMSGSIZE;
}
ip6.ip6_flow = 0;
ip6.ip6_hlim = 0;
ip6.ip6_vfc &= ~IPV6_VERSION_MASK;
ip6.ip6_vfc |= IPV6_VERSION;
/* Scoped address handling. */
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src))
ip6.ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst))
ip6.ip6_dst.s6_addr16[1] = 0;
/* Done with IPv6 header. */
m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6);
/* Let's deal with the remaining headers (if any). */
if (skip - sizeof(struct ip6_hdr) > 0) {
if (m->m_len <= skip) {
ptr = (unsigned char *) malloc(
skip - sizeof(struct ip6_hdr),
M_XDATA, M_NOWAIT);
if (ptr == NULL) {
DPRINTF(("%s: failed to allocate memory"
"for IPv6 headers\n",__func__));
m_freem(m);
return ENOBUFS;
}
/*
* Copy all the protocol headers after
* the IPv6 header.
*/
m_copydata(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
alloc = 1;
} else {
/* No need to allocate memory. */
ptr = mtod(m, unsigned char *) +
sizeof(struct ip6_hdr);
alloc = 0;
}
} else
break;
off = ip6.ip6_nxt & 0xff; /* Next header type. */
for (len = 0; len < skip - sizeof(struct ip6_hdr);)
switch (off) {
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
ip6e = (struct ip6_ext *) (ptr + len);
/*
* Process the mutable/immutable
* options -- borrows heavily from the
* KAME code.
*/
for (count = len + sizeof(struct ip6_ext);
count < len + ((ip6e->ip6e_len + 1) << 3);) {
if (ptr[count] == IP6OPT_PAD1) {
count++;
continue; /* Skip padding. */
}
/* Sanity check. */
if (count > len +
((ip6e->ip6e_len + 1) << 3)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
free(ptr, M_XDATA);
return EINVAL;
}
ad = ptr[count + 1];
/* If mutable option, zeroize. */
if (ptr[count] & IP6OPT_MUTABLE)
bcopy(ipseczeroes, ptr + count,
ptr[count + 1]);
count += ad;
/* Sanity check. */
if (count >
skip - sizeof(struct ip6_hdr)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
free(ptr, M_XDATA);
return EINVAL;
}
}
/* Advance. */
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
case IPPROTO_ROUTING:
/*
* Always include routing headers in
* computation.
*/
ip6e = (struct ip6_ext *) (ptr + len);
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
default:
DPRINTF(("%s: unexpected IPv6 header type %d",
__func__, off));
if (alloc)
free(ptr, M_XDATA);
m_freem(m);
return EINVAL;
}
/* Copyback and free, if we allocated. */
if (alloc) {
m_copyback(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
free(ptr, M_XDATA);
}
break;
#endif /* INET6 */
}
return 0;
}
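Mutable IP header fields must not be covered by the AH ICV, which is why ah_massage_headers() zeroes TTL, checksum, (optionally) TOS, and the fragment bits before the packet is handed to the crypto layer; RFC 4302 defines which IPv4 fields count as mutable. A standalone sketch of the IPv4 part, operating on a detached header copy (illustrative helper, not the kernel routine):

/* Standalone sketch: zero mutable IPv4 fields before computing an AH ICV. */
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <string.h>
#include <stdio.h>

static void
toy_prepare_ipv4_for_icv(struct ip *ip, int clear_tos)
{
    if (clear_tos)
        ip->ip_tos = 0;     /* DSCP/ECN are mutable */
    ip->ip_ttl = 0;         /* decremented per hop */
    ip->ip_sum = 0;         /* recomputed per hop */
    /* Flags and fragment offset are mutable (legacy KPDK modes keep DF). */
    ip->ip_off = htons(0);
}

int
main(void)
{
    struct ip ip;

    memset(&ip, 0xff, sizeof(ip));
    toy_prepare_ipv4_for_icv(&ip, 1);
    printf("tos=%u ttl=%u sum=%u off=0x%04x\n",
        ip.ip_tos, ip.ip_ttl, ip.ip_sum, (unsigned)ntohs(ip.ip_off));
    return (0);
}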
/*
* ah_input() gets called to verify that an input packet
* passes authentication.
*/
static int
ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
const struct auth_hash *ahx;
struct cryptodesc *crda;
struct cryptop *crp;
struct xform_data *xd;
struct newah *ah;
uint64_t cryptoid;
int hl, rplen, authsize, error;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key"));
IPSEC_ASSERT(sav->tdb_authalgxform != NULL,
("null authentication xform"));
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen);
if (ah == NULL) {
DPRINTF(("ah_input: cannot pullup header\n"));
AHSTAT_INC(ahs_hdrops); /*XXX*/
error = ENOBUFS;
goto bad;
}
/* Check replay window, if applicable. */
SECASVAR_LOCK(sav);
if (sav->replay != NULL && sav->replay->wsize != 0 &&
ipsec_chkreplay(ntohl(ah->ah_seq), sav) == 0) {
SECASVAR_UNLOCK(sav);
AHSTAT_INC(ahs_replay);
DPRINTF(("%s: packet replay failure: %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
error = EACCES;
goto bad;
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Verify AH header length. */
hl = ah->ah_len * sizeof (u_int32_t);
ahx = sav->tdb_authalgxform;
authsize = AUTHSIZE(sav);
if (hl != authsize + rplen - sizeof (struct ah)) {
DPRINTF(("%s: bad authenticator length %u (expecting %lu)"
" for packet in SA %s/%08lx\n", __func__, hl,
(u_long) (authsize + rplen - sizeof (struct ah)),
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_badauthl);
error = EACCES;
goto bad;
}
AHSTAT_ADD(ahs_ibytes, m->m_pkthdr.len - skip - hl);
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptor\n",
__func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null crypto descriptor"));
crda->crd_skip = 0;
crda->crd_len = m->m_pkthdr.len;
crda->crd_inject = skip + rplen;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_klen = _KEYBITS(sav->key_auth);
crda->crd_key = sav->key_auth->key_data;
/* Allocate IPsec-specific opaque crypto info. */
xd = malloc(sizeof(*xd) + skip + rplen + authsize, M_XDATA,
M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
AHSTAT_INC(ahs_crypto);
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
/*
* Save the authenticator, the skipped portion of the packet,
* and the AH header.
*/
m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(xd + 1));
/* Zeroize the authenticator on the packet. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 0);
if (error != 0) {
/* NB: mbuf is free'd by ah_massage_headers */
AHSTAT_INC(ahs_hdrops);
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
return (error);
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_input_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback. */
xd->sav = sav;
xd->nxt = ah->ah_nxt;
xd->protoff = protoff;
xd->skip = skip;
xd->cryptoid = cryptoid;
return (crypto_dispatch(crp));
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
/*
* AH input callback from the crypto driver.
*/
static int
ah_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
unsigned char calc[AH_ALEN_MAX];
- const struct auth_hash *ahx;
struct mbuf *m;
- struct cryptodesc *crd;
struct xform_data *xd;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t ptr;
uint64_t cryptoid;
int authsize, rplen, error, skip, protoff;
uint8_t nxt;
- crd = crp->crp_desc;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
nxt = xd->nxt;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
-
- ahx = sav->tdb_authalgxform;
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
AHSTAT_INC(ahs_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
} else {
AHSTAT_INC(ahs_hist[sav->alg_auth]);
crypto_freereq(crp); /* No longer needed. */
crp = NULL;
}
/* Shouldn't happen... */
if (m == NULL) {
AHSTAT_INC(ahs_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
/* Figure out header size. */
rplen = HDRSIZE(sav);
authsize = AUTHSIZE(sav);
/* Copy authenticator off the packet. */
m_copydata(m, skip + rplen, authsize, calc);
/* Verify authenticator. */
ptr = (caddr_t) (xd + 1);
if (timingsafe_bcmp(ptr + skip + rplen, calc, authsize)) {
DPRINTF(("%s: authentication hash mismatch for packet "
"in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_badauth);
error = EACCES;
goto bad;
}
/* Fix the Next Protocol field. */
((uint8_t *) ptr)[protoff] = nxt;
/* Copyback the saved (uncooked) network headers. */
m_copyback(m, 0, skip, ptr);
free(xd, M_XDATA), xd = NULL; /* No longer needed */
/*
* Header is now authenticated.
*/
m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newah, ah_seq),
sizeof (seq), (caddr_t) &seq);
SECASVAR_LOCK(sav);
if (ipsec_updatereplay(ntohl(seq), sav)) {
SECASVAR_UNLOCK(sav);
AHSTAT_INC(ahs_replay);
error = EACCES;
goto bad;
}
SECASVAR_UNLOCK(sav);
}
/*
* Remove the AH header and authenticator from the mbuf.
*/
error = m_striphdr(m, skip, rplen + authsize);
if (error) {
DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops);
goto bad;
}
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
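ah_input_cb() compares the received authenticator against the recomputed one with timingsafe_bcmp() so the comparison time does not reveal how many leading bytes matched. A standalone sketch of the usual XOR-accumulate idiom behind such a comparison (not the libc implementation):

/* Standalone sketch of a constant-time authenticator comparison. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int
toy_timingsafe_bcmp(const void *a, const void *b, size_t len)
{
    const uint8_t *pa = a, *pb = b;
    uint8_t diff = 0;
    size_t i;

    /* Touch every byte and accumulate differences without branching. */
    for (i = 0; i < len; i++)
        diff |= pa[i] ^ pb[i];
    return (diff != 0);
}

int
main(void)
{
    uint8_t icv1[12] = { 1, 2, 3 }, icv2[12] = { 1, 2, 3 }, icv3[12] = { 9 };

    printf("%d %d\n",
        toy_timingsafe_bcmp(icv1, icv2, sizeof(icv1)),  /* 0: equal */
        toy_timingsafe_bcmp(icv1, icv3, sizeof(icv1))); /* 1: differ */
    return (0);
}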
/*
* AH output routine, called by ipsec[46]_perform_request().
*/
static int
ah_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
const struct auth_hash *ahx;
struct cryptodesc *crda;
struct xform_data *xd;
struct mbuf *mi;
struct cryptop *crp;
struct newah *ah;
uint64_t cryptoid;
uint16_t iplen;
int error, rplen, authsize, maxpacketsize, roff;
uint8_t prot;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ahx = sav->tdb_authalgxform;
IPSEC_ASSERT(ahx != NULL, ("null authentication xform"));
AHSTAT_INC(ahs_output);
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol family %u, "
"SA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_nopf);
error = EPFNOSUPPORT;
goto bad;
}
authsize = AUTHSIZE(sav);
if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
rplen + authsize + m->m_pkthdr.len, maxpacketsize));
AHSTAT_INC(ahs_toobig);
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
AHSTAT_ADD(ahs_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops);
error = ENOBUFS;
goto bad;
}
/* Inject AH header. */
mi = m_makespace(m, skip, rplen + authsize, &roff);
if (mi == NULL) {
DPRINTF(("%s: failed to inject %u byte AH header for SA "
"%s/%08lx\n", __func__,
rplen + authsize,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops); /*XXX differs from openbsd */
error = ENOBUFS;
goto bad;
}
/*
* The AH header is guaranteed by m_makespace() to be in
* contiguous memory, at roff bytes offset into the returned mbuf.
*/
ah = (struct newah *)(mtod(mi, caddr_t) + roff);
/* Initialize the AH header. */
m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt);
ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t);
ah->ah_reserve = 0;
ah->ah_spi = sav->spi;
/* Zeroize authenticator. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* Insert packet replay counter, as requested. */
SECASVAR_LOCK(sav);
if (sav->replay) {
if (sav->replay->count == ~0 &&
(sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_wrap);
error = EACCES;
goto bad;
}
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
ah->ah_seq = htonl(sav->replay->count);
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
crda = crp->crp_desc;
crda->crd_skip = 0;
crda->crd_inject = skip + rplen;
crda->crd_len = m->m_pkthdr.len;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_key = sav->key_auth->key_data;
crda->crd_klen = _KEYBITS(sav->key_auth);
/* Allocate IPsec-specific opaque crypto info. */
xd = malloc(sizeof(struct xform_data) + skip, M_XDATA,
M_NOWAIT | M_ZERO);
if (xd == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
/* Save the skipped portion of the packet. */
m_copydata(m, 0, skip, (caddr_t) (xd + 1));
/*
* Fix IP header length on the header used for
* authentication. We don't need to fix the original
* header length as it will be fixed by our caller.
*/
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
bcopy(((caddr_t)(xd + 1)) +
offsetof(struct ip, ip_len),
(caddr_t) &iplen, sizeof(u_int16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip, ip_len),
sizeof(u_int16_t), (caddr_t) &iplen);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
bcopy(((caddr_t)(xd + 1)) +
offsetof(struct ip6_hdr, ip6_plen),
(caddr_t) &iplen, sizeof(uint16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip6_hdr, ip6_plen),
sizeof(uint16_t), (caddr_t) &iplen);
break;
#endif /* INET6 */
}
/* Fix the Next Header field in saved header. */
((uint8_t *) (xd + 1))[protoff] = IPPROTO_AH;
/* Update the Next Protocol field in the IP header. */
prot = IPPROTO_AH;
m_copyback(m, protoff, sizeof(uint8_t), (caddr_t) &prot);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 1);
if (error != 0) {
m = NULL; /* mbuf was free'd by ah_massage_headers. */
free(xd, M_XDATA);
crypto_freereq(crp);
goto bad;
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_output_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback. */
xd->sp = sp;
xd->sav = sav;
xd->skip = skip;
xd->idx = idx;
xd->cryptoid = cryptoid;
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
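ah_output() refuses to let a 32-bit sequence number wrap: once the counter hits its maximum the packet is rejected unless SADB_X_EXT_CYCSEQ explicitly allows cycling. A standalone sketch of that guard with illustrative names:

/* Standalone sketch of the outbound sequence-number anti-wrap check. */
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

struct toy_replay {
    uint32_t count;     /* last sequence number sent */
};

static int
toy_next_seq(struct toy_replay *rp, bool allow_wrap, uint32_t *seq)
{
    if (rp->count == UINT32_MAX && !allow_wrap)
        return (EACCES);    /* SA must be replaced, not wrapped */
    rp->count++;            /* wraps to 0 only when cycling is allowed */
    *seq = rp->count;
    return (0);
}

int
main(void)
{
    struct toy_replay rp = { .count = UINT32_MAX - 1 };
    uint32_t seq;

    printf("%d", toy_next_seq(&rp, false, &seq));       /* 0, seq = UINT32_MAX */
    printf(" %d\n", toy_next_seq(&rp, false, &seq));    /* EACCES */
    return (0);
}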
/*
* AH output callback from the crypto driver.
*/
static int
ah_output_cb(struct cryptop *crp)
{
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
caddr_t ptr;
u_int idx;
int skip, error;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sp = xd->sp;
sav = xd->sav;
skip = xd->skip;
idx = xd->idx;
cryptoid = xd->cryptoid;
ptr = (caddr_t) (xd + 1);
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
AHSTAT_INC(ahs_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
m_freem(m);
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
AHSTAT_INC(ahs_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
/*
* Copy original headers (with the new protocol number) back
* in place.
*/
m_copyback(m, 0, skip, ptr);
free(xd, M_XDATA);
crypto_freereq(crp);
AHSTAT_INC(ahs_hist[sav->alg_auth]);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
int alen;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
alen = AUTHSIZE(sav);
m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes);
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
static struct xformsw ah_xformsw = {
.xf_type = XF_AH,
.xf_name = "IPsec AH",
.xf_init = ah_init,
.xf_zeroize = ah_zeroize,
.xf_input = ah_input,
.xf_output = ah_output,
};
SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_attach, &ah_xformsw);
SYSUNINIT(ah_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_detach, &ah_xformsw);
Index: head/sys/netipsec/xform_esp.c
===================================================================
--- head/sys/netipsec/xform_esp.c (revision 327172)
+++ head/sys/netipsec/xform_esp.c (revision 327173)
@@ -1,966 +1,964 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis.
*
* Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/random.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <machine/atomic.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/esp.h>
#include <netipsec/esp_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/xform.h>
VNET_DEFINE(int, esp_enable) = 1;
VNET_PCPUSTAT_DEFINE(struct espstat, espstat);
VNET_PCPUSTAT_SYSINIT(espstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(espstat);
#endif /* VIMAGE */
SYSCTL_DECL(_net_inet_esp);
SYSCTL_INT(_net_inet_esp, OID_AUTO, esp_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(esp_enable), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_esp, IPSECCTL_STATS, stats,
struct espstat, espstat,
"ESP statistics (struct espstat, netipsec/esp_var.h");
static int esp_input_cb(struct cryptop *op);
static int esp_output_cb(struct cryptop *crp);
size_t
esp_hdrsiz(struct secasvar *sav)
{
size_t size;
if (sav != NULL) {
/*XXX not right for null algorithm--does it matter??*/
IPSEC_ASSERT(sav->tdb_encalgxform != NULL,
("SA with null xform"));
if (sav->flags & SADB_X_EXT_OLD)
size = sizeof (struct esp);
else
size = sizeof (struct newesp);
size += sav->tdb_encalgxform->blocksize + 9;
/*XXX need alg check???*/
if (sav->tdb_authalgxform != NULL && sav->replay)
size += ah_hdrsiz(sav);
} else {
/*
* base header size
* + max iv length for CBC mode
* + max pad length
* + sizeof (pad length field)
* + sizeof (next header field)
* + max icv supported.
*/
size = sizeof (struct newesp) + EALG_MAX_BLOCK_LEN + 9 + 16;
}
return size;
}
/*
* esp_init() is called when an SPI is being set up.
*/
static int
esp_init(struct secasvar *sav, struct xformsw *xsp)
{
const struct enc_xform *txform;
struct cryptoini cria, crie;
int keylen;
int error;
txform = enc_algorithm_lookup(sav->alg_enc);
if (txform == NULL) {
DPRINTF(("%s: unsupported encryption algorithm %d\n",
__func__, sav->alg_enc));
return EINVAL;
}
if (sav->key_enc == NULL) {
DPRINTF(("%s: no encoding key for %s algorithm\n",
__func__, txform->name));
return EINVAL;
}
if ((sav->flags & (SADB_X_EXT_OLD | SADB_X_EXT_IV4B)) ==
SADB_X_EXT_IV4B) {
DPRINTF(("%s: 4-byte IV not supported with protocol\n",
__func__));
return EINVAL;
}
/* subtract off the salt, RFC4106, 8.1 and RFC3686, 5.1 */
keylen = _KEYLEN(sav->key_enc) - SAV_ISCTRORGCM(sav) * 4;
if (txform->minkey > keylen || keylen > txform->maxkey) {
DPRINTF(("%s: invalid key length %u, must be in the range "
"[%u..%u] for algorithm %s\n", __func__,
keylen, txform->minkey, txform->maxkey,
txform->name));
return EINVAL;
}
if (SAV_ISCTRORGCM(sav))
sav->ivlen = 8; /* RFC4106 3.1 and RFC3686 3.1 */
else
sav->ivlen = txform->ivsize;
/*
* Setup AH-related state.
*/
if (sav->alg_auth != 0) {
error = ah_init0(sav, xsp, &cria);
if (error)
return error;
}
/* NB: override anything set in ah_init0 */
sav->tdb_xform = xsp;
sav->tdb_encalgxform = txform;
/*
* Whenever AES-GCM is used for encryption, one
* of the AES authentication algorithms is chosen
* as well, based on the key size.
*/
if (sav->alg_enc == SADB_X_EALG_AESGCM16) {
switch (keylen) {
case AES_128_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES128GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_128;
break;
case AES_192_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES192GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_192;
break;
case AES_256_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES256GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_256;
break;
default:
DPRINTF(("%s: invalid key length %u"
"for algorithm %s\n", __func__,
keylen, txform->name));
return EINVAL;
}
bzero(&cria, sizeof(cria));
cria.cri_alg = sav->tdb_authalgxform->type;
cria.cri_key = sav->key_enc->key_data;
cria.cri_klen = _KEYBITS(sav->key_enc) - SAV_ISGCM(sav) * 32;
}
/* Initialize crypto session. */
bzero(&crie, sizeof(crie));
crie.cri_alg = sav->tdb_encalgxform->type;
crie.cri_key = sav->key_enc->key_data;
crie.cri_klen = _KEYBITS(sav->key_enc) - SAV_ISCTRORGCM(sav) * 32;
if (sav->tdb_authalgxform && sav->tdb_encalgxform) {
/* init both auth & enc */
crie.cri_next = &cria;
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_encalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_authalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&cria, V_crypto_support);
} else {
/* XXX cannot happen? */
DPRINTF(("%s: no encoding OR authentication xform!\n",
__func__));
error = EINVAL;
}
return error;
}
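esp_init() treats the last four bytes of the keying material as the GCM salt (RFC 4106) or CTR nonce (RFC 3686) and sizes the cipher key without them, which is what the SAV_ISCTRORGCM(sav) * 4 subtraction above expresses. A standalone sketch of that split (illustrative names only):

/* Standalone sketch: split ESP keying material into cipher key + salt. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct toy_keymat {
    const uint8_t *key;     /* cipher key proper */
    size_t keylen;
    uint8_t salt[4];        /* trailing 4 bytes for AES-CTR/AES-GCM */
};

static int
toy_split_keymat(const uint8_t *blob, size_t bloblen, int is_ctr_or_gcm,
    struct toy_keymat *km)
{
    size_t saltlen = is_ctr_or_gcm ? 4 : 0;

    if (bloblen <= saltlen)
        return (-1);
    km->key = blob;
    km->keylen = bloblen - saltlen;     /* e.g. 20 bytes -> 16-byte AES key */
    memset(km->salt, 0, sizeof(km->salt));
    if (saltlen != 0)
        memcpy(km->salt, blob + km->keylen, saltlen);
    return (0);
}

int
main(void)
{
    uint8_t blob[20];                   /* AES-128-GCM keying material */
    struct toy_keymat km;
    size_t i;

    for (i = 0; i < sizeof(blob); i++)
        blob[i] = (uint8_t)i;
    toy_split_keymat(blob, sizeof(blob), 1, &km);
    printf("keylen=%zu salt=%02x%02x%02x%02x\n", km.keylen,
        km.salt[0], km.salt[1], km.salt[2], km.salt[3]);
    return (0);
}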
/*
* Paranoia.
*/
static int
esp_zeroize(struct secasvar *sav)
{
/* NB: ah_zeroize frees the crypto session state */
int error = ah_zeroize(sav);
if (sav->key_enc)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
sav->tdb_encalgxform = NULL;
sav->tdb_xform = NULL;
return error;
}
/*
* ESP input processing, called (eventually) through the protocol switch.
*/
static int
esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
const struct auth_hash *esph;
const struct enc_xform *espx;
struct xform_data *xd;
struct cryptodesc *crde;
struct cryptop *crp;
struct newesp *esp;
uint8_t *ivp;
uint64_t cryptoid;
int alen, error, hlen, plen;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform"));
error = EINVAL;
/* Valid IP Packet length ? */
if ( (skip&3) || (m->m_pkthdr.len&3) ){
DPRINTF(("%s: misaligned packet, skip %u pkt len %u",
__func__, skip, m->m_pkthdr.len));
ESPSTAT_INC(esps_badilen);
goto bad;
}
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
/* Determine the ESP header and auth length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
alen = xform_ah_authsize(esph);
/*
* Verify payload length is multiple of encryption algorithm
* block size.
*
* NB: This works for the null algorithm because the blocksize
* is 4 and all packets must be 4-byte aligned regardless
* of the algorithm.
*/
plen = m->m_pkthdr.len - (skip + hlen + alen);
if ((plen & (espx->blocksize - 1)) || (plen <= 0)) {
DPRINTF(("%s: payload of %d octets not a multiple of %d octets,"
" SA %s/%08lx\n", __func__, plen, espx->blocksize,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long)ntohl(sav->spi)));
ESPSTAT_INC(esps_badilen);
goto bad;
}
/*
* Check sequence number.
*/
SECASVAR_LOCK(sav);
if (esph != NULL && sav->replay != NULL && sav->replay->wsize != 0) {
if (ipsec_chkreplay(ntohl(esp->esp_seq), sav) == 0) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
ESPSTAT_INC(esps_replay);
error = EACCES;
goto bad;
}
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Update the counters */
ESPSTAT_ADD(esps_ibytes, m->m_pkthdr.len - (skip + hlen + alen));
/* Get crypto descriptors */
crp = crypto_getreq(esph && espx ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
/* Get IPsec-specific opaque pointer */
xd = malloc(sizeof(*xd) + alen, M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
ESPSTAT_INC(esps_crypto);
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
if (esph != NULL) {
struct cryptodesc *crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor"));
/* Authentication descriptor */
crda->crd_skip = skip;
if (SAV_ISGCM(sav))
crda->crd_len = 8; /* RFC4106 5, SPI + SN */
else
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
crda->crd_alg = esph->type;
/* Copy the authenticator */
m_copydata(m, m->m_pkthdr.len - alen, alen,
(caddr_t) (xd + 1));
/* Chain authentication request */
crde = crda->crd_next;
} else {
crde = crp->crp_desc;
}
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_input_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback */
xd->sav = sav;
xd->protoff = protoff;
xd->skip = skip;
xd->cryptoid = cryptoid;
/* Decryption descriptor */
IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor"));
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_inject = skip + hlen - sav->ivlen;
if (SAV_ISCTRORGCM(sav)) {
ivp = &crde->crd_iv[0];
/* GCM IV Format: RFC4106 4 */
/* CTR IV Format: RFC3686 4 */
/* Salt is last four bytes of key, RFC4106 8.1 */
/* Nonce is last four bytes of key, RFC3686 5.1 */
memcpy(ivp, sav->key_enc->key_data +
_KEYLEN(sav->key_enc) - 4, 4);
if (SAV_ISCTR(sav)) {
/* Initial block counter is 1, RFC3686 4 */
be32enc(&ivp[sav->ivlen + 4], 1);
}
m_copydata(m, skip + hlen - sav->ivlen, sav->ivlen, &ivp[4]);
crde->crd_flags |= CRD_F_IV_EXPLICIT;
}
crde->crd_alg = espx->type;
return (crypto_dispatch(crp));
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
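A minimal standalone sketch of the CTR/GCM IV layout described in the comments above: the 4-byte salt/nonce taken from the tail of the key, the 8-byte explicit IV carried in the packet, and (for CTR) an initial block counter of 1. The function name and flat-buffer interface are assumptions for illustration, not kernel code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Build the 16-byte counter block used by AES-CTR ESP (RFC 3686 4):
 * nonce (last 4 bytes of the key) || 8-byte explicit IV || counter = 1.
 * AES-GCM (RFC 4106 4) uses the same salt || explicit-IV prefix.
 */
static void
example_build_ctr_block(uint8_t block[16], const uint8_t *key, size_t keylen,
    const uint8_t explicit_iv[8])
{
    memcpy(&block[0], key + keylen - 4, 4);   /* salt/nonce from key tail */
    memcpy(&block[4], explicit_iv, 8);        /* per-packet explicit IV */
    block[12] = 0;                            /* initial block counter ... */
    block[13] = 0;
    block[14] = 0;
    block[15] = 1;                            /* ... is 1, big-endian */
}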
/*
* ESP input callback from the crypto driver.
*/
static int
esp_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
u_int8_t lastthree[3], aalg[AH_HMAC_MAXHASHLEN];
const struct auth_hash *esph;
- const struct enc_xform *espx;
struct mbuf *m;
struct cryptodesc *crd;
struct xform_data *xd;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t ptr;
uint64_t cryptoid;
int hlen, skip, protoff, error, alen;
crd = crp->crp_desc;
IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!"));
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
esph = sav->tdb_authalgxform;
- espx = sav->tdb_encalgxform;
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
ESPSTAT_INC(esps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
ESPSTAT_INC(esps_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
ESPSTAT_INC(esps_hist[sav->alg_enc]);
/* If authentication was performed, check now. */
if (esph != NULL) {
alen = xform_ah_authsize(esph);
AHSTAT_INC(ahs_hist[sav->alg_auth]);
/* Copy the authenticator from the packet */
m_copydata(m, m->m_pkthdr.len - alen, alen, aalg);
ptr = (caddr_t) (xd + 1);
/* Verify authenticator */
if (timingsafe_bcmp(ptr, aalg, alen) != 0) {
DPRINTF(("%s: authentication hash mismatch for "
"packet in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_badauth);
error = EACCES;
goto bad;
}
m->m_flags |= M_AUTHIPDGM;
/* Remove trailing authenticator */
m_adj(m, -alen);
}
/* Release the crypto descriptors */
free(xd, M_XDATA), xd = NULL;
crypto_freereq(crp), crp = NULL;
/*
* Packet is now decrypted.
*/
m->m_flags |= M_DECRYPTED;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newesp, esp_seq),
sizeof (seq), (caddr_t) &seq);
SECASVAR_LOCK(sav);
if (ipsec_updatereplay(ntohl(seq), sav)) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
ESPSTAT_INC(esps_replay);
error = EACCES;
goto bad;
}
SECASVAR_UNLOCK(sav);
}
/* Determine the ESP header length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
/* Remove the ESP header and IV from the mbuf. */
error = m_striphdr(m, skip, hlen);
if (error) {
ESPSTAT_INC(esps_hdrops);
DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Save the last three bytes of decrypted data */
m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree);
/* Verify pad length */
if (lastthree[1] + 2 > m->m_pkthdr.len - skip) {
ESPSTAT_INC(esps_badilen);
DPRINTF(("%s: invalid padding length %d for %u byte packet "
"in SA %s/%08lx\n", __func__, lastthree[1],
m->m_pkthdr.len - skip,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
/* Verify correct decryption by checking the last padding bytes */
if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) {
if (lastthree[1] != lastthree[0] && lastthree[1] != 0) {
ESPSTAT_INC(esps_badenc);
DPRINTF(("%s: decryption failed for packet in "
"SA %s/%08lx\n", __func__, ipsec_address(
&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
}
/* Trim the mbuf chain to remove trailing authenticator and padding */
m_adj(m, -(lastthree[1] + 2));
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2);
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav != NULL)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
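A minimal standalone sketch of the ESP trailer check performed above: the last two bytes of the decrypted payload are Pad Length and Next Header, the pad length must fit in the payload, and with self-describing padding the last pad byte must equal the pad length. Names and the flat-buffer interface are assumptions for illustration.

#include <stddef.h>
#include <stdint.h>

/* Returns 0 if the ESP trailer looks sane, -1 otherwise. */
static int
example_check_esp_trailer(const uint8_t *payload, size_t len,
    uint8_t *next_header)
{
    uint8_t padlen;

    if (len < 2)
        return (-1);
    padlen = payload[len - 2];              /* Pad Length */
    *next_header = payload[len - 1];        /* Next Header */
    if ((size_t)padlen + 2 > len)
        return (-1);                        /* invalid padding length */
    if (padlen != 0 && payload[len - 3] != padlen)
        return (-1);                        /* self-describing pad mismatch */
    return (0);                             /* caller trims padlen + 2 bytes */
}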
/*
* ESP output routine, called by ipsec[46]_perform_request().
*/
static int
esp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
struct cryptodesc *crde = NULL, *crda = NULL;
struct cryptop *crp;
const struct auth_hash *esph;
const struct enc_xform *espx;
struct mbuf *mo = NULL;
struct xform_data *xd;
struct secasindex *saidx;
unsigned char *pad;
uint8_t *ivp;
uint64_t cntr, cryptoid;
int hlen, rlen, padding, blks, alen, i, roff;
int error, maxpacketsize;
uint8_t prot;
IPSEC_ASSERT(sav != NULL, ("null SA"));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
IPSEC_ASSERT(espx != NULL, ("null encoding xform"));
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
rlen = m->m_pkthdr.len - skip; /* Raw payload length. */
/*
* RFC4303 2.4 Requires 4 byte alignment.
*/
blks = MAX(4, espx->blocksize); /* Cipher blocksize */
/* XXX clamp padding length a la KAME??? */
padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;
alen = xform_ah_authsize(esph);
ESPSTAT_INC(esps_output);
saidx = &sav->sah->saidx;
/* Check for maximum packet size violations. */
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, SA %s/%08lx\n", __func__,
saidx->dst.sa.sa_family, ipsec_address(&saidx->dst,
buf, sizeof(buf)), (u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_nopf);
error = EPFNOSUPPORT;
goto bad;
}
/*
DPRINTF(("%s: skip %d hlen %d rlen %d padding %d alen %d blksd %d\n",
__func__, skip, hlen, rlen, padding, alen, blks)); */
if (skip + hlen + rlen + padding + alen > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
skip + hlen + rlen + padding + alen, maxpacketsize));
ESPSTAT_INC(esps_toobig);
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
ESPSTAT_ADD(esps_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_hdrops);
error = ENOBUFS;
goto bad;
}
/* Inject ESP header. */
mo = m_makespace(m, skip, hlen, &roff);
if (mo == NULL) {
DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n",
__func__, hlen, ipsec_address(&saidx->dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_hdrops); /* XXX diffs from openbsd */
error = ENOBUFS;
goto bad;
}
/* Initialize ESP header. */
bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff,
sizeof(uint32_t));
SECASVAR_LOCK(sav);
if (sav->replay) {
uint32_t replay;
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
replay = htonl(sav->replay->count);
bcopy((caddr_t) &replay, mtod(mo, caddr_t) + roff +
sizeof(uint32_t), sizeof(uint32_t));
}
cryptoid = sav->tdb_cryptoid;
if (SAV_ISCTRORGCM(sav))
cntr = sav->cntr++;
SECASVAR_UNLOCK(sav);
/*
* Add padding -- better to do it ourselves than use the crypto engine,
* although if/when we support compression, we'd have to do that.
*/
pad = (u_char *) m_pad(m, padding + alen);
if (pad == NULL) {
DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
m = NULL; /* NB: free'd by m_pad */
error = ENOBUFS;
goto bad;
}
/*
* Add padding: random, zero, or self-describing.
* XXX catch unexpected setting
*/
switch (sav->flags & SADB_X_EXT_PMASK) {
case SADB_X_EXT_PRAND:
(void) read_random(pad, padding - 2);
break;
case SADB_X_EXT_PZERO:
bzero(pad, padding - 2);
break;
case SADB_X_EXT_PSEQ:
for (i = 0; i < padding - 2; i++)
pad[i] = i+1;
break;
}
/* Fix padding length and Next Protocol in padding itself. */
pad[padding - 2] = padding - 2;
m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1);
/* Fix Next Protocol in IPv4/IPv6 header. */
prot = IPPROTO_ESP;
m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot);
/* Get crypto descriptors. */
crp = crypto_getreq(esph != NULL ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
/* IPsec-specific opaque crypto info. */
xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
crde = crp->crp_desc;
crda = crde->crd_next;
/* Encryption descriptor. */
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_flags = CRD_F_ENCRYPT;
crde->crd_inject = skip + hlen - sav->ivlen;
/* Encryption operation. */
crde->crd_alg = espx->type;
if (SAV_ISCTRORGCM(sav)) {
ivp = &crde->crd_iv[0];
/* GCM IV Format: RFC4106 4 */
/* CTR IV Format: RFC3686 4 */
/* Salt is last four bytes of key, RFC4106 8.1 */
/* Nonce is last four bytes of key, RFC3686 5.1 */
memcpy(ivp, sav->key_enc->key_data +
_KEYLEN(sav->key_enc) - 4, 4);
be64enc(&ivp[4], cntr);
if (SAV_ISCTR(sav)) {
/* Initial block counter is 1, RFC3686 4 */
/* XXXAE: should we use this only for first packet? */
be32enc(&ivp[sav->ivlen + 4], 1);
}
m_copyback(m, skip + hlen - sav->ivlen, sav->ivlen, &ivp[4]);
crde->crd_flags |= CRD_F_IV_EXPLICIT|CRD_F_IV_PRESENT;
}
/* Callback parameters */
xd->sp = sp;
xd->sav = sav;
xd->idx = idx;
xd->cryptoid = cryptoid;
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_output_cb;
crp->crp_opaque = (caddr_t) xd;
crp->crp_sid = cryptoid;
if (esph) {
/* Authentication descriptor. */
crda->crd_alg = esph->type;
crda->crd_skip = skip;
if (SAV_ISGCM(sav))
crda->crd_len = 8; /* RFC4106 5, SPI + SN */
else
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
}
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
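A worked example of the RFC 4303 padding arithmetic used above; "padding" includes the two trailer bytes (Pad Length and Next Header), and the padded payload must be a multiple of the cipher block size (at least 4). The sample numbers are assumptions for illustration.

#include <stdio.h>

int
main(void)
{
    int rlen = 1460;   /* example raw payload length */
    int blks = 16;     /* example AES block size */
    int padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;

    /* Prints: padding 12, padded payload 1472 (a multiple of 16). */
    printf("padding %d, padded payload %d\n", padding, rlen + padding);
    return (0);
}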
/*
* ESP output callback from the crypto driver.
*/
static int
esp_output_cb(struct cryptop *crp)
{
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
u_int idx;
int error;
xd = (struct xform_data *) crp->crp_opaque;
m = (struct mbuf *) crp->crp_buf;
sp = xd->sp;
sav = xd->sav;
idx = xd->idx;
cryptoid = xd->cryptoid;
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
ESPSTAT_INC(esps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
m_freem(m);
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
ESPSTAT_INC(esps_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
free(xd, M_XDATA);
crypto_freereq(crp);
ESPSTAT_INC(esps_hist[sav->alg_enc]);
if (sav->tdb_authalgxform != NULL)
AHSTAT_INC(ahs_hist[sav->alg_auth]);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
static unsigned char ipseczeroes[AH_HMAC_MAXHASHLEN];
const struct auth_hash *esph;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
esph = sav->tdb_authalgxform;
if (esph != NULL) {
int alen;
alen = xform_ah_authsize(esph);
m_copyback(m, m->m_pkthdr.len - alen,
alen, ipseczeroes);
}
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
static struct xformsw esp_xformsw = {
.xf_type = XF_ESP,
.xf_name = "IPsec ESP",
.xf_init = esp_init,
.xf_zeroize = esp_zeroize,
.xf_input = esp_input,
.xf_output = esp_output,
};
SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_attach, &esp_xformsw);
SYSUNINIT(esp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_detach, &esp_xformsw);
Index: head/sys/netipsec/xform_ipcomp.c
===================================================================
--- head/sys/netipsec/xform_ipcomp.c (revision 327172)
+++ head/sys/netipsec/xform_ipcomp.c (revision 327173)
@@ -1,780 +1,777 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* IP payload compression protocol (IPComp), see RFC 2393 */
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#include <net/netisr.h>
#include <net/vnet.h>
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ipcomp.h>
#include <netipsec/ipcomp_var.h>
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/deflate.h>
#include <opencrypto/xform.h>
VNET_DEFINE(int, ipcomp_enable) = 1;
VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat);
VNET_PCPUSTAT_SYSINIT(ipcompstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(ipcompstat);
#endif /* VIMAGE */
SYSCTL_DECL(_net_inet_ipcomp);
SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats,
struct ipcompstat, ipcompstat,
"IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h");
static int ipcomp_input_cb(struct cryptop *crp);
static int ipcomp_output_cb(struct cryptop *crp);
/*
* RFC 3173 p 2.2. Non-Expansion Policy:
* If the total size of a compressed payload and the IPComp header, as
* defined in section 3, is not smaller than the size of the original
* payload, the IP datagram MUST be sent in the original non-compressed
* form.
*
* When we use IPComp in tunnel mode, for small packets we will receive
* encapsulated IP-IP datagrams without any compression and without IPComp
* header.
*/
static int
ipcomp_encapcheck(union sockaddr_union *src, union sockaddr_union *dst)
{
struct secasvar *sav;
sav = key_allocsa_tunnel(src, dst, IPPROTO_IPCOMP);
if (sav == NULL)
return (0);
key_freesav(&sav);
if (src->sa.sa_family == AF_INET)
return (sizeof(struct in_addr) << 4);
else
return (sizeof(struct in6_addr) << 4);
}
static int
ipcomp_nonexp_input(struct mbuf **mp, int *offp, int proto)
{
int isr;
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case IPPROTO_IPV6:
isr = NETISR_IPV6;
break;
#endif
default:
IPCOMPSTAT_INC(ipcomps_nopf);
m_freem(*mp);
return (IPPROTO_DONE);
}
m_adj(*mp, *offp);
IPCOMPSTAT_ADD(ipcomps_ibytes, (*mp)->m_pkthdr.len);
IPCOMPSTAT_INC(ipcomps_input);
netisr_dispatch(isr, *mp);
return (IPPROTO_DONE);
}
/*
* ipcomp_init() is called when a CPI is being set up.
*/
static int
ipcomp_init(struct secasvar *sav, struct xformsw *xsp)
{
const struct comp_algo *tcomp;
struct cryptoini cric;
/* NB: algorithm really comes in alg_enc and not alg_comp! */
tcomp = comp_algorithm_lookup(sav->alg_enc);
if (tcomp == NULL) {
DPRINTF(("%s: unsupported compression algorithm %d\n", __func__,
sav->alg_comp));
return EINVAL;
}
sav->alg_comp = sav->alg_enc; /* set for doing histogram */
sav->tdb_xform = xsp;
sav->tdb_compalgxform = tcomp;
/* Initialize crypto session */
bzero(&cric, sizeof (cric));
cric.cri_alg = sav->tdb_compalgxform->type;
return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support);
}
/*
* ipcomp_zeroize() is called when an IPCA is deleted.
*/
static int
ipcomp_zeroize(struct secasvar *sav)
{
int err;
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
return err;
}
/*
* ipcomp_input() gets called to uncompress an input packet
*/
static int
ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
struct xform_data *xd;
struct cryptodesc *crdc;
struct cryptop *crp;
struct ipcomp *ipcomp;
caddr_t addr;
int error, hlen = IPCOMP_HLENGTH;
/*
* Before doing any real work, check that the next header of the IPComp
* packet is not IPComp again. Double compression is not possible, so
* if we see it, someone is playing tricks on us.
*/
error = ENOBUFS;
if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/
DPRINTF(("%s: m_pullup failed\n", __func__));
key_freesav(&sav);
return (error);
}
addr = (caddr_t) mtod(m, struct ip *) + skip;
ipcomp = (struct ipcomp *)addr;
if (ipcomp->comp_nxt == IPPROTO_IPCOMP) {
IPCOMPSTAT_INC(ipcomps_pdrops); /* XXX have our own stats? */
DPRINTF(("%s: recursive compression detected\n", __func__));
error = EINVAL;
goto bad;
}
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: no crypto descriptors\n", __func__));
IPCOMPSTAT_INC(ipcomps_crypto);
goto bad;
}
/* Get IPsec-specific opaque pointer */
xd = malloc(sizeof(*xd), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: cannot allocate xform_data\n", __func__));
IPCOMPSTAT_INC(ipcomps_crypto);
crypto_freereq(crp);
goto bad;
}
crdc = crp->crp_desc;
crdc->crd_skip = skip + hlen;
crdc->crd_len = m->m_pkthdr.len - (skip + hlen);
crdc->crd_inject = skip;
/* Decompression operation */
crdc->crd_alg = sav->tdb_compalgxform->type;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len - (skip + hlen);
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_input_cb;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback */
xd->sav = sav;
xd->protoff = protoff;
xd->skip = skip;
SECASVAR_LOCK(sav);
crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
return crypto_dispatch(crp);
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
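For reference, a sketch of the 4-byte IPComp header (RFC 3173, section 3) that is stripped here on input and injected on output. This standalone definition is illustrative only; the kernel's own struct ipcomp lives in netipsec/ipcomp.h, and IPCOMP_HLENGTH corresponds to its size.

#include <stdint.h>

struct example_ipcomp_header {
    uint8_t  comp_nxt;    /* next header: the original payload protocol */
    uint8_t  comp_flags;  /* reserved, must be zero */
    uint16_t comp_cpi;    /* compression parameter index, network order */
};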
/*
* IPComp input callback from the crypto driver.
*/
static int
ipcomp_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
- struct cryptodesc *crd;
struct xform_data *xd;
struct mbuf *m;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t addr;
uint64_t cryptoid;
int hlen = IPCOMP_HLENGTH, error, clen;
int skip, protoff;
uint8_t nproto;
-
- crd = crp->crp_desc;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
IPCOMPSTAT_INC(ipcomps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: null mbuf returned from crypto\n", __func__));
error = EINVAL;
goto bad;
}
IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
clen = crp->crp_olen; /* Length of data after processing */
/* Release the crypto descriptors */
free(xd, M_XDATA), xd = NULL;
crypto_freereq(crp), crp = NULL;
/* In case it's not done already, adjust the size of the mbuf chain */
m->m_pkthdr.len = clen + hlen + skip;
if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/
DPRINTF(("%s: m_pullup failed\n", __func__));
error = EINVAL; /*XXX*/
goto bad;
}
/* Keep the next protocol field */
addr = (caddr_t) mtod(m, struct ip *) + skip;
nproto = ((struct ipcomp *) addr)->comp_nxt;
/* Remove the IPCOMP header */
error = m_striphdr(m, skip, hlen);
if (error) {
IPCOMPSTAT_INC(ipcomps_hdrops);
DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto);
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav != NULL)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
/*
* IPComp output routine, called by ipsec[46]_perform_request()
*/
static int
ipcomp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
const struct comp_algo *ipcompx;
struct cryptodesc *crdc;
struct cryptop *crp;
struct xform_data *xd;
int error, ralen, maxpacketsize;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ipcompx = sav->tdb_compalgxform;
IPSEC_ASSERT(ipcompx != NULL, ("null compression xform"));
/*
* Do not touch the packet if the payload to compress is smaller
* than the compression algorithm's minimal threshold; just send
* the data out uncompressed.
* See RFC 3173, 2.2. Non-Expansion Policy.
*/
if (m->m_pkthdr.len <= ipcompx->minlen) {
IPCOMPSTAT_INC(ipcomps_threshold);
return ipsec_process_done(m, sp, sav, idx);
}
ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */
IPCOMPSTAT_INC(ipcomps_output);
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
IPCOMPSTAT_INC(ipcomps_nopf);
DPRINTF(("%s: unknown/unsupported protocol family %d, "
"IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) {
IPCOMPSTAT_INC(ipcomps_toobig);
DPRINTF(("%s: packet in IPCA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
ralen + skip + IPCOMP_HLENGTH, maxpacketsize));
error = EMSGSIZE;
goto bad;
}
/* Update the counters */
IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops);
DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
/* Ok now, we can pass to the crypto processing. */
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
error = ENOBUFS;
goto bad;
}
crdc = crp->crp_desc;
/* Compression descriptor */
crdc->crd_skip = skip;
crdc->crd_len = ralen;
crdc->crd_flags = CRD_F_COMP;
crdc->crd_inject = skip;
/* Compression operation */
crdc->crd_alg = ipcompx->type;
/* IPsec-specific opaque crypto info */
xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
xd->sp = sp;
xd->sav = sav;
xd->idx = idx;
xd->skip = skip;
xd->protoff = protoff;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_output_cb;
crp->crp_opaque = (caddr_t) xd;
SECASVAR_LOCK(sav);
crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
/*
* IPComp output callback from the crypto driver.
*/
static int
ipcomp_output_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
u_int idx;
int error, skip, protoff;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
idx = xd->idx;
sp = xd->sp;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
IPCOMPSTAT_INC(ipcomps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: bogus return buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
if (crp->crp_ilen - skip > crp->crp_olen) {
struct mbuf *mo;
struct ipcomp *ipcomp;
int roff;
uint8_t prot;
/* Compression helped, inject IPCOMP header. */
mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff);
if (mo == NULL) {
IPCOMPSTAT_INC(ipcomps_wrap);
DPRINTF(("%s: IPCOMP header inject failed "
"for IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);
/* Initialize the IPCOMP header */
/* XXX alignment always correct? */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt;
break;
#endif
}
ipcomp->comp_flags = 0;
ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi));
/* Fix Next Protocol in IPv4/IPv6 header */
prot = IPPROTO_IPCOMP;
m_copyback(m, protoff, sizeof(u_int8_t),
(u_char *)&prot);
/* Adjust the length in the IP header */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
mtod(m, struct ip6_hdr *)->ip6_plen =
htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
break;
#endif /* INET6 */
default:
IPCOMPSTAT_INC(ipcomps_nopf);
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
} else {
/* Compression was useless, we have lost time. */
IPCOMPSTAT_INC(ipcomps_uncompr);
DPRINTF(("%s: compressions was useless %d - %d <= %d\n",
__func__, crp->crp_ilen, skip, crp->crp_olen));
/* XXX remember state to not compress the next couple
* of packets, RFC 3173, 2.2. Non-Expansion Policy */
}
/* Release the crypto descriptor */
free(xd, M_XDATA);
crypto_freereq(crp);
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
if (m)
m_freem(m);
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
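A standalone sketch of the non-expansion decision applied above: the IPComp header is injected only when compression actually shrank the payload (crp_ilen - skip > crp_olen); RFC 3173 2.2 additionally counts the 4-byte header against the saving. The name below is an assumption for illustration.

#include <stdbool.h>
#include <stddef.h>

/*
 * Mirror of the crp_ilen - skip > crp_olen comparison above: compression
 * only helps if the deflated payload is strictly smaller than the original.
 */
static bool
example_compression_helped(size_t original_payload, size_t compressed_payload)
{
    return (compressed_payload < original_payload);
}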
#ifdef INET
static const struct encaptab *ipe4_cookie = NULL;
extern struct domain inetdomain;
static struct protosw ipcomp4_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = 0 /* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC | PR_ADDR | PR_LASTHDR,
.pr_input = ipcomp_nonexp_input,
.pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static int
ipcomp4_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
void *arg __unused)
{
union sockaddr_union src, dst;
const struct ip *ip;
if (V_ipcomp_enable == 0)
return (0);
if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
return (0);
bzero(&src, sizeof(src));
bzero(&dst, sizeof(dst));
src.sa.sa_family = dst.sa.sa_family = AF_INET;
src.sin.sin_len = dst.sin.sin_len = sizeof(struct sockaddr_in);
ip = mtod(m, const struct ip *);
src.sin.sin_addr = ip->ip_src;
dst.sin.sin_addr = ip->ip_dst;
return (ipcomp_encapcheck(&src, &dst));
}
#endif
#ifdef INET6
static const struct encaptab *ipe6_cookie = NULL;
extern struct domain inet6domain;
static struct protosw ipcomp6_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = 0 /* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC | PR_ADDR | PR_LASTHDR,
.pr_input = ipcomp_nonexp_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
};
static int
ipcomp6_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
void *arg __unused)
{
union sockaddr_union src, dst;
const struct ip6_hdr *ip6;
if (V_ipcomp_enable == 0)
return (0);
if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
return (0);
bzero(&src, sizeof(src));
bzero(&dst, sizeof(dst));
src.sa.sa_family = dst.sa.sa_family = AF_INET;
src.sin6.sin6_len = dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
ip6 = mtod(m, const struct ip6_hdr *);
src.sin6.sin6_addr = ip6->ip6_src;
dst.sin6.sin6_addr = ip6->ip6_dst;
if (IN6_IS_SCOPE_LINKLOCAL(&src.sin6.sin6_addr)) {
/* XXX: sa6_recoverscope() */
src.sin6.sin6_scope_id =
ntohs(src.sin6.sin6_addr.s6_addr16[1]);
src.sin6.sin6_addr.s6_addr16[1] = 0;
}
if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6.sin6_addr)) {
/* XXX: sa6_recoverscope() */
dst.sin6.sin6_scope_id =
ntohs(dst.sin6.sin6_addr.s6_addr16[1]);
dst.sin6.sin6_addr.s6_addr16[1] = 0;
}
return (ipcomp_encapcheck(&src, &dst));
}
#endif
static struct xformsw ipcomp_xformsw = {
.xf_type = XF_IPCOMP,
.xf_name = "IPcomp",
.xf_init = ipcomp_init,
.xf_zeroize = ipcomp_zeroize,
.xf_input = ipcomp_input,
.xf_output = ipcomp_output,
};
static void
ipcomp_attach(void)
{
#ifdef INET
ipe4_cookie = encap_attach_func(AF_INET, -1,
ipcomp4_nonexp_encapcheck, &ipcomp4_protosw, NULL);
#endif
#ifdef INET6
ipe6_cookie = encap_attach_func(AF_INET6, -1,
ipcomp6_nonexp_encapcheck, &ipcomp6_protosw, NULL);
#endif
xform_attach(&ipcomp_xformsw);
}
static void
ipcomp_detach(void)
{
#ifdef INET
encap_detach(ipe4_cookie);
#endif
#ifdef INET6
encap_detach(ipe6_cookie);
#endif
xform_detach(&ipcomp_xformsw);
}
SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipcomp_attach, NULL);
SYSUNINIT(ipcomp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipcomp_detach, NULL);
Index: head/sys/nfs/nfs_fha.c
===================================================================
--- head/sys/nfs/nfs_fha.c (revision 327172)
+++ head/sys/nfs/nfs_fha.c (revision 327173)
@@ -1,536 +1,527 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/sbuf.h>
#include <rpc/rpc.h>
#include <nfs/nfs_fha.h>
static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
/*
* XXX need to commonize definitions between old and new NFS code. Define
* this here so we don't include one nfsproto.h over the other.
*/
#define NFS_PROG 100003
void
fha_init(struct fha_params *softc)
{
int i;
for (i = 0; i < FHA_HASH_SIZE; i++)
mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
/*
* Set the default tuning parameters.
*/
softc->ctls.enable = FHA_DEF_ENABLE;
softc->ctls.read = FHA_DEF_READ;
softc->ctls.write = FHA_DEF_WRITE;
softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
/*
* Add sysctls so the user can change the tuning parameters.
*/
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "enable", CTLFLAG_RWTUN,
&softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "read", CTLFLAG_RWTUN,
&softc->ctls.read, 0, "Enable NFS FHA read locality");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write", CTLFLAG_RWTUN,
&softc->ctls.write, 0, "Enable NFS FHA write locality");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "bin_shift", CTLFLAG_RWTUN,
&softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN,
&softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
"should be working on requests for the same file handle");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN,
&softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
"single nfsd thread should be working on at any time");
SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
softc->callbacks.fhe_stats_sysctl, "A", "");
}
void
fha_uninit(struct fha_params *softc)
{
int i;
sysctl_ctx_free(&softc->sysctl_ctx);
for (i = 0; i < FHA_HASH_SIZE; i++)
mtx_destroy(&softc->fha_hash[i].mtx);
}
/*
* This just specifies that offsets should obey affinity when within
* the same 1Mbyte (1<<20) chunk for the file (reads only for now).
*/
static void
fha_extract_info(struct svc_req *req, struct fha_info *i,
struct fha_callbacks *cb)
{
struct mbuf *md;
caddr_t dpos;
static u_int64_t random_fh = 0;
int error;
int v3 = (req->rq_vers == 3);
rpcproc_t procnum;
/*
* We start off with a random fh. If we get a reasonable
* procnum, we set the fh. If there's a concept of offset
* that we're interested in, we set that.
*/
i->fh = ++random_fh;
i->offset = 0;
i->locktype = LK_EXCLUSIVE;
i->read = i->write = 0;
/*
* Extract the procnum and convert to v3 form if necessary,
* taking care to deal with out-of-range procnums. Caller will
* ensure that rq_vers is either 2 or 3.
*/
procnum = req->rq_proc;
if (!v3) {
rpcproc_t tmp_procnum;
tmp_procnum = cb->get_procnum(procnum);
if (tmp_procnum == -1)
goto out;
procnum = tmp_procnum;
}
/*
* We do affinity for most operations. However, we divide each realm
* of affinity by file offset, to allow for concurrent random access.
* We only do this for reads today, but this may change when IFS
* supports efficient concurrent writes.
*/
if (cb->no_offset(procnum))
goto out;
i->read = cb->is_read(procnum);
i->write = cb->is_write(procnum);
error = cb->realign(&req->rq_args, M_NOWAIT);
if (error)
goto out;
md = req->rq_args;
dpos = mtod(md, caddr_t);
/* Grab the filehandle. */
error = cb->get_fh(&i->fh, v3, &md, &dpos);
if (error)
goto out;
/* Content ourselves with a zero offset for all but reads and writes. */
if (i->read || i->write)
cb->get_offset(&md, &dpos, v3, i);
out:
cb->set_locktype(procnum, i);
}
static struct fha_hash_entry *
fha_hash_entry_new(u_int64_t fh)
{
struct fha_hash_entry *e;
e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
e->fh = fh;
e->num_rw = 0;
e->num_exclusive = 0;
e->num_threads = 0;
LIST_INIT(&e->threads);
return (e);
}
static void
fha_hash_entry_destroy(struct fha_hash_entry *e)
{
mtx_assert(e->mtx, MA_OWNED);
KASSERT(e->num_rw == 0,
("%d reqs on destroyed fhe %p", e->num_rw, e));
KASSERT(e->num_exclusive == 0,
("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
KASSERT(e->num_threads == 0,
("%d threads on destroyed fhe %p", e->num_threads, e));
free(e, M_NFS_FHA);
}
static void
fha_hash_entry_remove(struct fha_hash_entry *e)
{
mtx_assert(e->mtx, MA_OWNED);
LIST_REMOVE(e, link);
fha_hash_entry_destroy(e);
}
static struct fha_hash_entry *
fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
{
- SVCPOOL *pool;
struct fha_hash_slot *fhs;
struct fha_hash_entry *fhe, *new_fhe;
- pool = *softc->pool;
fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
new_fhe = fha_hash_entry_new(fh);
new_fhe->mtx = &fhs->mtx;
mtx_lock(&fhs->mtx);
LIST_FOREACH(fhe, &fhs->list, link)
if (fhe->fh == fh)
break;
if (!fhe) {
fhe = new_fhe;
LIST_INSERT_HEAD(&fhs->list, fhe, link);
} else
fha_hash_entry_destroy(new_fhe);
return (fhe);
}
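The lookup above preallocates a candidate entry with M_WAITOK before taking the bucket mutex, then either links it in or frees it if an entry for that file handle already exists, which keeps the sleeping allocation outside the locked section (unlike this sketch, it also returns with the bucket lock still held). A generic standalone sketch of that pattern, with assumed types, using pthreads.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

struct example_entry {
    uint64_t             fh;
    struct example_entry *next;
};

struct example_bucket {
    pthread_mutex_t      mtx;
    struct example_entry *head;
};

/* Allocate outside the lock; insert, or discard if someone beat us to it. */
static struct example_entry *
example_lookup_or_insert(struct example_bucket *b, uint64_t fh)
{
    struct example_entry *e, *new_e;

    new_e = malloc(sizeof(*new_e));       /* may block/fail outside the lock */
    if (new_e == NULL)
        return (NULL);
    new_e->fh = fh;

    pthread_mutex_lock(&b->mtx);
    for (e = b->head; e != NULL; e = e->next)
        if (e->fh == fh)
            break;
    if (e == NULL) {
        new_e->next = b->head;            /* first lookup for this fh */
        b->head = new_e;
        e = new_e;
    } else {
        free(new_e);                      /* lost the race, discard ours */
    }
    pthread_mutex_unlock(&b->mtx);
    return (e);
}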
static void
fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{
mtx_assert(fhe->mtx, MA_OWNED);
thread->st_p2 = 0;
LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
fhe->num_threads++;
}
static void
fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{
mtx_assert(fhe->mtx, MA_OWNED);
KASSERT(thread->st_p2 == 0,
("%d reqs on removed thread %p", thread->st_p2, thread));
LIST_REMOVE(thread, st_alink);
fhe->num_threads--;
}
/*
* Account for an ongoing operation associated with this file.
*/
static void
fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
{
mtx_assert(fhe->mtx, MA_OWNED);
if (LK_EXCLUSIVE == locktype)
fhe->num_exclusive += count;
else
fhe->num_rw += count;
}
/*
* Get the service thread currently associated with the fhe that is
* appropriate to handle this operation.
*/
static SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
{
SVCTHREAD *thread, *min_thread = NULL;
- SVCPOOL *pool;
int req_count, min_count = 0;
off_t offset1, offset2;
- pool = *softc->pool;
-
LIST_FOREACH(thread, &fhe->threads, st_alink) {
req_count = thread->st_p2;
/* If there are any writes in progress, use the first thread. */
if (fhe->num_exclusive) {
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)w", thread, req_count);
#endif
return (thread);
}
/* Check whether we should consider locality. */
if ((i->read && !softc->ctls.read) ||
(i->write && !softc->ctls.write))
goto noloc;
/*
* Check for locality, making sure that we won't
* exceed our per-thread load limit in the process.
*/
offset1 = i->offset;
offset2 = thread->st_p3;
if (((offset1 >= offset2)
&& ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
|| ((offset2 > offset1)
&& ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
if ((softc->ctls.max_reqs_per_nfsd == 0) ||
(req_count < softc->ctls.max_reqs_per_nfsd)) {
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)r", thread, req_count);
#endif
return (thread);
}
}
noloc:
/*
* We don't have a locality match, so skip this thread,
* but keep track of the most attractive thread in case
* we need to come back to it later.
*/
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)s off1 %llu off2 %llu", thread,
req_count, offset1, offset2);
#endif
if ((min_thread == NULL) || (req_count < min_count)) {
min_count = req_count;
min_thread = thread;
}
}
/*
* We didn't find a good match yet. See if we can add
* a new thread to this file handle entry's thread list.
*/
if ((softc->ctls.max_nfsds_per_fh == 0) ||
(fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
thread = this_thread;
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)t", thread, thread->st_p2);
#endif
fha_hash_entry_add_thread(fhe, thread);
} else {
/*
* We don't want to use any more threads for this file, so
* go back to the most attractive nfsd we're already using.
*/
thread = min_thread;
}
return (thread);
}
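A compact standalone form of the locality test above: two requests are "close" when their offsets differ by less than 2^bin_shift bytes (1 MB for a bin_shift of 20, per the comment above fha_extract_info), so reads against nearby regions of a file keep landing on the same nfsd thread, subject to the per-thread request limit. Names are assumptions for illustration.

#include <stdbool.h>
#include <stdint.h>

static bool
example_fha_offsets_close(uint64_t offset1, uint64_t offset2,
    unsigned int bin_shift)
{
    uint64_t dist;

    /* Same comparison as the offset1/offset2 test above, without overflow. */
    dist = (offset1 >= offset2) ? offset1 - offset2 : offset2 - offset1;
    return (dist < ((uint64_t)1 << bin_shift));
}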
/*
* After getting a request, try to assign it to some thread. Usually we
* handle it ourselves.
*/
SVCTHREAD *
fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
struct fha_params *softc)
{
SVCTHREAD *thread;
struct fha_info i;
struct fha_hash_entry *fhe;
struct fha_callbacks *cb;
cb = &softc->callbacks;
/* Check to see whether we're enabled. */
if (softc->ctls.enable == 0)
goto thist;
/*
* Only do placement if this is an NFS request.
*/
if (req->rq_prog != NFS_PROG)
goto thist;
if (req->rq_vers != 2 && req->rq_vers != 3)
goto thist;
fha_extract_info(req, &i, cb);
/*
* We save the offset associated with this request for later
* nfsd matching.
*/
fhe = fha_hash_entry_lookup(softc, i.fh);
req->rq_p1 = fhe;
req->rq_p2 = i.locktype;
req->rq_p3 = i.offset;
/*
* Choose a thread, taking into consideration locality, thread load,
* and the number of threads already working on this file.
*/
thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
KASSERT(thread, ("fha_assign: NULL thread!"));
fha_hash_entry_add_op(fhe, i.locktype, 1);
thread->st_p2++;
thread->st_p3 = i.offset;
/*
* Grab the thread's lock here so the chosen thread cannot go away before
* the new request is inserted into its queue while we drop the fhe lock.
*/
mtx_lock(&thread->st_lock);
mtx_unlock(fhe->mtx);
return (thread);
thist:
req->rq_p1 = NULL;
mtx_lock(&this_thread->st_lock);
return (this_thread);
}
/*
* Called when we're done with an operation. The request has already
* been de-queued.
*/
void
fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
{
struct fha_hash_entry *fhe = req->rq_p1;
struct mtx *mtx;
/*
* This may be called for reqs that didn't go through
* fha_assign (e.g. extra NULL ops used for RPCSEC_GSS).
*/
if (!fhe)
return;
mtx = fhe->mtx;
mtx_lock(mtx);
fha_hash_entry_add_op(fhe, req->rq_p2, -1);
thread->st_p2--;
KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
thread->st_p2, thread));
if (thread->st_p2 == 0) {
fha_hash_entry_remove_thread(fhe, thread);
if (0 == fhe->num_rw + fhe->num_exclusive)
fha_hash_entry_remove(fhe);
}
mtx_unlock(mtx);
}
int
fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
{
int error, i;
struct sbuf sb;
struct fha_hash_entry *fhe;
bool_t first, hfirst;
SVCTHREAD *thread;
- SVCPOOL *pool;
sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
- pool = NULL;
-
if (!*softc->pool) {
sbuf_printf(&sb, "NFSD not running\n");
goto out;
}
- pool = *softc->pool;
for (i = 0; i < FHA_HASH_SIZE; i++)
if (!LIST_EMPTY(&softc->fha_hash[i].list))
break;
if (i == FHA_HASH_SIZE) {
sbuf_printf(&sb, "No file handle entries.\n");
goto out;
}
hfirst = TRUE;
for (; i < FHA_HASH_SIZE; i++) {
mtx_lock(&softc->fha_hash[i].mtx);
if (LIST_EMPTY(&softc->fha_hash[i].list)) {
mtx_unlock(&softc->fha_hash[i].mtx);
continue;
}
sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
first = TRUE;
LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
sbuf_printf(&sb, "%sfhe %p: {\n", first ? " " : ", ", fhe);
sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh);
sbuf_printf(&sb, " num_rw/exclusive: %d/%d\n",
fhe->num_rw, fhe->num_exclusive);
sbuf_printf(&sb, " num_threads: %d\n", fhe->num_threads);
LIST_FOREACH(thread, &fhe->threads, st_alink) {
sbuf_printf(&sb, " thread %p offset %ju "
"reqs %d\n", thread,
thread->st_p3, thread->st_p2);
}
sbuf_printf(&sb, " }");
first = FALSE;
}
sbuf_printf(&sb, "\n}");
mtx_unlock(&softc->fha_hash[i].mtx);
hfirst = FALSE;
}
out:
sbuf_trim(&sb);
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
sbuf_delete(&sb);
return (error);
}
Index: head/sys/nlm/nlm_prot_impl.c
===================================================================
--- head/sys/nlm/nlm_prot_impl.c (revision 327172)
+++ head/sys/nlm/nlm_prot_impl.c (revision 327173)
@@ -1,2437 +1,2433 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#if __FreeBSD_version >= 700000
#include <sys/priv.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_lock.h>
#include <nlm/nlm_prot.h>
#include <nlm/sm_inter.h>
#include <nlm/nlm.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcb_prot.h>
MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager");
/*
* If a host is inactive (and holds no locks) for this amount of
* seconds, we consider it idle and stop tracking it.
*/
#define NLM_IDLE_TIMEOUT 30
/*
* We check the host list for idle hosts every few seconds.
*/
#define NLM_IDLE_PERIOD 5
/*
* We only look for GRANTED_RES messages for a little while.
*/
#define NLM_EXPIRE_TIMEOUT 10
/*
* Support for sysctl vfs.nlm.sysid
*/
static SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW, NULL,
"Network Lock Manager");
static SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid, CTLFLAG_RW, NULL, "");
/*
* Syscall hooks
*/
static int nlm_syscall_offset = SYS_nlm_syscall;
static struct sysent nlm_syscall_prev_sysent;
#if __FreeBSD_version < 700000
static struct sysent nlm_syscall_sysent = {
(sizeof(struct nlm_syscall_args) / sizeof(register_t)) | SYF_MPSAFE,
(sy_call_t *) nlm_syscall
};
#else
MAKE_SYSENT(nlm_syscall);
#endif
static bool_t nlm_syscall_registered = FALSE;
/*
* Debug level passed in from userland. We also support a sysctl hook
* so that it can be changed on a live system.
*/
static int nlm_debug_level;
SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, "");
#define NLM_DEBUG(_level, args...) \
do { \
if (nlm_debug_level >= (_level)) \
log(LOG_DEBUG, args); \
} while(0)
#define NLM_ERR(args...) \
do { \
log(LOG_ERR, args); \
} while(0)
/*
* Grace period handling. The value of nlm_grace_threshold is the
* value of time_uptime after which we are serving requests normally.
*/
static time_t nlm_grace_threshold;
/*
* We check for idle hosts if time_uptime is greater than
* nlm_next_idle_check.
*/
static time_t nlm_next_idle_check;
/*
* A flag to indicate the server is already running.
*/
static int nlm_is_running;
/*
* A socket to use for RPC - shared by all IPv4 RPC clients.
*/
static struct socket *nlm_socket;
#ifdef INET6
/*
* A socket to use for RPC - shared by all IPv6 RPC clients.
*/
static struct socket *nlm_socket6;
#endif
/*
* An RPC client handle that can be used to communicate with the local
* NSM.
*/
static CLIENT *nlm_nsm;
/*
* An AUTH handle for the server's creds.
*/
static AUTH *nlm_auth;
/*
* A zero timeval for sending async RPC messages.
*/
struct timeval nlm_zero_tv = { 0, 0 };
/*
* The local NSM state number
*/
int nlm_nsm_state;
/*
* A lock to protect the host list and waiting lock list.
*/
static struct mtx nlm_global_lock;
/*
* Locks:
* (l) locked by nh_lock
* (s) only accessed via server RPC which is single threaded
* (g) locked by nlm_global_lock
* (c) const until freeing
* (a) modified using atomic ops
*/
/*
* A pending client-side lock request, stored on the nlm_waiting_locks
* list.
*/
struct nlm_waiting_lock {
TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */
bool_t nw_waiting; /* (g) */
nlm4_lock nw_lock; /* (c) */
union nfsfh nw_fh; /* (c) */
struct vnode *nw_vp; /* (c) */
};
TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock);
struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */
/*
* A pending server-side asynchronous lock request, stored on the
* nh_pending list of the NLM host.
*/
struct nlm_async_lock {
TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */
struct task af_task; /* (c) async callback details */
void *af_cookie; /* (l) lock manager cancel token */
struct vnode *af_vp; /* (l) vnode to lock */
struct flock af_fl; /* (c) lock details */
struct nlm_host *af_host; /* (c) host which is locking */
CLIENT *af_rpc; /* (c) rpc client to send message */
nlm4_testargs af_granted; /* (c) notification details */
time_t af_expiretime; /* (c) notification time */
};
TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock);
/*
* NLM host.
*/
enum nlm_host_state {
NLM_UNMONITORED,
NLM_MONITORED,
NLM_MONITOR_FAILED,
NLM_RECOVERING
};
struct nlm_rpc {
CLIENT *nr_client; /* (l) RPC client handle */
time_t nr_create_time; /* (l) when client was created */
};
struct nlm_host {
struct mtx nh_lock;
volatile u_int nh_refs; /* (a) reference count */
TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */
char nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */
uint32_t nh_sysid; /* (c) our allocated system ID */
char nh_sysid_string[10]; /* (c) string rep. of sysid */
struct sockaddr_storage nh_addr; /* (s) remote address of host */
struct nlm_rpc nh_srvrpc; /* (l) RPC for server replies */
struct nlm_rpc nh_clntrpc; /* (l) RPC for client requests */
rpcvers_t nh_vers; /* (s) NLM version of host */
int nh_state; /* (s) last seen NSM state of host */
enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */
time_t nh_idle_timeout; /* (s) Time at which host is idle */
struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */
uint32_t nh_grantcookie; /* (l) grant cookie counter */
struct nlm_async_lock_list nh_pending; /* (l) pending async locks */
struct nlm_async_lock_list nh_granted; /* (l) granted locks */
struct nlm_async_lock_list nh_finished; /* (l) finished async locks */
};
TAILQ_HEAD(nlm_host_list, nlm_host);
static struct nlm_host_list nlm_hosts; /* (g) */
static uint32_t nlm_next_sysid = 1; /* (g) */
static void nlm_host_unmonitor(struct nlm_host *);
struct nlm_grantcookie {
uint32_t ng_sysid;
uint32_t ng_cookie;
};
static inline uint32_t
ng_sysid(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid;
}
static inline uint32_t
ng_cookie(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie;
}
/**********************************************************************/
/*
* Initialise NLM globals.
*/
static void
nlm_init(void *dummy)
{
int error;
mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF);
TAILQ_INIT(&nlm_waiting_locks);
TAILQ_INIT(&nlm_hosts);
error = syscall_register(&nlm_syscall_offset, &nlm_syscall_sysent,
&nlm_syscall_prev_sysent, SY_THR_STATIC_KLD);
if (error)
NLM_ERR("Can't register NLM syscall\n");
else
nlm_syscall_registered = TRUE;
}
SYSINIT(nlm_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_init, NULL);
static void
nlm_uninit(void *dummy)
{
if (nlm_syscall_registered)
syscall_deregister(&nlm_syscall_offset,
&nlm_syscall_prev_sysent);
}
SYSUNINIT(nlm_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_uninit, NULL);
/*
* Create a netobj from an arbitrary source.
*/
void
nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize,
struct malloc_type *type)
{
dst->n_len = srcsize;
dst->n_bytes = malloc(srcsize, type, M_WAITOK);
memcpy(dst->n_bytes, src, srcsize);
}
/*
* Copy a struct netobj.
*/
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src,
struct malloc_type *type)
{
nlm_make_netobj(dst, src->n_bytes, src->n_len, type);
}
/*
* Create an RPC client handle for the given (address,prog,vers)
* triple using UDP.
*/
static CLIENT *
nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers)
{
char *wchan = "nlmrcv";
- const char* protofmly;
struct sockaddr_storage ss;
struct socket *so;
CLIENT *rpcb;
struct timeval timo;
RPCB parms;
char *uaddr;
enum clnt_stat stat = RPC_SUCCESS;
int rpcvers = RPCBVERS4;
bool_t do_tcp = FALSE;
bool_t tryagain = FALSE;
struct portmap mapping;
u_short port = 0;
/*
* First we need to contact the remote RPCBIND service to find
* the right port.
*/
memcpy(&ss, sa, sa->sa_len);
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port = htons(111);
- protofmly = "inet";
so = nlm_socket;
break;
-
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port = htons(111);
- protofmly = "inet6";
so = nlm_socket6;
break;
#endif
default:
/*
* Unsupported address family - fail.
*/
return (NULL);
}
rpcb = clnt_dg_create(so, (struct sockaddr *)&ss,
RPCBPROG, rpcvers, 0, 0);
if (!rpcb)
return (NULL);
try_tcp:
parms.r_prog = prog;
parms.r_vers = vers;
if (do_tcp)
parms.r_netid = "tcp";
else
parms.r_netid = "udp";
parms.r_addr = "";
parms.r_owner = "";
/*
* Use the default timeout.
*/
timo.tv_sec = 25;
timo.tv_usec = 0;
again:
switch (rpcvers) {
case RPCBVERS4:
case RPCBVERS:
/*
* Try RPCBIND 4 then 3.
*/
uaddr = NULL;
stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR,
(xdrproc_t) xdr_rpcb, &parms,
(xdrproc_t) xdr_wrapstring, &uaddr, timo);
if (stat == RPC_SUCCESS) {
/*
* We have a reply from the remote RPCBIND - turn it
* into an appropriate address and make a new client
* that can talk to the remote NLM.
*
* XXX fixup IPv6 scope ID.
*/
struct netbuf *a;
a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr);
if (!a) {
tryagain = TRUE;
} else {
tryagain = FALSE;
memcpy(&ss, a->buf, a->len);
free(a->buf, M_RPC);
free(a, M_RPC);
xdr_free((xdrproc_t) xdr_wrapstring, &uaddr);
}
}
if (tryagain || stat == RPC_PROGVERSMISMATCH) {
if (rpcvers == RPCBVERS4)
rpcvers = RPCBVERS;
else if (rpcvers == RPCBVERS)
rpcvers = PMAPVERS;
CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers);
goto again;
}
break;
case PMAPVERS:
/*
* Try portmap.
*/
mapping.pm_prog = parms.r_prog;
mapping.pm_vers = parms.r_vers;
mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP;
mapping.pm_port = 0;
stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT,
(xdrproc_t) xdr_portmap, &mapping,
(xdrproc_t) xdr_u_short, &port, timo);
if (stat == RPC_SUCCESS) {
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port =
htons(port);
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port =
htons(port);
break;
#endif
}
}
break;
default:
panic("invalid rpcvers %d", rpcvers);
}
/*
* We may have a positive response from the portmapper, but the NLM
* service was not found. Make sure we received a valid port.
*/
switch (ss.ss_family) {
case AF_INET:
port = ((struct sockaddr_in *)&ss)->sin_port;
break;
#ifdef INET6
case AF_INET6:
port = ((struct sockaddr_in6 *)&ss)->sin6_port;
break;
#endif
}
if (stat != RPC_SUCCESS || !port) {
/*
* If we were able to talk to rpcbind or portmap, but the udp
* variant wasn't available, ask about tcp.
*
* XXX - We could also check for a TCP portmapper, but
* if the host is running a portmapper at all, we should be able
* to hail it over UDP.
*/
if (stat == RPC_SUCCESS && !do_tcp) {
do_tcp = TRUE;
goto try_tcp;
}
/* Otherwise, bad news. */
NLM_ERR("NLM: failed to contact remote rpcbind, "
"stat = %d, port = %d\n", (int) stat, port);
CLNT_DESTROY(rpcb);
return (NULL);
}
if (do_tcp) {
/*
* Destroy the UDP client we used to speak to rpcbind and
* recreate as a TCP client.
*/
struct netconfig *nconf = NULL;
CLNT_DESTROY(rpcb);
switch (ss.ss_family) {
case AF_INET:
nconf = getnetconfigent("tcp");
break;
#ifdef INET6
case AF_INET6:
nconf = getnetconfigent("tcp6");
break;
#endif
}
rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss,
prog, vers, 0, 0);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
} else {
/*
* Re-use the client we used to speak to rpcbind.
*/
CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss);
CLNT_CONTROL(rpcb, CLSET_PROG, &prog);
CLNT_CONTROL(rpcb, CLSET_VERS, &vers);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
}
return (rpcb);
}
/*
* This async callback is invoked when an async lock request has been
* granted. We notify the host which initiated the request.
*/
static void
nlm_lock_callback(void *arg, int pending)
{
struct nlm_async_lock *af = (struct nlm_async_lock *) arg;
struct rpc_callextra ext;
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
/*
* Send the results back to the host.
*
* Note: there is a possible race here with nlm_host_notify
* destroying the RPC client. To avoid problems, the first
* thing nlm_host_notify does is to cancel pending async lock
* requests.
*/
memset(&ext, 0, sizeof(ext));
ext.rc_auth = nlm_auth;
if (af->af_host->nh_vers == NLM_VERS4) {
nlm4_granted_msg_4(&af->af_granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
} else {
/*
* Back-convert to legacy protocol
*/
nlm_testargs granted;
granted.cookie = af->af_granted.cookie;
granted.exclusive = af->af_granted.exclusive;
granted.alock.caller_name =
af->af_granted.alock.caller_name;
granted.alock.fh = af->af_granted.alock.fh;
granted.alock.oh = af->af_granted.alock.oh;
granted.alock.svid = af->af_granted.alock.svid;
granted.alock.l_offset =
af->af_granted.alock.l_offset;
granted.alock.l_len =
af->af_granted.alock.l_len;
nlm_granted_msg_1(&granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
}
/*
* Move this entry to the nh_granted list.
*/
af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT;
mtx_lock(&af->af_host->nh_lock);
TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link);
TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link);
mtx_unlock(&af->af_host->nh_lock);
}
/*
* Free an async lock request. The request must have been removed from
* any list.
*/
static void
nlm_free_async_lock(struct nlm_async_lock *af)
{
/*
* Free an async lock.
*/
if (af->af_rpc)
CLNT_RELEASE(af->af_rpc);
xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted);
if (af->af_vp)
vrele(af->af_vp);
free(af, M_NLM);
}
/*
* Cancel our async request - this must be called with
* af->af_host->nh_lock held. This is slightly complicated by a
* potential race with our own callback. If we fail to cancel the
* lock, it must already have been granted - we make sure our async
* task has completed by calling taskqueue_drain in this case.
*/
static int
nlm_cancel_async_lock(struct nlm_async_lock *af)
{
struct nlm_host *host = af->af_host;
int error;
mtx_assert(&host->nh_lock, MA_OWNED);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl,
F_REMOTE, NULL, &af->af_cookie);
if (error) {
/*
* We failed to cancel - make sure our callback has
* completed before we continue.
*/
taskqueue_drain(taskqueue_thread, &af->af_task);
}
mtx_lock(&host->nh_lock);
if (!error) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) "
"cancelled\n", af, host->nh_caller_name, host->nh_sysid);
/*
* Remove from the nh_pending list and free now that
* we are safe from the callback.
*/
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
return (error);
}
static void
nlm_check_expired_locks(struct nlm_host *host)
{
struct nlm_async_lock *af;
time_t uptime = time_uptime;
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL
&& uptime >= af->af_expiretime) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) {
TAILQ_REMOVE(&host->nh_finished, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
mtx_unlock(&host->nh_lock);
}
/*
* Free resources used by a host. This is called after the reference
* count has reached zero so it doesn't need to worry about locks.
*/
static void
nlm_host_destroy(struct nlm_host *host)
{
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
if (host->nh_srvrpc.nr_client)
CLNT_RELEASE(host->nh_srvrpc.nr_client);
if (host->nh_clntrpc.nr_client)
CLNT_RELEASE(host->nh_clntrpc.nr_client);
mtx_destroy(&host->nh_lock);
sysctl_ctx_free(&host->nh_sysctl);
free(host, M_NLM);
}
/*
* Thread start callback for client lock recovery
*/
static void
nlm_client_recovery_start(void *arg)
{
struct nlm_host *host = (struct nlm_host *) arg;
NLM_DEBUG(1, "NLM: client lock recovery for %s started\n",
host->nh_caller_name);
nlm_client_recovery(host);
NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n",
host->nh_caller_name);
host->nh_monstate = NLM_MONITORED;
nlm_host_release(host);
kthread_exit();
}
/*
* This is called when we receive a host state change notification. We
* unlock any active locks owned by the host. When rpc.lockd is
* shutting down, this function is called with newstate set to zero
* which allows us to cancel any pending async locks and clear the
* locking state.
*/
static void
nlm_host_notify(struct nlm_host *host, int newstate)
{
struct nlm_async_lock *af;
if (newstate) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new "
"state is %d\n", host->nh_caller_name,
host->nh_sysid, newstate);
}
/*
* Cancel any pending async locks for this host.
*/
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) {
/*
* nlm_cancel_async_lock will remove the entry from
* nh_pending and free it.
*/
nlm_cancel_async_lock(af);
}
mtx_unlock(&host->nh_lock);
nlm_check_expired_locks(host);
/*
* The host just rebooted - trash its locks.
*/
lf_clearremotesys(host->nh_sysid);
host->nh_state = newstate;
/*
* If we have any remote locks for this host (i.e. it
* represents a remote NFS server that our local NFS client
* has locks for), start a recovery thread.
*/
if (newstate != 0
&& host->nh_monstate != NLM_RECOVERING
&& lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) {
struct thread *td;
host->nh_monstate = NLM_RECOVERING;
refcount_acquire(&host->nh_refs);
kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0,
"NFS lock recovery for %s", host->nh_caller_name);
}
}
/*
* Sysctl handler to count the number of locks for a sysid.
*/
static int
nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Sysctl handler to count the number of client locks for a sysid.
*/
static int
nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Create a new NLM host.
*/
static struct nlm_host *
nlm_create_host(const char* caller_name)
{
struct nlm_host *host;
struct sysctl_oid *oid;
mtx_assert(&nlm_global_lock, MA_OWNED);
NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n",
caller_name, nlm_next_sysid);
host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO);
if (!host)
return (NULL);
mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF);
host->nh_refs = 1;
strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN);
host->nh_sysid = nlm_next_sysid++;
snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string),
"%d", host->nh_sysid);
host->nh_vers = 0;
host->nh_state = 0;
host->nh_monstate = NLM_UNMONITORED;
host->nh_grantcookie = 1;
TAILQ_INIT(&host->nh_pending);
TAILQ_INIT(&host->nh_granted);
TAILQ_INIT(&host->nh_finished);
TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
sysctl_ctx_init(&host->nh_sysctl);
oid = SYSCTL_ADD_NODE(&host->nh_sysctl,
SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid),
OID_AUTO, host->nh_sysid_string, CTLFLAG_RD, NULL, "");
SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"hostname", CTLFLAG_RD, host->nh_caller_name, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"version", CTLFLAG_RD, &host->nh_vers, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"monitored", CTLFLAG_RD, &host->nh_monstate, 0, "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_lock_count_sysctl, "I", "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"client_lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_client_lock_count_sysctl, "I", "");
mtx_lock(&nlm_global_lock);
return (host);
}
/*
* Acquire the next sysid for remote locks not handled by the NLM.
*/
uint32_t
nlm_acquire_next_sysid(void)
{
uint32_t next_sysid;
mtx_lock(&nlm_global_lock);
next_sysid = nlm_next_sysid++;
mtx_unlock(&nlm_global_lock);
return (next_sysid);
}
/*
* Return non-zero if the address parts of the two sockaddrs are the
* same.
*/
static int
nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b)
{
const struct sockaddr_in *a4, *b4;
#ifdef INET6
const struct sockaddr_in6 *a6, *b6;
#endif
if (a->sa_family != b->sa_family)
return (FALSE);
switch (a->sa_family) {
case AF_INET:
a4 = (const struct sockaddr_in *) a;
b4 = (const struct sockaddr_in *) b;
return !memcmp(&a4->sin_addr, &b4->sin_addr,
sizeof(a4->sin_addr));
#ifdef INET6
case AF_INET6:
a6 = (const struct sockaddr_in6 *) a;
b6 = (const struct sockaddr_in6 *) b;
return !memcmp(&a6->sin6_addr, &b6->sin6_addr,
sizeof(a6->sin6_addr));
#endif
}
return (0);
}
/*
* Check for idle hosts and stop monitoring them. We could also free
* the host structure here, possibly after a larger timeout but that
* would require some care to avoid races with
* e.g. nlm_host_lock_count_sysctl.
*/
static void
nlm_check_idle(void)
{
struct nlm_host *host;
mtx_assert(&nlm_global_lock, MA_OWNED);
if (time_uptime <= nlm_next_idle_check)
return;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_monstate == NLM_MONITORED
&& time_uptime > host->nh_idle_timeout) {
mtx_unlock(&nlm_global_lock);
if (lf_countlocks(host->nh_sysid) > 0
|| lf_countlocks(NLM_SYSID_CLIENT
+ host->nh_sysid)) {
host->nh_idle_timeout =
time_uptime + NLM_IDLE_TIMEOUT;
mtx_lock(&nlm_global_lock);
continue;
}
nlm_host_unmonitor(host);
mtx_lock(&nlm_global_lock);
}
}
}
/*
* Search for an existing NLM host that matches the given name
* (typically the caller_name element of an nlm4_lock). If none is
* found, create a new host. If 'addr' is non-NULL, record the remote
* address of the host so that we can call it back for async
* responses. If 'vers' is greater than zero then record the NLM
* program version to use to communicate with this client.
*/
struct nlm_host *
nlm_find_host_by_name(const char *name, const struct sockaddr *addr,
rpcvers_t vers)
{
struct nlm_host *host;
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (!strcmp(host->nh_caller_name, name))
break;
}
if (!host) {
host = nlm_create_host(name);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
/*
* If we have an address for the host, record it so that we
* can send async replies etc.
*/
if (addr) {
KASSERT(addr->sa_len < sizeof(struct sockaddr_storage),
("Strange remote transport address length"));
/*
* If we have seen an address before and we currently
* have an RPC client handle, make sure the address is
* the same, otherwise discard the client handle.
*/
if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) {
if (!nlm_compare_addr(
(struct sockaddr *) &host->nh_addr,
addr)
|| host->nh_vers != vers) {
CLIENT *client;
mtx_lock(&host->nh_lock);
client = host->nh_srvrpc.nr_client;
host->nh_srvrpc.nr_client = NULL;
mtx_unlock(&host->nh_lock);
if (client) {
CLNT_RELEASE(client);
}
}
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Search for an existing NLM host that matches the given remote
* address. If none is found, create a new host with the requested
* address and remember 'vers' as the NLM protocol version to use for
* that host.
*/
struct nlm_host *
nlm_find_host_by_addr(const struct sockaddr *addr, int vers)
{
/*
* Fake up a name using inet_ntop. This buffer is
* large enough for an IPv6 address.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
struct nlm_host *host;
switch (addr->sa_family) {
case AF_INET:
inet_ntop(AF_INET,
&((const struct sockaddr_in *) addr)->sin_addr,
tmp, sizeof tmp);
break;
#ifdef INET6
case AF_INET6:
inet_ntop(AF_INET6,
&((const struct sockaddr_in6 *) addr)->sin6_addr,
tmp, sizeof tmp);
break;
#endif
default:
strlcpy(tmp, "<unknown>", sizeof(tmp));
}
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by its address.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (nlm_compare_addr(addr,
(const struct sockaddr *) &host->nh_addr))
break;
}
if (!host) {
host = nlm_create_host(tmp);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Find the NLM host that matches the value of 'sysid'. If none
* exists, return NULL.
*/
static struct nlm_host *
nlm_find_host_by_sysid(int sysid)
{
struct nlm_host *host;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_sysid == sysid) {
refcount_acquire(&host->nh_refs);
return (host);
}
}
return (NULL);
}
void
nlm_host_release(struct nlm_host *host)
{
if (refcount_release(&host->nh_refs)) {
/*
* Free the host
*/
nlm_host_destroy(host);
}
}
/*
* Unregister this NLM host with the local NSM due to idleness.
*/
static void
nlm_host_unmonitor(struct nlm_host *host)
{
mon_id smmonid;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* Build the same mon_id we used when we first monitored this host
* so that the local NSM can find and remove its entry.
*/
smmonid.mon_name = host->nh_caller_name;
smmonid.my_id.my_name = "localhost";
smmonid.my_id.my_prog = NLM_PROG;
smmonid.my_id.my_vers = NLM_SM;
smmonid.my_id.my_proc = NLM_SM_NOTIFY;
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON,
(xdrproc_t) xdr_mon, &smmonid,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to unmonitor %s\n",
host->nh_caller_name);
return;
}
host->nh_monstate = NLM_UNMONITORED;
}
/*
* Register this NLM host with the local NSM so that we can be
* notified if it reboots.
*/
void
nlm_host_monitor(struct nlm_host *host, int state)
{
mon smmon;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
if (state && !host->nh_state) {
/*
* This is the first time we have seen an NSM state
* value for this host. We record it here to help
* detect host reboots.
*/
host->nh_state = state;
NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n",
host->nh_caller_name, host->nh_sysid, state);
}
mtx_lock(&host->nh_lock);
if (host->nh_monstate != NLM_UNMONITORED) {
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
mtx_unlock(&host->nh_lock);
NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* We put our assigned system ID value in the priv field to
* make it simpler to find the host if we are notified of a
* host restart.
*/
smmon.mon_id.mon_name = host->nh_caller_name;
smmon.mon_id.my_id.my_name = "localhost";
smmon.mon_id.my_id.my_prog = NLM_PROG;
smmon.mon_id.my_id.my_vers = NLM_SM;
smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_MON,
(xdrproc_t) xdr_mon, &smmon,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to monitor %s\n",
host->nh_caller_name);
mtx_lock(&host->nh_lock);
host->nh_monstate = NLM_MONITOR_FAILED;
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
}
/*
* Return an RPC client handle that can be used to talk to the NLM
* running on the given host.
*/
CLIENT *
nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
{
struct nlm_rpc *rpc;
CLIENT *client;
mtx_lock(&host->nh_lock);
if (isserver)
rpc = &host->nh_srvrpc;
else
rpc = &host->nh_clntrpc;
/*
* We can't hold onto RPC handles for too long - the async
* call/reply protocol used by some NLM clients makes it hard
* to tell when they change port numbers (e.g. after a
* reboot). Note that if a client reboots while it isn't
* holding any locks, it won't bother to notify us. We
* expire the RPC handles after two minutes.
*/
if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
client = rpc->nr_client;
rpc->nr_client = NULL;
mtx_unlock(&host->nh_lock);
CLNT_RELEASE(client);
mtx_lock(&host->nh_lock);
}
if (!rpc->nr_client) {
mtx_unlock(&host->nh_lock);
client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
NLM_PROG, host->nh_vers);
mtx_lock(&host->nh_lock);
if (client) {
if (rpc->nr_client) {
mtx_unlock(&host->nh_lock);
CLNT_DESTROY(client);
mtx_lock(&host->nh_lock);
} else {
rpc->nr_client = client;
rpc->nr_create_time = time_uptime;
}
}
}
client = rpc->nr_client;
if (client)
CLNT_ACQUIRE(client);
mtx_unlock(&host->nh_lock);
return (client);
}
int
nlm_host_get_sysid(struct nlm_host *host)
{
return (host->nh_sysid);
}
int
nlm_host_get_state(struct nlm_host *host)
{
return (host->nh_state);
}
void *
nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
{
struct nlm_waiting_lock *nw;
nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
nw->nw_lock = *lock;
memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len);
nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
nw->nw_waiting = TRUE;
nw->nw_vp = vp;
mtx_lock(&nlm_global_lock);
TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
return nw;
}
void
nlm_deregister_wait_lock(void *handle)
{
struct nlm_waiting_lock *nw = handle;
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
}
int
nlm_wait_lock(void *handle, int timo)
{
struct nlm_waiting_lock *nw = handle;
int error, stops_deferred;
/*
* If the granted message arrived before we got here,
* nw->nw_waiting will be FALSE - in that case, don't sleep.
*/
mtx_lock(&nlm_global_lock);
error = 0;
if (nw->nw_waiting) {
stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
sigallowstop(stops_deferred);
}
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
if (error) {
/*
* The granted message may arrive after the
* interrupt/timeout but before we manage to lock the
* mutex. Detect this by examining nw_waiting.
*/
if (!nw->nw_waiting)
error = 0;
} else {
/*
* If nlm_cancel_wait is called, then error will be
* zero but nw_waiting will still be TRUE. We
* translate this into EINTR.
*/
if (nw->nw_waiting)
error = EINTR;
}
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
return (error);
}
void
nlm_cancel_wait(struct vnode *vp)
{
struct nlm_waiting_lock *nw;
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (nw->nw_vp == vp) {
wakeup(nw);
}
}
mtx_unlock(&nlm_global_lock);
}
/**********************************************************************/
/*
* Syscall interface with userland.
*/
extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp);
static int
nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs)
{
static rpcvers_t versions[] = {
NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4
};
static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = {
nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4
};
SVCXPRT **xprts;
char netid[16];
char uaddr[128];
struct netconfig *nconf;
int i, j, error;
if (!addr_count) {
NLM_ERR("NLM: no service addresses given - can't start server");
return (EINVAL);
}
if (addr_count < 0 || addr_count > 256 ) {
NLM_ERR("NLM: too many service addresses (%d) given, "
"max 256 - can't start server\n", addr_count);
return (EINVAL);
}
xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO);
for (i = 0; i < nitems(versions); i++) {
for (j = 0; j < addr_count; j++) {
/*
* Create transports for the first version and
* then just register everything else to the
* same transports.
*/
if (i == 0) {
char *up;
error = copyin(&addrs[2*j], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, netid, sizeof(netid),
NULL);
if (error)
goto out;
error = copyin(&addrs[2*j+1], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, uaddr, sizeof(uaddr),
NULL);
if (error)
goto out;
nconf = getnetconfigent(netid);
if (!nconf) {
NLM_ERR("Can't lookup netid %s\n",
netid);
error = EINVAL;
goto out;
}
xprts[j] = svc_tp_create(pool, dispatchers[i],
NLM_PROG, versions[i], uaddr, nconf);
if (!xprts[j]) {
NLM_ERR("NLM: unable to create "
"(NLM_PROG, %d).\n", versions[i]);
error = EINVAL;
goto out;
}
freenetconfigent(nconf);
} else {
nconf = getnetconfigent(xprts[j]->xp_netid);
rpcb_unset(NLM_PROG, versions[i], nconf);
if (!svc_reg(xprts[j], NLM_PROG, versions[i],
dispatchers[i], nconf)) {
NLM_ERR("NLM: can't register "
"(NLM_PROG, %d)\n", versions[i]);
error = EINVAL;
goto out;
}
}
}
}
error = 0;
out:
for (j = 0; j < addr_count; j++) {
if (xprts[j])
SVC_RELEASE(xprts[j]);
}
free(xprts, M_NLM);
return (error);
}
/*
* Main server entry point. Contacts the local NSM to get its current
* state and send SM_UNMON_ALL. Registers the NLM services and then
* services requests. Does not return until the server is interrupted
* by a signal.
*/
static int
nlm_server_main(int addr_count, char **addrs)
{
struct thread *td = curthread;
int error;
SVCPOOL *pool = NULL;
struct sockopt opt;
int portlow;
#ifdef INET6
struct sockaddr_in6 sin6;
#endif
struct sockaddr_in sin;
my_id id;
sm_stat smstat;
struct timeval timo;
enum clnt_stat stat;
struct nlm_host *host, *nhost;
struct nlm_waiting_lock *nw;
vop_advlock_t *old_nfs_advlock;
vop_reclaim_t *old_nfs_reclaim;
if (nlm_is_running != 0) {
NLM_ERR("NLM: can't start server - "
"it appears to be running already\n");
return (EPERM);
}
if (nlm_socket == NULL) {
memset(&opt, 0, sizeof(opt));
error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv4 socket - error %d\n",
error);
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IP;
opt.sopt_name = IP_PORTRANGE;
portlow = IP_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket, &opt);
#ifdef INET6
nlm_socket6 = NULL;
error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv6 socket - error %d\n",
error);
soclose(nlm_socket);
nlm_socket = NULL;
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IPV6;
opt.sopt_name = IPV6_PORTRANGE;
portlow = IPV6_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket6, &opt);
#endif
}
nlm_auth = authunix_create(curthread->td_ucred);
#ifdef INET6
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = in6addr_loopback;
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS);
if (!nlm_nsm) {
#endif
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG,
SM_VERS);
#ifdef INET6
}
#endif
if (!nlm_nsm) {
NLM_ERR("Can't start NLM - unable to contact NSM\n");
error = EINVAL;
goto out;
}
pool = svcpool_create("NLM", NULL);
error = nlm_register_services(pool, addr_count, addrs);
if (error)
goto out;
memset(&id, 0, sizeof(id));
id.my_name = "NFS NLM";
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL,
(xdrproc_t) xdr_my_id, &id,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
struct rpc_err err;
CLNT_GETERR(nlm_nsm, &err);
NLM_ERR("NLM: unexpected error contacting NSM, "
"stat=%d, errno=%d\n", stat, err.re_errno);
error = EINVAL;
goto out;
}
nlm_is_running = 1;
NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state);
nlm_nsm_state = smstat.state;
old_nfs_advlock = nfs_advlock_p;
nfs_advlock_p = nlm_advlock;
old_nfs_reclaim = nfs_reclaim_p;
nfs_reclaim_p = nlm_reclaim;
svc_run(pool);
error = 0;
nfs_advlock_p = old_nfs_advlock;
nfs_reclaim_p = old_nfs_reclaim;
out:
nlm_is_running = 0;
if (pool)
svcpool_destroy(pool);
/*
* We are finished communicating with the NSM.
*/
if (nlm_nsm) {
CLNT_RELEASE(nlm_nsm);
nlm_nsm = NULL;
}
/*
* Trash all the existing state so that if the server
* restarts, it gets a clean slate. This is complicated by the
* possibility that there may be other threads trying to make
* client locking requests.
*
* First we fake a client reboot notification which will
* cancel any pending async locks and purge remote lock state
* from the local lock manager. We release the reference from
* nlm_hosts to the host (which may remove it from the list
* and free it). After this phase, the only entries in the
* nlm_host list should be from other threads performing
* client lock requests.
*/
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
wakeup(nw);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_unlock(&nlm_global_lock);
nlm_host_notify(host, 0);
nlm_host_release(host);
mtx_lock(&nlm_global_lock);
}
mtx_unlock(&nlm_global_lock);
AUTH_DESTROY(nlm_auth);
return (error);
}
int
sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
{
int error;
#if __FreeBSD_version >= 700000
error = priv_check(td, PRIV_NFS_LOCKD);
#else
error = suser(td);
#endif
if (error)
return (error);
nlm_debug_level = uap->debug_level;
nlm_grace_threshold = time_uptime + uap->grace_period;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
return nlm_server_main(uap->addr_count, uap->addrs);
}
/**********************************************************************/
/*
* NLM implementation details, called from the RPC stubs.
*/
void
nlm_sm_notify(struct nlm_sm_status *argp)
{
uint32_t sysid;
struct nlm_host *host;
NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name);
memcpy(&sysid, &argp->priv, sizeof(sysid));
host = nlm_find_host_by_sysid(sysid);
if (host) {
nlm_host_notify(host, argp->state);
nlm_host_release(host);
}
}
static void
nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p)
{
memcpy(fhp, p->n_bytes, sizeof(fhandle_t));
}
struct vfs_state {
struct mount *vs_mp;
struct vnode *vs_vp;
int vs_vnlocked;
};
static int
nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp,
fhandle_t *fhp, struct vfs_state *vs, accmode_t accmode)
{
int error, exflags;
struct ucred *cred = NULL, *credanon = NULL;
memset(vs, 0, sizeof(*vs));
vs->vs_mp = vfs_getvfs(&fhp->fh_fsid);
if (!vs->vs_mp) {
return (ESTALE);
}
/* accmode == 0 means don't check, since it is an unlock. */
if (accmode != 0) {
error = VFS_CHECKEXP(vs->vs_mp,
(struct sockaddr *)&host->nh_addr, &exflags, &credanon,
NULL, NULL);
if (error)
goto out;
if (exflags & MNT_EXRDONLY ||
(vs->vs_mp->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto out;
}
}
error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp);
if (error)
goto out;
vs->vs_vnlocked = TRUE;
if (accmode != 0) {
if (!svc_getcred(rqstp, &cred, NULL)) {
error = EINVAL;
goto out;
}
if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
crfree(cred);
cred = credanon;
credanon = NULL;
}
/*
* Check cred.
*/
error = VOP_ACCESS(vs->vs_vp, accmode, cred, curthread);
/*
* If this failed and accmode != VWRITE, try again with
* VWRITE to maintain backwards compatibility with the
* old code that always used VWRITE.
*/
if (error != 0 && accmode != VWRITE)
error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread);
if (error)
goto out;
}
#if __FreeBSD_version < 800011
VOP_UNLOCK(vs->vs_vp, 0, curthread);
#else
VOP_UNLOCK(vs->vs_vp, 0);
#endif
vs->vs_vnlocked = FALSE;
out:
if (cred)
crfree(cred);
if (credanon)
crfree(credanon);
return (error);
}
static void
nlm_release_vfs_state(struct vfs_state *vs)
{
if (vs->vs_vp) {
if (vs->vs_vnlocked)
vput(vs->vs_vp);
else
vrele(vs->vs_vp);
}
if (vs->vs_mp)
vfs_rel(vs->vs_mp);
}
static nlm4_stats
nlm_convert_error(int error)
{
if (error == ESTALE)
return nlm4_stale_fh;
else if (error == EROFS)
return nlm4_rofs;
else
return nlm4_failed;
}
int
nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host, *bhost;
int error, sysid;
struct flock fl;
accmode_t accmode;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
accmode = argp->exclusive ? VWRITE : VREAD;
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE);
if (error) {
result->stat.stat = nlm4_failed;
goto out;
}
if (fl.l_type == F_UNLCK) {
result->stat.stat = nlm4_granted;
} else {
result->stat.stat = nlm4_denied;
result->stat.nlm4_testrply_u.holder.exclusive =
(fl.l_type == F_WRLCK);
result->stat.nlm4_testrply_u.holder.svid = fl.l_pid;
bhost = nlm_find_host_by_sysid(fl.l_sysid);
if (bhost) {
/*
* We don't have any useful way of recording
* the value of oh used in the original lock
* request. Ideally, the test reply would have
* a space for the owning host's name allowing
* our caller's NLM to keep track.
*
* As far as I can see, Solaris uses an eight
* byte structure for oh which contains a four
* byte pid encoded in local byte order and
* the first four bytes of the host
* name. Linux uses a variable length string
* 'pid@hostname' in ascii but doesn't even
* return that in test replies.
*
* For the moment, return nothing in oh
* (already zero'ed above).
*/
nlm_host_release(bhost);
}
result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start;
result->stat.nlm4_testrply_u.holder.l_len = fl.l_len;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp,
bool_t monitor, CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
accmode_t accmode;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
if (monitor && host->nh_state && argp->state
&& host->nh_state != argp->state) {
/*
* The host rebooted without telling us. Trash its
* locks.
*/
nlm_host_notify(host, argp->state);
}
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold && !argp->reclaim) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
accmode = argp->exclusive ? VWRITE : VREAD;
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
if (argp->block) {
struct nlm_async_lock *af;
CLIENT *client;
struct nlm_grantcookie cookie;
/*
* First, make sure we can contact the host's NLM.
*/
client = nlm_host_get_rpc(host, TRUE);
if (!client) {
result->stat.stat = nlm4_failed;
goto out;
}
/*
* Next, check whether there is an existing blocked
* lock that matches. This could be a
* badly behaved client or an RPC re-send. If we find
* one, just return nlm4_blocked.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
cookie.ng_sysid = host->nh_sysid;
cookie.ng_cookie = host->nh_grantcookie++;
}
mtx_unlock(&host->nh_lock);
if (af) {
CLNT_RELEASE(client);
result->stat.stat = nlm4_blocked;
goto out;
}
af = malloc(sizeof(struct nlm_async_lock), M_NLM,
M_WAITOK|M_ZERO);
TASK_INIT(&af->af_task, 0, nlm_lock_callback, af);
af->af_vp = vs.vs_vp;
af->af_fl = fl;
af->af_host = host;
af->af_rpc = client;
/*
* We use M_RPC here so that we can xdr_free the thing
* later.
*/
nlm_make_netobj(&af->af_granted.cookie,
(caddr_t)&cookie, sizeof(cookie), M_RPC);
af->af_granted.exclusive = argp->exclusive;
af->af_granted.alock.caller_name =
strdup(argp->alock.caller_name, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.fh,
&argp->alock.fh, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.oh,
&argp->alock.oh, M_RPC);
af->af_granted.alock.svid = argp->alock.svid;
af->af_granted.alock.l_offset = argp->alock.l_offset;
af->af_granted.alock.l_len = argp->alock.l_len;
/*
* Put the entry on the pending list before calling
* VOP_ADVLOCKASYNC. We do this in case the lock
* request was blocked (returning EINPROGRESS) but
* then granted before we manage to run again. The
* client may receive the granted message before we
* send our blocked reply, but that's their problem.
*/
mtx_lock(&host->nh_lock);
TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE,
&af->af_task, &af->af_cookie);
/*
* If the lock completed synchronously, just free the
* tracking structure now.
*/
if (error != EINPROGRESS) {
CLNT_RELEASE(af->af_rpc);
mtx_lock(&host->nh_lock);
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
xdr_free((xdrproc_t) xdr_nlm4_testargs,
&af->af_granted);
free(af, M_NLM);
} else {
NLM_DEBUG(2, "NLM: pending async lock %p for %s "
"(sysid %d)\n", af, host->nh_caller_name, sysid);
/*
* Don't vrele the vnode just yet - this must
* wait until either the async callback
* happens or the lock is cancelled.
*/
vs.vs_vp = NULL;
}
} else {
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE);
}
if (error) {
if (error == EINPROGRESS) {
result->stat.stat = nlm4_blocked;
} else if (error == EDEADLK) {
result->stat.stat = nlm4_deadlck;
} else if (error == EAGAIN) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_failed;
}
} else {
if (monitor)
nlm_host_monitor(host, argp->state);
result->stat.stat = nlm4_granted;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
struct nlm_async_lock *af;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
/*
* First we need to try to find the async lock request - if
* there isn't one, we give up and return nlm4_denied.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
mtx_unlock(&host->nh_lock);
result->stat.stat = nlm4_denied;
goto out;
}
error = nlm_cancel_async_lock(af);
if (error) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_granted;
}
mtx_unlock(&host->nh_lock);
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE);
/*
* Ignore the error - there is no result code for failure,
* only for grace period.
*/
result->stat.stat = nlm4_granted;
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
struct nlm_host *host;
struct nlm_waiting_lock *nw;
memset(result, 0, sizeof(*result));
host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
result->stat.stat = nlm4_denied;
KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out);
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (!nw->nw_waiting)
continue;
if (argp->alock.svid == nw->nw_lock.svid
&& argp->alock.l_offset == nw->nw_lock.l_offset
&& argp->alock.l_len == nw->nw_lock.l_len
&& argp->alock.fh.n_len == nw->nw_lock.fh.n_len
&& !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len)) {
nw->nw_waiting = FALSE;
wakeup(nw);
result->stat.stat = nlm4_granted;
break;
}
}
mtx_unlock(&nlm_global_lock);
out:
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
void
nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp)
{
struct nlm_host *host = NULL;
struct nlm_async_lock *af = NULL;
int error;
if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) {
NLM_DEBUG(1, "NLM: bogus grant cookie");
goto out;
}
host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie));
if (!host) {
NLM_DEBUG(1, "NLM: Unknown host rejected our grant");
goto out;
}
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_granted, af_link)
if (ng_cookie(&argp->cookie) ==
ng_cookie(&af->af_granted.cookie))
break;
if (af)
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
if (!af) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant "
"with unrecognized cookie %d:%d", host->nh_caller_name,
host->nh_sysid, ng_sysid(&argp->cookie),
ng_cookie(&argp->cookie));
goto out;
}
if (argp->stat.stat != nlm4_granted) {
af->af_fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE);
if (error) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant "
"and we failed to unlock (%d)", host->nh_caller_name,
host->nh_sysid, error);
goto out;
}
NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
} else {
NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
}
out:
if (af)
nlm_free_async_lock(af);
if (host)
nlm_host_release(host);
}
void
nlm_do_free_all(nlm4_notify *argp)
{
struct nlm_host *host, *thost;
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) {
if (!strcmp(host->nh_caller_name, argp->name))
nlm_host_notify(host, argp->state);
}
}
/*
* Kernel module glue
*/
static int
nfslockd_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
return (0);
case MOD_UNLOAD:
/* The NLM module cannot be safely unloaded. */
/* FALLTHROUGH */
default:
return (EOPNOTSUPP);
}
}
static moduledata_t nfslockd_mod = {
"nfslockd",
nfslockd_modevent,
NULL,
};
DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_DEPEND(nfslockd, krpc, 1, 1, 1);
MODULE_DEPEND(nfslockd, nfslock, 1, 1, 1);
MODULE_VERSION(nfslockd, 1);
Index: head/sys/opencrypto/crypto.c
===================================================================
--- head/sys/opencrypto/crypto.c (revision 327172)
+++ head/sys/opencrypto/crypto.c (revision 327173)
@@ -1,1781 +1,1780 @@
/*-
* Copyright (c) 2002-2006 Sam Leffler. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Cryptographic Subsystem.
*
* This code is derived from the OpenBSD Cryptographic Framework (OCF)
* that has the copyright shown below. Very little of the original
* code remains.
*/
/*-
* The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
*
* This code was written by Angelos D. Keromytis in Athens, Greece, in
* February 2000. Network Security Technologies Inc. (NSTI) kindly
* supported the development of this code.
*
* Copyright (c) 2000, 2001 Angelos D. Keromytis
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all source code copies of any software which is or includes a copy or
* modification of this software.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#define CRYPTO_TIMING /* enable timing support */
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <ddb/ddb.h>
#include <vm/uma.h>
#include <crypto/intake.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/xform.h> /* XXX for M_XDATA */
#include <sys/kobj.h>
#include <sys/bus.h>
#include "cryptodev_if.h"
#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
#include <machine/pcb.h>
#endif
SDT_PROVIDER_DEFINE(opencrypto);
/*
* Crypto drivers register themselves by allocating a slot in the
* crypto_drivers table with crypto_get_driverid() and then registering
* each algorithm they support with crypto_register() and crypto_kregister().
*/
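/*
 * A typical driver attach sequence is roughly the sketch below; the
 * flag and algorithm constants are only illustrative, a real driver
 * passes whatever it actually supports:
 *
 *	int32_t id = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE);
 *	if (id >= 0)
 *		crypto_register(id, CRYPTO_AES_CBC, 0, 0);
 */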
static struct mtx crypto_drivers_mtx; /* lock on driver table */
#define CRYPTO_DRIVER_LOCK() mtx_lock(&crypto_drivers_mtx)
#define CRYPTO_DRIVER_UNLOCK() mtx_unlock(&crypto_drivers_mtx)
#define CRYPTO_DRIVER_ASSERT() mtx_assert(&crypto_drivers_mtx, MA_OWNED)
/*
* Crypto device/driver capabilities structure.
*
* Synchronization:
* (d) - protected by CRYPTO_DRIVER_LOCK()
* (q) - protected by CRYPTO_Q_LOCK()
* Not tagged fields are read-only.
*/
struct cryptocap {
device_t cc_dev; /* (d) device/driver */
u_int32_t cc_sessions; /* (d) # of sessions */
u_int32_t cc_koperations; /* (d) # of asym operations */
/*
* Largest possible operator length (in bits) for each type of
* encryption algorithm. XXX not used
*/
u_int16_t cc_max_op_len[CRYPTO_ALGORITHM_MAX + 1];
u_int8_t cc_alg[CRYPTO_ALGORITHM_MAX + 1];
u_int8_t cc_kalg[CRK_ALGORITHM_MAX + 1];
int cc_flags; /* (d) flags */
#define CRYPTOCAP_F_CLEANUP 0x80000000 /* needs resource cleanup */
int cc_qblocked; /* (q) symmetric q blocked */
int cc_kqblocked; /* (q) asymmetric q blocked */
};
static struct cryptocap *crypto_drivers = NULL;
static int crypto_drivers_num = 0;
/*
* There are two queues for crypto requests; one for symmetric (e.g.
* cipher) operations and one for asymmetric (e.g. MOD) operations.
* A single mutex is used to lock access to both queues. We could
* have one per-queue but having one simplifies handling of block/unblock
* operations.
*/
static int crp_sleep = 0;
static TAILQ_HEAD(cryptop_q ,cryptop) crp_q; /* request queues */
static TAILQ_HEAD(,cryptkop) crp_kq;
static struct mtx crypto_q_mtx;
#define CRYPTO_Q_LOCK() mtx_lock(&crypto_q_mtx)
#define CRYPTO_Q_UNLOCK() mtx_unlock(&crypto_q_mtx)
/*
* Taskqueue used to dispatch the crypto requests
* that have the CRYPTO_F_ASYNC flag
*/
static struct taskqueue *crypto_tq;
/*
* Crypto seq numbers are operated on with modular arithmetic
*/
#define CRYPTO_SEQ_GT(a,b) ((int)((a)-(b)) > 0)
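/*
 * The unsigned subtraction followed by a signed comparison handles
 * wrap-around: e.g. for a = 0x00000002 and b = 0xfffffffe the
 * difference is 4, so 'a' is treated as the more recent sequence
 * number even though it is numerically smaller.
 */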
struct crypto_ret_worker {
struct mtx crypto_ret_mtx;
TAILQ_HEAD(,cryptop) crp_ordered_ret_q; /* ordered callback queue for symmetric jobs */
TAILQ_HEAD(,cryptop) crp_ret_q; /* callback queue for symmetric jobs */
TAILQ_HEAD(,cryptkop) crp_ret_kq; /* callback queue for asym jobs */
u_int32_t reorder_ops; /* total ordered sym jobs received */
u_int32_t reorder_cur_seq; /* current sym job dispatched */
struct proc *cryptoretproc;
};
static struct crypto_ret_worker *crypto_ret_workers = NULL;
#define CRYPTO_RETW(i) (&crypto_ret_workers[i])
#define CRYPTO_RETW_ID(w) ((w) - crypto_ret_workers)
#define FOREACH_CRYPTO_RETW(w) \
for (w = crypto_ret_workers; w < crypto_ret_workers + crypto_workers_num; ++w)
#define CRYPTO_RETW_LOCK(w) mtx_lock(&w->crypto_ret_mtx)
#define CRYPTO_RETW_UNLOCK(w) mtx_unlock(&w->crypto_ret_mtx)
#define CRYPTO_RETW_EMPTY(w) \
(TAILQ_EMPTY(&w->crp_ret_q) && TAILQ_EMPTY(&w->crp_ret_kq) && TAILQ_EMPTY(&w->crp_ordered_ret_q))
static int crypto_workers_num = 0;
SYSCTL_INT(_kern, OID_AUTO, crypto_workers_num, CTLFLAG_RDTUN,
&crypto_workers_num, 0,
"Number of crypto workers used to dispatch crypto jobs");
static uma_zone_t cryptop_zone;
static uma_zone_t cryptodesc_zone;
int crypto_userasymcrypto = 1; /* userland may do asym crypto reqs */
SYSCTL_INT(_kern, OID_AUTO, userasymcrypto, CTLFLAG_RW,
&crypto_userasymcrypto, 0,
"Enable/disable user-mode access to asymmetric crypto support");
int crypto_devallowsoft = 0; /* only use hardware crypto */
SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RW,
&crypto_devallowsoft, 0,
"Enable/disable use of software crypto by /dev/crypto");
MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records");
static void crypto_proc(void);
static struct proc *cryptoproc;
static void crypto_ret_proc(struct crypto_ret_worker *ret_worker);
static void crypto_destroy(void);
static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint);
static int crypto_kinvoke(struct cryptkop *krp, int flags);
static void crypto_task_invoke(void *ctx, int pending);
static void crypto_batch_enqueue(struct cryptop *crp);
static struct cryptostats cryptostats;
SYSCTL_STRUCT(_kern, OID_AUTO, crypto_stats, CTLFLAG_RW, &cryptostats,
cryptostats, "Crypto system statistics");
#ifdef CRYPTO_TIMING
static int crypto_timing = 0;
SYSCTL_INT(_debug, OID_AUTO, crypto_timing, CTLFLAG_RW,
&crypto_timing, 0, "Enable/disable crypto timing support");
#endif
/* Try to avoid directly exposing the key buffer as a symbol */
static struct keybuf *keybuf;
static struct keybuf empty_keybuf = {
.kb_nents = 0
};
/* Obtain the key buffer from boot metadata */
static void
keybuf_init(void)
{
caddr_t kmdp;
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
keybuf = (struct keybuf *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_KEYBUF);
if (keybuf == NULL)
keybuf = &empty_keybuf;
}
/* It'd be nice if we could store these in some kind of secure memory... */
struct keybuf *
get_keybuf(void)
{
return (keybuf);
}
static int
crypto_init(void)
{
struct crypto_ret_worker *ret_worker;
int error;
mtx_init(&crypto_drivers_mtx, "crypto", "crypto driver table",
MTX_DEF|MTX_QUIET);
TAILQ_INIT(&crp_q);
TAILQ_INIT(&crp_kq);
mtx_init(&crypto_q_mtx, "crypto", "crypto op queues", MTX_DEF);
cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop),
0, 0, 0, 0,
UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc),
0, 0, 0, 0,
UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
if (cryptodesc_zone == NULL || cryptop_zone == NULL) {
printf("crypto_init: cannot setup crypto zones\n");
error = ENOMEM;
goto bad;
}
crypto_drivers_num = CRYPTO_DRIVERS_INITIAL;
crypto_drivers = malloc(crypto_drivers_num *
sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT | M_ZERO);
if (crypto_drivers == NULL) {
printf("crypto_init: cannot setup crypto drivers\n");
error = ENOMEM;
goto bad;
}
if (crypto_workers_num < 1 || crypto_workers_num > mp_ncpus)
crypto_workers_num = mp_ncpus;
crypto_tq = taskqueue_create("crypto", M_WAITOK|M_ZERO,
taskqueue_thread_enqueue, &crypto_tq);
if (crypto_tq == NULL) {
printf("crypto init: cannot setup crypto taskqueue\n");
error = ENOMEM;
goto bad;
}
taskqueue_start_threads(&crypto_tq, crypto_workers_num, PRI_MIN_KERN,
"crypto");
error = kproc_create((void (*)(void *)) crypto_proc, NULL,
&cryptoproc, 0, 0, "crypto");
if (error) {
printf("crypto_init: cannot start crypto thread; error %d",
error);
goto bad;
}
crypto_ret_workers = malloc(crypto_workers_num * sizeof(struct crypto_ret_worker),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (crypto_ret_workers == NULL) {
error = ENOMEM;
printf("crypto_init: cannot allocate ret workers\n");
goto bad;
}
FOREACH_CRYPTO_RETW(ret_worker) {
TAILQ_INIT(&ret_worker->crp_ordered_ret_q);
TAILQ_INIT(&ret_worker->crp_ret_q);
TAILQ_INIT(&ret_worker->crp_ret_kq);
ret_worker->reorder_ops = 0;
ret_worker->reorder_cur_seq = 0;
mtx_init(&ret_worker->crypto_ret_mtx, "crypto", "crypto return queues", MTX_DEF);
error = kproc_create((void (*)(void *)) crypto_ret_proc, ret_worker,
&ret_worker->cryptoretproc, 0, 0, "crypto returns %td", CRYPTO_RETW_ID(ret_worker));
if (error) {
printf("crypto_init: cannot start cryptoret thread; error %d",
error);
goto bad;
}
}
keybuf_init();
return 0;
bad:
crypto_destroy();
return error;
}
/*
* Signal a crypto thread to terminate. We use the driver
* table lock to synchronize the sleep/wakeups so that we
* are sure the threads have terminated before we release
* the data structures they use. See crypto_finis below
* for the other half of this song-and-dance.
*/
static void
crypto_terminate(struct proc **pp, void *q)
{
struct proc *p;
mtx_assert(&crypto_drivers_mtx, MA_OWNED);
p = *pp;
*pp = NULL;
if (p) {
wakeup_one(q);
PROC_LOCK(p); /* NB: ensure we don't miss wakeup */
CRYPTO_DRIVER_UNLOCK(); /* let crypto_finis progress */
msleep(p, &p->p_mtx, PWAIT, "crypto_destroy", 0);
PROC_UNLOCK(p);
CRYPTO_DRIVER_LOCK();
}
}
static void
crypto_destroy(void)
{
struct crypto_ret_worker *ret_worker;
/*
* Terminate any crypto threads.
*/
if (crypto_tq != NULL)
taskqueue_drain_all(crypto_tq);
CRYPTO_DRIVER_LOCK();
crypto_terminate(&cryptoproc, &crp_q);
FOREACH_CRYPTO_RETW(ret_worker)
crypto_terminate(&ret_worker->cryptoretproc, &ret_worker->crp_ret_q);
CRYPTO_DRIVER_UNLOCK();
/* XXX flush queues??? */
/*
* Reclaim dynamically allocated resources.
*/
if (crypto_drivers != NULL)
free(crypto_drivers, M_CRYPTO_DATA);
if (cryptodesc_zone != NULL)
uma_zdestroy(cryptodesc_zone);
if (cryptop_zone != NULL)
uma_zdestroy(cryptop_zone);
mtx_destroy(&crypto_q_mtx);
FOREACH_CRYPTO_RETW(ret_worker)
mtx_destroy(&ret_worker->crypto_ret_mtx);
free(crypto_ret_workers, M_CRYPTO_DATA);
if (crypto_tq != NULL)
taskqueue_free(crypto_tq);
mtx_destroy(&crypto_drivers_mtx);
}
static struct cryptocap *
crypto_checkdriver(u_int32_t hid)
{
if (crypto_drivers == NULL)
return NULL;
return (hid >= crypto_drivers_num ? NULL : &crypto_drivers[hid]);
}
/*
* Compare a driver's list of supported algorithms against another
* list; return non-zero if all algorithms are supported.
*/
static int
driver_suitable(const struct cryptocap *cap, const struct cryptoini *cri)
{
const struct cryptoini *cr;
/* See if all the algorithms are supported. */
for (cr = cri; cr; cr = cr->cri_next)
if (cap->cc_alg[cr->cri_alg] == 0)
return 0;
return 1;
}
/*
* Select a driver for a new session that supports the specified
* algorithms and, optionally, is constrained according to the flags.
* The algorithm we use here is pretty stupid; just use the
* first driver that supports all the algorithms we need. If there
* are multiple suitable drivers we choose the one with the fewest
* active sessions. We prefer hardware-backed drivers to software ones.
*
* XXX We need more smarts here (in real life too, but that's
* XXX another story altogether).
*/
static struct cryptocap *
crypto_select_driver(const struct cryptoini *cri, int flags)
{
struct cryptocap *cap, *best;
int match, hid;
CRYPTO_DRIVER_ASSERT();
/*
* Look first for hardware crypto devices if permitted.
*/
if (flags & CRYPTOCAP_F_HARDWARE)
match = CRYPTOCAP_F_HARDWARE;
else
match = CRYPTOCAP_F_SOFTWARE;
best = NULL;
again:
for (hid = 0; hid < crypto_drivers_num; hid++) {
cap = &crypto_drivers[hid];
/*
* If it's not initialized, is in the process of
* going away, or is not appropriate (hardware
* or software based on match), then skip.
*/
if (cap->cc_dev == NULL ||
(cap->cc_flags & CRYPTOCAP_F_CLEANUP) ||
(cap->cc_flags & match) == 0)
continue;
/* verify all the algorithms are supported. */
if (driver_suitable(cap, cri)) {
if (best == NULL ||
cap->cc_sessions < best->cc_sessions)
best = cap;
}
}
if (best == NULL && match == CRYPTOCAP_F_HARDWARE &&
(flags & CRYPTOCAP_F_SOFTWARE)) {
/* sort of an Algol 68-style for loop */
match = CRYPTOCAP_F_SOFTWARE;
goto again;
}
return best;
}
/*
* Create a new session. The crid argument specifies a crypto
* driver to use or constraints on a driver to select (hardware
* only, software only, either). Whatever driver is selected
* must be capable of the requested crypto algorithms.
*/
int
crypto_newsession(u_int64_t *sid, struct cryptoini *cri, int crid)
{
struct cryptocap *cap;
u_int32_t hid, lid;
int err;
CRYPTO_DRIVER_LOCK();
if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
/*
* Use specified driver; verify it is capable.
*/
cap = crypto_checkdriver(crid);
if (cap != NULL && !driver_suitable(cap, cri))
cap = NULL;
} else {
/*
* No requested driver; select based on crid flags.
*/
cap = crypto_select_driver(cri, crid);
/*
* if NULL then can't do everything in one session.
* XXX Fix this. We need to inject a "virtual" session
* XXX layer right about here.
*/
}
if (cap != NULL) {
/* Call the driver initialization routine. */
hid = cap - crypto_drivers;
lid = hid; /* Pass the driver ID. */
err = CRYPTODEV_NEWSESSION(cap->cc_dev, &lid, cri);
if (err == 0) {
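/*
 * Pack the 64-bit session id: the upper 32 bits carry the
 * driver capability byte (cc_flags & 0xff000000) and the
 * driver index (hid), the lower 32 bits the driver-local
 * session id (lid).  CRYPTO_SESID2HID()/CRYPTO_SESID2CAPS()
 * recover these fields on later requests.
 */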
(*sid) = (cap->cc_flags & 0xff000000)
| (hid & 0x00ffffff);
(*sid) <<= 32;
(*sid) |= (lid & 0xffffffff);
cap->cc_sessions++;
} else
CRYPTDEB("dev newsession failed");
} else {
CRYPTDEB("no driver");
err = EOPNOTSUPP;
}
CRYPTO_DRIVER_UNLOCK();
return err;
}
static void
crypto_remove(struct cryptocap *cap)
{
mtx_assert(&crypto_drivers_mtx, MA_OWNED);
if (cap->cc_sessions == 0 && cap->cc_koperations == 0)
bzero(cap, sizeof(*cap));
}
/*
* Delete an existing session (or a reserved session on an unregistered
* driver).
*/
int
crypto_freesession(u_int64_t sid)
{
struct cryptocap *cap;
u_int32_t hid;
int err;
CRYPTO_DRIVER_LOCK();
if (crypto_drivers == NULL) {
err = EINVAL;
goto done;
}
/* Determine two IDs. */
hid = CRYPTO_SESID2HID(sid);
if (hid >= crypto_drivers_num) {
err = ENOENT;
goto done;
}
cap = &crypto_drivers[hid];
if (cap->cc_sessions)
cap->cc_sessions--;
/* Call the driver cleanup routine, if available. */
err = CRYPTODEV_FREESESSION(cap->cc_dev, sid);
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP)
crypto_remove(cap);
done:
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Return an unused driver id. Used by drivers prior to registering
* support for the algorithms they handle.
*/
int32_t
crypto_get_driverid(device_t dev, int flags)
{
struct cryptocap *newdrv;
int i;
if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
printf("%s: no flags specified when registering driver\n",
device_get_nameunit(dev));
return -1;
}
CRYPTO_DRIVER_LOCK();
for (i = 0; i < crypto_drivers_num; i++) {
if (crypto_drivers[i].cc_dev == NULL &&
(crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP) == 0) {
break;
}
}
/* Out of entries, allocate some more. */
if (i == crypto_drivers_num) {
/* Be careful about wrap-around. */
if (2 * crypto_drivers_num <= crypto_drivers_num) {
CRYPTO_DRIVER_UNLOCK();
printf("crypto: driver count wraparound!\n");
return -1;
}
newdrv = malloc(2 * crypto_drivers_num *
sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (newdrv == NULL) {
CRYPTO_DRIVER_UNLOCK();
printf("crypto: no space to expand driver table!\n");
return -1;
}
bcopy(crypto_drivers, newdrv,
crypto_drivers_num * sizeof(struct cryptocap));
crypto_drivers_num *= 2;
free(crypto_drivers, M_CRYPTO_DATA);
crypto_drivers = newdrv;
}
/* NB: state is zero'd on free */
crypto_drivers[i].cc_sessions = 1; /* Mark */
crypto_drivers[i].cc_dev = dev;
crypto_drivers[i].cc_flags = flags;
if (bootverbose)
printf("crypto: assign %s driver id %u, flags 0x%x\n",
device_get_nameunit(dev), i, flags);
CRYPTO_DRIVER_UNLOCK();
return i;
}
/*
* Lookup a driver by name. We match against the full device
* name and unit, and against just the name. The latter gives
* us simple wildcarding by device name. On success return the
* driver/hardware identifier; otherwise return -1.
*/
int
crypto_find_driver(const char *match)
{
int i, len = strlen(match);
CRYPTO_DRIVER_LOCK();
for (i = 0; i < crypto_drivers_num; i++) {
device_t dev = crypto_drivers[i].cc_dev;
if (dev == NULL ||
(crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP))
continue;
if (strncmp(match, device_get_nameunit(dev), len) == 0 ||
strncmp(match, device_get_name(dev), len) == 0)
break;
}
CRYPTO_DRIVER_UNLOCK();
return i < crypto_drivers_num ? i : -1;
}
/*
* Return the device_t for the specified driver or NULL
* if the driver identifier is invalid.
*/
device_t
crypto_find_device_byhid(int hid)
{
struct cryptocap *cap = crypto_checkdriver(hid);
return cap != NULL ? cap->cc_dev : NULL;
}
/*
* Return the device/driver capabilities.
*/
int
crypto_getcaps(int hid)
{
struct cryptocap *cap = crypto_checkdriver(hid);
return cap != NULL ? cap->cc_flags : 0;
}
/*
* Register support for a key-related algorithm. This routine
* is called once for each algorithm supported by a driver.
*/
int
crypto_kregister(u_int32_t driverid, int kalg, u_int32_t flags)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL &&
(CRK_ALGORITM_MIN <= kalg && kalg <= CRK_ALGORITHM_MAX)) {
/*
* XXX Do some performance testing to determine placing.
* XXX We probably need an auxiliary data structure that
* XXX describes relative performances.
*/
cap->cc_kalg[kalg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
if (bootverbose)
printf("crypto: %s registers key alg %u flags %u\n"
, device_get_nameunit(cap->cc_dev)
, kalg
, flags
);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Register support for a non-key-related algorithm. This routine
* is called once for each such algorithm supported by a driver.
*/
int
crypto_register(u_int32_t driverid, int alg, u_int16_t maxoplen,
u_int32_t flags)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
/* NB: algorithms are in the range [1..max] */
if (cap != NULL &&
(CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX)) {
/*
* XXX Do some performance testing to determine placing.
* XXX We probably need an auxiliary data structure that
* XXX describes relative performances.
*/
cap->cc_alg[alg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
cap->cc_max_op_len[alg] = maxoplen;
if (bootverbose)
printf("crypto: %s registers alg %u flags %u maxoplen %u\n"
, device_get_nameunit(cap->cc_dev)
, alg
, flags
, maxoplen
);
cap->cc_sessions = 0; /* Unmark */
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
static void
driver_finis(struct cryptocap *cap)
{
u_int32_t ses, kops;
CRYPTO_DRIVER_ASSERT();
ses = cap->cc_sessions;
kops = cap->cc_koperations;
bzero(cap, sizeof(*cap));
if (ses != 0 || kops != 0) {
/*
* If there are pending sessions,
* just mark as invalid.
*/
cap->cc_flags |= CRYPTOCAP_F_CLEANUP;
cap->cc_sessions = ses;
cap->cc_koperations = kops;
}
}
/*
* Unregister a crypto driver. If there are pending sessions using it,
* leave enough information around so that subsequent calls using those
* sessions will correctly detect the driver has been unregistered and
* reroute requests.
*/
int
crypto_unregister(u_int32_t driverid, int alg)
{
struct cryptocap *cap;
int i, err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL &&
(CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX) &&
cap->cc_alg[alg] != 0) {
cap->cc_alg[alg] = 0;
cap->cc_max_op_len[alg] = 0;
/* Was this the last algorithm ? */
for (i = 1; i <= CRYPTO_ALGORITHM_MAX; i++)
if (cap->cc_alg[i] != 0)
break;
if (i == CRYPTO_ALGORITHM_MAX + 1)
driver_finis(cap);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Unregister all algorithms associated with a crypto driver.
* If there are pending sessions using it, leave enough information
* around so that subsequent calls using those sessions will
* correctly detect the driver has been unregistered and reroute
* requests.
*/
int
crypto_unregister_all(u_int32_t driverid)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL) {
driver_finis(cap);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Clear blockage on a driver. The what parameter indicates whether
* the driver is now ready for cryptop's and/or cryptokop's.
*/
int
crypto_unblock(u_int32_t driverid, int what)
{
struct cryptocap *cap;
int err;
CRYPTO_Q_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL) {
if (what & CRYPTO_SYMQ)
cap->cc_qblocked = 0;
if (what & CRYPTO_ASYMQ)
cap->cc_kqblocked = 0;
if (crp_sleep)
wakeup_one(&crp_q);
err = 0;
} else
err = EINVAL;
CRYPTO_Q_UNLOCK();
return err;
}
/*
* Add a crypto request to a queue, to be processed by the kernel thread.
*/
int
crypto_dispatch(struct cryptop *crp)
{
struct cryptocap *cap;
u_int32_t hid;
int result;
cryptostats.cs_ops++;
#ifdef CRYPTO_TIMING
if (crypto_timing)
binuptime(&crp->crp_tstamp);
#endif
if (CRYPTOP_ASYNC(crp)) {
if (crp->crp_flags & CRYPTO_F_ASYNC_KEEPORDER) {
struct crypto_ret_worker *ret_worker;
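/*
 * Ordered requests are bound to a return worker derived from the
 * session id, so all ops of a session share one ordered queue,
 * and are tagged with a per-worker sequence number used later by
 * crypto_done() and crypto_ret_proc().
 */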
crp->crp_retw_id = crp->crp_sid % crypto_workers_num;
ret_worker = CRYPTO_RETW(crp->crp_retw_id);
CRYPTO_RETW_LOCK(ret_worker);
crp->crp_seq = ret_worker->reorder_ops++;
CRYPTO_RETW_UNLOCK(ret_worker);
}
TASK_INIT(&crp->crp_task, 0, crypto_task_invoke, crp);
taskqueue_enqueue(crypto_tq, &crp->crp_task);
return (0);
}
if ((crp->crp_flags & CRYPTO_F_BATCH) == 0) {
hid = CRYPTO_SESID2HID(crp->crp_sid);
/*
* Caller marked the request to be processed
* immediately; dispatch it directly to the
* driver unless the driver is currently blocked.
*/
cap = crypto_checkdriver(hid);
/* Driver cannot disappear while there is an active session. */
KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__));
if (!cap->cc_qblocked) {
result = crypto_invoke(cap, crp, 0);
if (result != ERESTART)
return (result);
/*
* The driver ran out of resources, put the request on
* the queue.
*/
}
}
crypto_batch_enqueue(crp);
return 0;
}
void
crypto_batch_enqueue(struct cryptop *crp)
{
CRYPTO_Q_LOCK();
TAILQ_INSERT_TAIL(&crp_q, crp, crp_next);
if (crp_sleep)
wakeup_one(&crp_q);
CRYPTO_Q_UNLOCK();
}
/*
* Add an asymmetric crypto request to a queue,
* to be processed by the kernel thread.
*/
int
crypto_kdispatch(struct cryptkop *krp)
{
int error;
cryptostats.cs_kops++;
error = crypto_kinvoke(krp, krp->krp_crid);
if (error == ERESTART) {
CRYPTO_Q_LOCK();
TAILQ_INSERT_TAIL(&crp_kq, krp, krp_next);
if (crp_sleep)
wakeup_one(&crp_q);
CRYPTO_Q_UNLOCK();
error = 0;
}
return error;
}
/*
* Verify a driver is suitable for the specified operation.
*/
static __inline int
kdriver_suitable(const struct cryptocap *cap, const struct cryptkop *krp)
{
return (cap->cc_kalg[krp->krp_op] & CRYPTO_ALG_FLAG_SUPPORTED) != 0;
}
/*
* Select a driver for an asym operation. The driver must
* support the necessary algorithm. The caller can constrain
* which device is selected with the flags parameter. The
* algorithm we use here is pretty stupid; just use the first
* driver that supports the algorithms we need. If there are
* multiple suitable drivers we choose the driver with the
* fewest active operations. We prefer hardware-backed
* drivers to software ones when either may be used.
*/
static struct cryptocap *
crypto_select_kdriver(const struct cryptkop *krp, int flags)
{
- struct cryptocap *cap, *best, *blocked;
+ struct cryptocap *cap, *best;
int match, hid;
CRYPTO_DRIVER_ASSERT();
/*
* Look first for hardware crypto devices if permitted.
*/
if (flags & CRYPTOCAP_F_HARDWARE)
match = CRYPTOCAP_F_HARDWARE;
else
match = CRYPTOCAP_F_SOFTWARE;
best = NULL;
- blocked = NULL;
again:
for (hid = 0; hid < crypto_drivers_num; hid++) {
cap = &crypto_drivers[hid];
/*
* If it's not initialized, is in the process of
* going away, or is not appropriate (hardware
* or software based on match), then skip.
*/
if (cap->cc_dev == NULL ||
(cap->cc_flags & CRYPTOCAP_F_CLEANUP) ||
(cap->cc_flags & match) == 0)
continue;
/* verify all the algorithms are supported. */
if (kdriver_suitable(cap, krp)) {
if (best == NULL ||
cap->cc_koperations < best->cc_koperations)
best = cap;
}
}
if (best != NULL)
return best;
if (match == CRYPTOCAP_F_HARDWARE && (flags & CRYPTOCAP_F_SOFTWARE)) {
/* sort of an Algol 68-style for loop */
match = CRYPTOCAP_F_SOFTWARE;
goto again;
}
return best;
}
/*
* Dispatch an asymmetric crypto request.
*/
static int
crypto_kinvoke(struct cryptkop *krp, int crid)
{
struct cryptocap *cap = NULL;
int error;
KASSERT(krp != NULL, ("%s: krp == NULL", __func__));
KASSERT(krp->krp_callback != NULL,
("%s: krp->crp_callback == NULL", __func__));
CRYPTO_DRIVER_LOCK();
if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
cap = crypto_checkdriver(crid);
if (cap != NULL) {
/*
* Driver present, it must support the necessary
* algorithm and, if s/w drivers are excluded,
* it must be registered as hardware-backed.
*/
if (!kdriver_suitable(cap, krp) ||
(!crypto_devallowsoft &&
(cap->cc_flags & CRYPTOCAP_F_HARDWARE) == 0))
cap = NULL;
}
} else {
/*
* No requested driver; select based on crid flags.
*/
if (!crypto_devallowsoft) /* NB: disallow s/w drivers */
crid &= ~CRYPTOCAP_F_SOFTWARE;
cap = crypto_select_kdriver(krp, crid);
}
if (cap != NULL && !cap->cc_kqblocked) {
krp->krp_hid = cap - crypto_drivers;
cap->cc_koperations++;
CRYPTO_DRIVER_UNLOCK();
error = CRYPTODEV_KPROCESS(cap->cc_dev, krp, 0);
CRYPTO_DRIVER_LOCK();
if (error == ERESTART) {
cap->cc_koperations--;
CRYPTO_DRIVER_UNLOCK();
return (error);
}
} else {
/*
* NB: cap is !NULL if device is blocked; in
* that case return ERESTART so the operation
* is resubmitted if possible.
*/
error = (cap == NULL) ? ENODEV : ERESTART;
}
CRYPTO_DRIVER_UNLOCK();
if (error) {
krp->krp_status = error;
crypto_kdone(krp);
}
return 0;
}
#ifdef CRYPTO_TIMING
static void
crypto_tstat(struct cryptotstat *ts, struct bintime *bt)
{
struct bintime now, delta;
struct timespec t;
uint64_t u;
binuptime(&now);
u = now.frac;
delta.frac = now.frac - bt->frac;
delta.sec = now.sec - bt->sec;
if (u < delta.frac)
delta.sec--;
bintime2timespec(&delta, &t);
timespecadd(&ts->acc, &t);
if (timespeccmp(&t, &ts->min, <))
ts->min = t;
if (timespeccmp(&t, &ts->max, >))
ts->max = t;
ts->count++;
*bt = now;
}
#endif
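/*
 * Taskqueue handler for requests dispatched asynchronously by
 * crypto_dispatch(): look up the driver from the session id and
 * invoke it, re-queueing the request if the driver has run out
 * of resources (ERESTART).
 */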
static void
crypto_task_invoke(void *ctx, int pending)
{
struct cryptocap *cap;
struct cryptop *crp;
int hid, result;
crp = (struct cryptop *)ctx;
hid = CRYPTO_SESID2HID(crp->crp_sid);
cap = crypto_checkdriver(hid);
result = crypto_invoke(cap, crp, 0);
if (result == ERESTART)
crypto_batch_enqueue(crp);
}
/*
* Dispatch a crypto request to the appropriate crypto devices.
*/
static int
crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint)
{
KASSERT(crp != NULL, ("%s: crp == NULL", __func__));
KASSERT(crp->crp_callback != NULL,
("%s: crp->crp_callback == NULL", __func__));
KASSERT(crp->crp_desc != NULL, ("%s: crp->crp_desc == NULL", __func__));
#ifdef CRYPTO_TIMING
if (crypto_timing)
crypto_tstat(&cryptostats.cs_invoke, &crp->crp_tstamp);
#endif
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) {
struct cryptodesc *crd;
u_int64_t nid;
/*
* Driver has unregistered; migrate the session and return
* an error to the caller so they'll resubmit the op.
*
* XXX: What if there are more already queued requests for this
* session?
*/
crypto_freesession(crp->crp_sid);
for (crd = crp->crp_desc; crd->crd_next; crd = crd->crd_next)
crd->CRD_INI.cri_next = &(crd->crd_next->CRD_INI);
/* XXX propagate flags from initial session? */
if (crypto_newsession(&nid, &(crp->crp_desc->CRD_INI),
CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0)
crp->crp_sid = nid;
crp->crp_etype = EAGAIN;
crypto_done(crp);
return 0;
} else {
/*
* Invoke the driver to process the request.
*/
return CRYPTODEV_PROCESS(cap->cc_dev, crp, hint);
}
}
/*
* Release a set of crypto descriptors.
*/
void
crypto_freereq(struct cryptop *crp)
{
struct cryptodesc *crd;
if (crp == NULL)
return;
#ifdef DIAGNOSTIC
{
struct cryptop *crp2;
struct crypto_ret_worker *ret_worker;
CRYPTO_Q_LOCK();
TAILQ_FOREACH(crp2, &crp_q, crp_next) {
KASSERT(crp2 != crp,
("Freeing cryptop from the crypto queue (%p).",
crp));
}
CRYPTO_Q_UNLOCK();
FOREACH_CRYPTO_RETW(ret_worker) {
CRYPTO_RETW_LOCK(ret_worker);
TAILQ_FOREACH(crp2, &ret_worker->crp_ret_q, crp_next) {
KASSERT(crp2 != crp,
("Freeing cryptop from the return queue (%p).",
crp));
}
CRYPTO_RETW_UNLOCK(ret_worker);
}
}
#endif
while ((crd = crp->crp_desc) != NULL) {
crp->crp_desc = crd->crd_next;
uma_zfree(cryptodesc_zone, crd);
}
uma_zfree(cryptop_zone, crp);
}
/*
* Acquire a set of crypto descriptors.
*/
struct cryptop *
crypto_getreq(int num)
{
struct cryptodesc *crd;
struct cryptop *crp;
crp = uma_zalloc(cryptop_zone, M_NOWAIT|M_ZERO);
if (crp != NULL) {
while (num--) {
crd = uma_zalloc(cryptodesc_zone, M_NOWAIT|M_ZERO);
if (crd == NULL) {
crypto_freereq(crp);
return NULL;
}
crd->crd_next = crp->crp_desc;
crp->crp_desc = crd;
}
}
return crp;
}
/*
* Invoke the callback on behalf of the driver.
*/
void
crypto_done(struct cryptop *crp)
{
KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0,
("crypto_done: op already done, flags 0x%x", crp->crp_flags));
crp->crp_flags |= CRYPTO_F_DONE;
if (crp->crp_etype != 0)
cryptostats.cs_errs++;
#ifdef CRYPTO_TIMING
if (crypto_timing)
crypto_tstat(&cryptostats.cs_done, &crp->crp_tstamp);
#endif
/*
* CBIMM means unconditionally do the callback immediately;
* CBIFSYNC means do the callback immediately only if the
* operation was done synchronously. Both are used to avoid
* doing extraneous context switches; the latter is mostly
* used with the software crypto driver.
*/
if (!CRYPTOP_ASYNC_KEEPORDER(crp) &&
((crp->crp_flags & CRYPTO_F_CBIMM) ||
((crp->crp_flags & CRYPTO_F_CBIFSYNC) &&
(CRYPTO_SESID2CAPS(crp->crp_sid) & CRYPTOCAP_F_SYNC)))) {
/*
* Do the callback directly. This is ok when the
* callback routine does very little (e.g. the
* /dev/crypto callback method just does a wakeup).
*/
#ifdef CRYPTO_TIMING
if (crypto_timing) {
/*
* NB: We must copy the timestamp before
* doing the callback as the cryptop is
* likely to be reclaimed.
*/
struct bintime t = crp->crp_tstamp;
crypto_tstat(&cryptostats.cs_cb, &t);
crp->crp_callback(crp);
crypto_tstat(&cryptostats.cs_finis, &t);
} else
#endif
crp->crp_callback(crp);
} else {
struct crypto_ret_worker *ret_worker;
bool wake;
ret_worker = CRYPTO_RETW(crp->crp_retw_id);
wake = false;
/*
* Normal case; queue the callback for the thread.
*/
CRYPTO_RETW_LOCK(ret_worker);
if (CRYPTOP_ASYNC_KEEPORDER(crp)) {
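/*
 * Insert the completed op into the ordered queue sorted by
 * sequence number (scanning from the tail, the common case),
 * and only wake the return thread when this op is the next
 * one expected (reorder_cur_seq).
 */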
struct cryptop *tmp;
TAILQ_FOREACH_REVERSE(tmp, &ret_worker->crp_ordered_ret_q,
cryptop_q, crp_next) {
if (CRYPTO_SEQ_GT(crp->crp_seq, tmp->crp_seq)) {
TAILQ_INSERT_AFTER(&ret_worker->crp_ordered_ret_q,
tmp, crp, crp_next);
break;
}
}
if (tmp == NULL) {
TAILQ_INSERT_HEAD(&ret_worker->crp_ordered_ret_q,
crp, crp_next);
}
if (crp->crp_seq == ret_worker->reorder_cur_seq)
wake = true;
}
else {
if (CRYPTO_RETW_EMPTY(ret_worker))
wake = true;
TAILQ_INSERT_TAIL(&ret_worker->crp_ret_q, crp, crp_next);
}
if (wake)
wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */
CRYPTO_RETW_UNLOCK(ret_worker);
}
}
/*
* Invoke the callback on behalf of the driver.
*/
void
crypto_kdone(struct cryptkop *krp)
{
struct crypto_ret_worker *ret_worker;
struct cryptocap *cap;
if (krp->krp_status != 0)
cryptostats.cs_kerrs++;
CRYPTO_DRIVER_LOCK();
/* XXX: What if driver is loaded in the meantime? */
if (krp->krp_hid < crypto_drivers_num) {
cap = &crypto_drivers[krp->krp_hid];
KASSERT(cap->cc_koperations > 0, ("cc_koperations == 0"));
cap->cc_koperations--;
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP)
crypto_remove(cap);
}
CRYPTO_DRIVER_UNLOCK();
ret_worker = CRYPTO_RETW(0);
CRYPTO_RETW_LOCK(ret_worker);
if (CRYPTO_RETW_EMPTY(ret_worker))
wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */
TAILQ_INSERT_TAIL(&ret_worker->crp_ret_kq, krp, krp_next);
CRYPTO_RETW_UNLOCK(ret_worker);
}
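/*
 * Report the key algorithms supported by the registered drivers as
 * a bitmask, skipping software drivers unless crypto_devallowsoft
 * permits their use.
 */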
int
crypto_getfeat(int *featp)
{
int hid, kalg, feat = 0;
CRYPTO_DRIVER_LOCK();
for (hid = 0; hid < crypto_drivers_num; hid++) {
const struct cryptocap *cap = &crypto_drivers[hid];
if ((cap->cc_flags & CRYPTOCAP_F_SOFTWARE) &&
!crypto_devallowsoft) {
continue;
}
for (kalg = 0; kalg < CRK_ALGORITHM_MAX; kalg++)
if (cap->cc_kalg[kalg] & CRYPTO_ALG_FLAG_SUPPORTED)
feat |= 1 << kalg;
}
CRYPTO_DRIVER_UNLOCK();
*featp = feat;
return (0);
}
/*
* Terminate a thread at module unload. The process that
* initiated this is waiting for us to signal that we're gone;
* wake it up and exit. We use the driver table lock to ensure
* we don't do the wakeup before they're waiting. There is no
* race here because the waiter sleeps on the proc lock for the
* thread so it gets notified at the right time because of an
* extra wakeup that's done in exit1().
*/
static void
crypto_finis(void *chan)
{
CRYPTO_DRIVER_LOCK();
wakeup_one(chan);
CRYPTO_DRIVER_UNLOCK();
kproc_exit(0);
}
/*
* Crypto thread, dispatches crypto requests.
*/
static void
crypto_proc(void)
{
struct cryptop *crp, *submit;
struct cryptkop *krp;
struct cryptocap *cap;
u_int32_t hid;
int result, hint;
#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
fpu_kern_thread(FPU_KERN_NORMAL);
#endif
CRYPTO_Q_LOCK();
for (;;) {
/*
* Find the first element in the queue that can be
* processed and look-ahead to see if multiple ops
* are ready for the same driver.
*/
submit = NULL;
hint = 0;
TAILQ_FOREACH(crp, &crp_q, crp_next) {
hid = CRYPTO_SESID2HID(crp->crp_sid);
cap = crypto_checkdriver(hid);
/*
* Driver cannot disappear while there is an active
* session.
*/
KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
__func__, __LINE__));
if (cap == NULL || cap->cc_dev == NULL) {
/* Op needs to be migrated, process it. */
if (submit == NULL)
submit = crp;
break;
}
if (!cap->cc_qblocked) {
if (submit != NULL) {
/*
* We stop on finding another op,
* regardless of whether it's for the same
* driver or not. We could keep
* searching the queue but it might be
* better to just use a per-driver
* queue instead.
*/
if (CRYPTO_SESID2HID(submit->crp_sid) == hid)
hint = CRYPTO_HINT_MORE;
break;
} else {
submit = crp;
if ((submit->crp_flags & CRYPTO_F_BATCH) == 0)
break;
/* keep scanning in case more are queued */
}
}
}
if (submit != NULL) {
TAILQ_REMOVE(&crp_q, submit, crp_next);
hid = CRYPTO_SESID2HID(submit->crp_sid);
cap = crypto_checkdriver(hid);
KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
__func__, __LINE__));
result = crypto_invoke(cap, submit, hint);
if (result == ERESTART) {
/*
* The driver ran out of resources, mark the
* driver ``blocked'' for cryptop's and put
* the request back in the queue. It would
* be best to put the request back where we got
* it but that's hard so for now we put it
* at the front. This should be ok; putting
* it at the end does not work.
*/
/* XXX validate sid again? */
crypto_drivers[CRYPTO_SESID2HID(submit->crp_sid)].cc_qblocked = 1;
TAILQ_INSERT_HEAD(&crp_q, submit, crp_next);
cryptostats.cs_blocks++;
}
}
/* As above, but for key ops */
TAILQ_FOREACH(krp, &crp_kq, krp_next) {
cap = crypto_checkdriver(krp->krp_hid);
if (cap == NULL || cap->cc_dev == NULL) {
/*
* Operation needs to be migrated, invalidate
* the assigned device so it will reselect a
* new one below. Propagate the original
* crid selection flags if supplied.
*/
krp->krp_hid = krp->krp_crid &
(CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE);
if (krp->krp_hid == 0)
krp->krp_hid =
CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE;
break;
}
if (!cap->cc_kqblocked)
break;
}
if (krp != NULL) {
TAILQ_REMOVE(&crp_kq, krp, krp_next);
result = crypto_kinvoke(krp, krp->krp_hid);
if (result == ERESTART) {
/*
* The driver ran out of resources, mark the
* driver ``blocked'' for cryptkop's and put
* the request back in the queue. It would
* be best to put the request back where we got
* it but that's hard so for now we put it
* at the front. This should be ok; putting
* it at the end does not work.
*/
/* XXX validate sid again? */
crypto_drivers[krp->krp_hid].cc_kqblocked = 1;
TAILQ_INSERT_HEAD(&crp_kq, krp, krp_next);
cryptostats.cs_kblocks++;
}
}
if (submit == NULL && krp == NULL) {
/*
* Nothing more to be processed. Sleep until we're
* woken because there are more ops to process.
* This happens either by submission or by a driver
* becoming unblocked and notifying us through
* crypto_unblock. Note that when we wakeup we
* start processing each queue again from the
* front. It's not clear that it's important to
* preserve this ordering since ops may finish
* out of order if dispatched to different devices
* and some become blocked while others do not.
*/
crp_sleep = 1;
msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0);
crp_sleep = 0;
if (cryptoproc == NULL)
break;
cryptostats.cs_intrs++;
}
}
CRYPTO_Q_UNLOCK();
crypto_finis(&crp_q);
}
/*
* Crypto return thread; runs the callbacks for processed crypto requests.
* Callbacks are done here, rather than in the crypto drivers, because
* callbacks typically are expensive and would slow interrupt handling.
*/
static void
crypto_ret_proc(struct crypto_ret_worker *ret_worker)
{
struct cryptop *crpt;
struct cryptkop *krpt;
CRYPTO_RETW_LOCK(ret_worker);
for (;;) {
/* Harvest return q's for completed ops */
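/*
 * The ordered queue is only drained in sequence: the op at its
 * head is delivered when its sequence number matches
 * reorder_cur_seq, otherwise it is left queued and the regular
 * return queue is serviced instead.
 */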
crpt = TAILQ_FIRST(&ret_worker->crp_ordered_ret_q);
if (crpt != NULL) {
if (crpt->crp_seq == ret_worker->reorder_cur_seq) {
TAILQ_REMOVE(&ret_worker->crp_ordered_ret_q, crpt, crp_next);
ret_worker->reorder_cur_seq++;
} else {
crpt = NULL;
}
}
if (crpt == NULL) {
crpt = TAILQ_FIRST(&ret_worker->crp_ret_q);
if (crpt != NULL)
TAILQ_REMOVE(&ret_worker->crp_ret_q, crpt, crp_next);
}
krpt = TAILQ_FIRST(&ret_worker->crp_ret_kq);
if (krpt != NULL)
TAILQ_REMOVE(&ret_worker->crp_ret_kq, krpt, krp_next);
if (crpt != NULL || krpt != NULL) {
CRYPTO_RETW_UNLOCK(ret_worker);
/*
* Run callbacks unlocked.
*/
if (crpt != NULL) {
#ifdef CRYPTO_TIMING
if (crypto_timing) {
/*
* NB: We must copy the timestamp before
* doing the callback as the cryptop is
* likely to be reclaimed.
*/
struct bintime t = crpt->crp_tstamp;
crypto_tstat(&cryptostats.cs_cb, &t);
crpt->crp_callback(crpt);
crypto_tstat(&cryptostats.cs_finis, &t);
} else
#endif
crpt->crp_callback(crpt);
}
if (krpt != NULL)
krpt->krp_callback(krpt);
CRYPTO_RETW_LOCK(ret_worker);
} else {
/*
* Nothing more to be processed. Sleep until we're
* woken because there are more returns to process.
*/
msleep(&ret_worker->crp_ret_q, &ret_worker->crypto_ret_mtx, PWAIT,
"crypto_ret_wait", 0);
if (ret_worker->cryptoretproc == NULL)
break;
cryptostats.cs_rets++;
}
}
CRYPTO_RETW_UNLOCK(ret_worker);
crypto_finis(&ret_worker->crp_ret_q);
}
#ifdef DDB
static void
db_show_drivers(void)
{
int hid;
db_printf("%12s %4s %4s %8s %2s %2s\n"
, "Device"
, "Ses"
, "Kops"
, "Flags"
, "QB"
, "KB"
);
for (hid = 0; hid < crypto_drivers_num; hid++) {
const struct cryptocap *cap = &crypto_drivers[hid];
if (cap->cc_dev == NULL)
continue;
db_printf("%-12s %4u %4u %08x %2u %2u\n"
, device_get_nameunit(cap->cc_dev)
, cap->cc_sessions
, cap->cc_koperations
, cap->cc_flags
, cap->cc_qblocked
, cap->cc_kqblocked
);
}
}
DB_SHOW_COMMAND(crypto, db_show_crypto)
{
struct cryptop *crp;
struct crypto_ret_worker *ret_worker;
db_show_drivers();
db_printf("\n");
db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n",
"HID", "Caps", "Ilen", "Olen", "Etype", "Flags",
"Desc", "Callback");
TAILQ_FOREACH(crp, &crp_q, crp_next) {
db_printf("%4u %08x %4u %4u %4u %04x %8p %8p\n"
, (int) CRYPTO_SESID2HID(crp->crp_sid)
, (int) CRYPTO_SESID2CAPS(crp->crp_sid)
, crp->crp_ilen, crp->crp_olen
, crp->crp_etype
, crp->crp_flags
, crp->crp_desc
, crp->crp_callback
);
}
FOREACH_CRYPTO_RETW(ret_worker) {
db_printf("\n%8s %4s %4s %4s %8s\n",
"ret_worker", "HID", "Etype", "Flags", "Callback");
if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) {
TAILQ_FOREACH(crp, &ret_worker->crp_ret_q, crp_next) {
db_printf("%8td %4u %4u %04x %8p\n"
, CRYPTO_RETW_ID(ret_worker)
, (int) CRYPTO_SESID2HID(crp->crp_sid)
, crp->crp_etype
, crp->crp_flags
, crp->crp_callback
);
}
}
}
}
DB_SHOW_COMMAND(kcrypto, db_show_kcrypto)
{
struct cryptkop *krp;
struct crypto_ret_worker *ret_worker;
db_show_drivers();
db_printf("\n");
db_printf("%4s %5s %4s %4s %8s %4s %8s\n",
"Op", "Status", "#IP", "#OP", "CRID", "HID", "Callback");
TAILQ_FOREACH(krp, &crp_kq, krp_next) {
db_printf("%4u %5u %4u %4u %08x %4u %8p\n"
, krp->krp_op
, krp->krp_status
, krp->krp_iparams, krp->krp_oparams
, krp->krp_crid, krp->krp_hid
, krp->krp_callback
);
}
ret_worker = CRYPTO_RETW(0);
if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) {
db_printf("%4s %5s %8s %4s %8s\n",
"Op", "Status", "CRID", "HID", "Callback");
TAILQ_FOREACH(krp, &ret_worker->crp_ret_kq, krp_next) {
db_printf("%4u %5u %08x %4u %8p\n"
, krp->krp_op
, krp->krp_status
, krp->krp_crid, krp->krp_hid
, krp->krp_callback
);
}
}
}
#endif
int crypto_modevent(module_t mod, int type, void *unused);
/*
* Initialization code, both for static and dynamic loading.
* Note this is not invoked with the usual MODULE_DECLARE
* mechanism but instead is listed as a dependency by the
* cryptosoft driver. This guarantees proper ordering of
* calls on module load/unload.
*/
int
crypto_modevent(module_t mod, int type, void *unused)
{
int error = EINVAL;
switch (type) {
case MOD_LOAD:
error = crypto_init();
if (error == 0 && bootverbose)
printf("crypto: <crypto core>\n");
break;
case MOD_UNLOAD:
/*XXX disallow if active sessions */
error = 0;
crypto_destroy();
return 0;
}
return error;
}
MODULE_VERSION(crypto, 1);
MODULE_DEPEND(crypto, zlib, 1, 1, 1);
Index: head/sys/opencrypto/cryptosoft.c
===================================================================
--- head/sys/opencrypto/cryptosoft.c (revision 327172)
+++ head/sys/opencrypto/cryptosoft.c (revision 327173)
@@ -1,1298 +1,1297 @@
/* $OpenBSD: cryptosoft.c,v 1.35 2002/04/26 08:43:50 deraadt Exp $ */
/*-
* The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
* Copyright (c) 2002-2006 Sam Leffler, Errno Consulting
*
* This code was written by Angelos D. Keromytis in Athens, Greece, in
* February 2000. Network Security Technologies Inc. (NSTI) kindly
* supported the development of this code.
*
* Copyright (c) 2000, 2001 Angelos D. Keromytis
* Copyright (c) 2014 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by John-Mark Gurney
* under sponsorship of the FreeBSD Foundation and
* Rubicon Communications, LLC (Netgate).
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all source code copies of any software which is or includes a copy or
* modification of this software.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/random.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/endian.h>
#include <sys/limits.h>
#include <crypto/blowfish/blowfish.h>
#include <crypto/sha1.h>
#include <opencrypto/rmd160.h>
#include <opencrypto/cast.h>
#include <opencrypto/skipjack.h>
#include <sys/md5.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/cryptosoft.h>
#include <opencrypto/xform.h>
#include <sys/kobj.h>
#include <sys/bus.h>
#include "cryptodev_if.h"
static int32_t swcr_id;
static struct swcr_data **swcr_sessions = NULL;
static u_int32_t swcr_sesnum;
/* Protects swcr_sessions pointer, not data. */
static struct rwlock swcr_sessions_lock;
u_int8_t hmac_ipad_buffer[HMAC_MAX_BLOCK_LEN];
u_int8_t hmac_opad_buffer[HMAC_MAX_BLOCK_LEN];
static int swcr_encdec(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_authcompute(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_authenc(struct cryptop *crp);
static int swcr_compdec(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_freesession(device_t dev, u_int64_t tid);
static int swcr_freesession_locked(device_t dev, u_int64_t tid);
/*
* Apply a symmetric encryption/decryption algorithm.
*/
static int
swcr_encdec(struct cryptodesc *crd, struct swcr_data *sw, caddr_t buf,
int flags)
{
unsigned char iv[EALG_MAX_BLOCK_LEN], blk[EALG_MAX_BLOCK_LEN], *idat;
unsigned char *ivp, *nivp, iv2[EALG_MAX_BLOCK_LEN];
struct enc_xform *exf;
int i, j, k, blks, ind, count, ivlen;
struct uio *uio, uiolcl;
struct iovec iovlcl[4];
struct iovec *iov;
int iovcnt, iovalloc;
int error;
error = 0;
exf = sw->sw_exf;
blks = exf->blocksize;
ivlen = exf->ivsize;
/* Check for non-padded data */
if (crd->crd_len % blks)
return EINVAL;
if (crd->crd_alg == CRYPTO_AES_ICM &&
(crd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
/* Initialize the IV */
if (crd->crd_flags & CRD_F_ENCRYPT) {
/* IV explicitly provided ? */
if (crd->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crd->crd_iv, iv, ivlen);
else
arc4rand(iv, ivlen, 0);
/* Do we need to write the IV */
if (!(crd->crd_flags & CRD_F_IV_PRESENT))
crypto_copyback(flags, buf, crd->crd_inject, ivlen, iv);
} else { /* Decryption */
/* IV explicitly provided ? */
if (crd->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crd->crd_iv, iv, ivlen);
else {
/* Get IV off buf */
crypto_copydata(flags, buf, crd->crd_inject, ivlen, iv);
}
}
if (crd->crd_flags & CRD_F_KEY_EXPLICIT) {
int error;
if (sw->sw_kschedule)
exf->zerokey(&(sw->sw_kschedule));
error = exf->setkey(&sw->sw_kschedule,
crd->crd_key, crd->crd_klen / 8);
if (error)
return (error);
}
iov = iovlcl;
iovcnt = nitems(iovlcl);
iovalloc = 0;
uio = &uiolcl;
if ((flags & CRYPTO_F_IMBUF) != 0) {
error = crypto_mbuftoiov((struct mbuf *)buf, &iov, &iovcnt,
&iovalloc);
if (error)
return (error);
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
} else if ((flags & CRYPTO_F_IOV) != 0)
uio = (struct uio *)buf;
else {
iov[0].iov_base = buf;
iov[0].iov_len = crd->crd_skip + crd->crd_len;
uio->uio_iov = iov;
uio->uio_iovcnt = 1;
}
ivp = iv;
if (exf->reinit) {
/*
* xforms that provide a reinit method perform all IV
* handling themselves.
*/
exf->reinit(sw->sw_kschedule, iv);
}
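/*
 * Walk the request block by block through the uio.  Blocks that
 * straddle an iovec boundary are bounced through the local blk[]
 * buffer; runs of whole blocks within a single iovec are
 * transformed in place.
 */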
count = crd->crd_skip;
ind = cuio_getptr(uio, count, &k);
if (ind == -1) {
error = EINVAL;
goto out;
}
i = crd->crd_len;
while (i > 0) {
/*
* If there's insufficient data at the end of
* an iovec, we have to do some copying.
*/
if (uio->uio_iov[ind].iov_len < k + blks &&
uio->uio_iov[ind].iov_len != k) {
cuio_copydata(uio, count, blks, blk);
/* Actual encryption/decryption */
if (exf->reinit) {
if (crd->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(sw->sw_kschedule,
blk);
} else {
exf->decrypt(sw->sw_kschedule,
blk);
}
} else if (crd->crd_flags & CRD_F_ENCRYPT) {
/* XOR with previous block */
for (j = 0; j < blks; j++)
blk[j] ^= ivp[j];
exf->encrypt(sw->sw_kschedule, blk);
/*
* Keep encrypted block for XOR'ing
* with next block
*/
bcopy(blk, iv, blks);
ivp = iv;
} else { /* decrypt */
/*
* Keep encrypted block for XOR'ing
* with next block
*/
nivp = (ivp == iv) ? iv2 : iv;
bcopy(blk, nivp, blks);
exf->decrypt(sw->sw_kschedule, blk);
/* XOR with previous block */
for (j = 0; j < blks; j++)
blk[j] ^= ivp[j];
ivp = nivp;
}
/* Copy back decrypted block */
cuio_copyback(uio, count, blks, blk);
count += blks;
/* Advance pointer */
ind = cuio_getptr(uio, count, &k);
if (ind == -1) {
error = EINVAL;
goto out;
}
i -= blks;
/* Could be done... */
if (i == 0)
break;
}
/*
* Warning: idat may point to garbage here, but
* we only use it in the while() loop, only if
* there are indeed enough data.
*/
idat = (char *)uio->uio_iov[ind].iov_base + k;
while (uio->uio_iov[ind].iov_len >= k + blks && i > 0) {
if (exf->reinit) {
if (crd->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(sw->sw_kschedule,
idat);
} else {
exf->decrypt(sw->sw_kschedule,
idat);
}
} else if (crd->crd_flags & CRD_F_ENCRYPT) {
/* XOR with previous block/IV */
for (j = 0; j < blks; j++)
idat[j] ^= ivp[j];
exf->encrypt(sw->sw_kschedule, idat);
ivp = idat;
} else { /* decrypt */
/*
* Keep encrypted block to be used
* in next block's processing.
*/
nivp = (ivp == iv) ? iv2 : iv;
bcopy(idat, nivp, blks);
exf->decrypt(sw->sw_kschedule, idat);
/* XOR with previous block/IV */
for (j = 0; j < blks; j++)
idat[j] ^= ivp[j];
ivp = nivp;
}
idat += blks;
count += blks;
k += blks;
i -= blks;
}
/*
* Advance to the next iov if the end of the current iov
* is aligned with the end of a cipher block.
* Note that the code is equivalent to calling:
* ind = cuio_getptr(uio, count, &k);
*/
if (i > 0 && k == uio->uio_iov[ind].iov_len) {
k = 0;
ind++;
if (ind >= uio->uio_iovcnt) {
error = EINVAL;
goto out;
}
}
}
out:
if (iovalloc)
free(iov, M_CRYPTO_DATA);
return (error);
}
static void
swcr_authprepare(struct auth_hash *axf, struct swcr_data *sw, u_char *key,
int klen)
{
int k;
klen /= 8;
switch (axf->type) {
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_NULL_HMAC:
case CRYPTO_RIPEMD160_HMAC:
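/*
 * Precompute the two HMAC contexts: sw_ictx is the hash state
 * after absorbing (key XOR ipad) padded to the block size, and
 * sw_octx the state after (key XOR opad), so per-request work
 * only hashes the data and the inner digest.
 */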
for (k = 0; k < klen; k++)
key[k] ^= HMAC_IPAD_VAL;
axf->Init(sw->sw_ictx);
axf->Update(sw->sw_ictx, key, klen);
axf->Update(sw->sw_ictx, hmac_ipad_buffer, axf->blocksize - klen);
for (k = 0; k < klen; k++)
key[k] ^= (HMAC_IPAD_VAL ^ HMAC_OPAD_VAL);
axf->Init(sw->sw_octx);
axf->Update(sw->sw_octx, key, klen);
axf->Update(sw->sw_octx, hmac_opad_buffer, axf->blocksize - klen);
for (k = 0; k < klen; k++)
key[k] ^= HMAC_OPAD_VAL;
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
{
/*
* We need a buffer that can hold an md5 and a sha1 result
* just to throw it away.
* What we do here is the initial part of:
* ALGO( key, keyfill, .. )
* adding the key to sw_ictx and abusing Final() to get the
* "keyfill" padding.
* In addition we abuse the sw_octx to save the key to have
* it to be able to append it at the end in swcr_authcompute().
*/
u_char buf[SHA1_RESULTLEN];
sw->sw_klen = klen;
bcopy(key, sw->sw_octx, klen);
axf->Init(sw->sw_ictx);
axf->Update(sw->sw_ictx, key, klen);
axf->Final(buf, sw->sw_ictx);
break;
}
default:
printf("%s: CRD_F_KEY_EXPLICIT flag given, but algorithm %d "
"doesn't use keys.\n", __func__, axf->type);
}
}
/*
* Compute keyed-hash authenticator.
*/
static int
swcr_authcompute(struct cryptodesc *crd, struct swcr_data *sw, caddr_t buf,
int flags)
{
unsigned char aalg[HASH_MAX_LEN];
struct auth_hash *axf;
union authctx ctx;
int err;
if (sw->sw_ictx == 0)
return EINVAL;
axf = sw->sw_axf;
if (crd->crd_flags & CRD_F_KEY_EXPLICIT)
swcr_authprepare(axf, sw, crd->crd_key, crd->crd_klen);
bcopy(sw->sw_ictx, &ctx, axf->ctxsize);
err = crypto_apply(flags, buf, crd->crd_skip, crd->crd_len,
(int (*)(void *, void *, unsigned int))axf->Update, (caddr_t)&ctx);
if (err)
return err;
switch (sw->sw_alg) {
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
if (sw->sw_octx == NULL)
return EINVAL;
axf->Final(aalg, &ctx);
bcopy(sw->sw_octx, &ctx, axf->ctxsize);
axf->Update(&ctx, aalg, axf->hashsize);
axf->Final(aalg, &ctx);
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
/* If we have no key saved, return error. */
if (sw->sw_octx == NULL)
return EINVAL;
/*
* Add the trailing copy of the key (see comment in
* swcr_authprepare()) after the data:
* ALGO( .., key, algofill )
* and let Final() do the proper, natural "algofill"
* padding.
*/
axf->Update(&ctx, sw->sw_octx, sw->sw_klen);
axf->Final(aalg, &ctx);
break;
case CRYPTO_NULL_HMAC:
axf->Final(aalg, &ctx);
break;
}
/* Inject the authentication data */
crypto_copyback(flags, buf, crd->crd_inject,
sw->sw_mlen == 0 ? axf->hashsize : sw->sw_mlen, aalg);
return 0;
}
CTASSERT(INT_MAX <= (1ll<<39) - 256); /* GCM: plain text < 2^39-256 */
CTASSERT(INT_MAX <= (uint64_t)-1); /* GCM: associated data <= 2^64-1 */
/*
* Apply a combined encryption-authentication transformation
*/
static int
swcr_authenc(struct cryptop *crp)
{
uint32_t blkbuf[howmany(EALG_MAX_BLOCK_LEN, sizeof(uint32_t))];
u_char *blk = (u_char *)blkbuf;
u_char aalg[AALG_MAX_RESULT_LEN];
u_char uaalg[AALG_MAX_RESULT_LEN];
u_char iv[EALG_MAX_BLOCK_LEN];
union authctx ctx;
struct cryptodesc *crd, *crda = NULL, *crde = NULL;
struct swcr_data *sw, *swa, *swe = NULL;
struct auth_hash *axf = NULL;
struct enc_xform *exf = NULL;
caddr_t buf = (caddr_t)crp->crp_buf;
uint32_t *blkp;
int aadlen, blksz, i, ivlen, len, iskip, oskip, r;
ivlen = blksz = iskip = oskip = 0;
for (crd = crp->crp_desc; crd; crd = crd->crd_next) {
for (sw = swcr_sessions[crp->crp_sid & 0xffffffff];
sw && sw->sw_alg != crd->crd_alg;
sw = sw->sw_next)
;
if (sw == NULL)
return (EINVAL);
switch (sw->sw_alg) {
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
swe = sw;
crde = crd;
exf = swe->sw_exf;
ivlen = 12;
break;
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
swa = sw;
crda = crd;
axf = swa->sw_axf;
if (swa->sw_ictx == 0)
return (EINVAL);
bcopy(swa->sw_ictx, &ctx, axf->ctxsize);
blksz = axf->blocksize;
break;
default:
return (EINVAL);
}
}
if (crde == NULL || crda == NULL)
return (EINVAL);
if (crde->crd_alg == CRYPTO_AES_NIST_GCM_16 &&
(crde->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
if (crde->crd_klen != crda->crd_klen)
return (EINVAL);
/* Initialize the IV */
if (crde->crd_flags & CRD_F_ENCRYPT) {
/* IV explicitly provided ? */
if (crde->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crde->crd_iv, iv, ivlen);
else
arc4rand(iv, ivlen, 0);
/* Do we need to write the IV */
if (!(crde->crd_flags & CRD_F_IV_PRESENT))
crypto_copyback(crp->crp_flags, buf, crde->crd_inject,
ivlen, iv);
} else { /* Decryption */
/* IV explicitly provided ? */
if (crde->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crde->crd_iv, iv, ivlen);
else {
/* Get IV off buf */
crypto_copydata(crp->crp_flags, buf, crde->crd_inject,
ivlen, iv);
}
}
/* Supply MAC with IV */
if (axf->Reinit)
axf->Reinit(&ctx, iv, ivlen);
/* Supply MAC with AAD */
aadlen = crda->crd_len;
for (i = iskip; i < crda->crd_len; i += blksz) {
len = MIN(crda->crd_len - i, blksz - oskip);
crypto_copydata(crp->crp_flags, buf, crda->crd_skip + i, len,
blk + oskip);
bzero(blk + len + oskip, blksz - len - oskip);
axf->Update(&ctx, blk, blksz);
oskip = 0; /* reset initial output offset */
}
if (exf->reinit)
exf->reinit(swe->sw_kschedule, iv);
/* Do encryption/decryption with MAC */
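/*
 * First pass: on encryption each block is encrypted and the
 * resulting ciphertext fed to the MAC; on decryption only the
 * ciphertext is MAC'ed here, and the actual decryption happens
 * in a second pass below once the tag has been verified.
 */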
for (i = 0; i < crde->crd_len; i += blksz) {
len = MIN(crde->crd_len - i, blksz);
if (len < blksz)
bzero(blk, blksz);
crypto_copydata(crp->crp_flags, buf, crde->crd_skip + i, len,
blk);
if (crde->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(swe->sw_kschedule, blk);
axf->Update(&ctx, blk, len);
crypto_copyback(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
} else {
axf->Update(&ctx, blk, len);
}
}
/* Do any required special finalization */
switch (crda->crd_alg) {
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
/* length block */
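/*
 * GHASH length block: a 64-bit big-endian AAD bit count followed
 * by a 64-bit big-endian ciphertext bit count.  Both lengths fit
 * in 32 bits here, so only the low word of each half is filled.
 */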
bzero(blk, blksz);
blkp = (uint32_t *)blk + 1;
*blkp = htobe32(aadlen * 8);
blkp = (uint32_t *)blk + 3;
*blkp = htobe32(crde->crd_len * 8);
axf->Update(&ctx, blk, blksz);
break;
}
/* Finalize MAC */
axf->Final(aalg, &ctx);
/* Validate tag */
if (!(crde->crd_flags & CRD_F_ENCRYPT)) {
crypto_copydata(crp->crp_flags, buf, crda->crd_inject,
axf->hashsize, uaalg);
r = timingsafe_bcmp(aalg, uaalg, axf->hashsize);
if (r == 0) {
/* tag matches, decrypt data */
for (i = 0; i < crde->crd_len; i += blksz) {
len = MIN(crde->crd_len - i, blksz);
if (len < blksz)
bzero(blk, blksz);
crypto_copydata(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
if (!(crde->crd_flags & CRD_F_ENCRYPT)) {
exf->decrypt(swe->sw_kschedule, blk);
}
crypto_copyback(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
}
} else
return (EBADMSG);
} else {
/* Inject the authentication data */
crypto_copyback(crp->crp_flags, buf, crda->crd_inject,
axf->hashsize, aalg);
}
return (0);
}
/*
* Apply a compression/decompression algorithm
*/
static int
swcr_compdec(struct cryptodesc *crd, struct swcr_data *sw,
caddr_t buf, int flags)
{
u_int8_t *data, *out;
struct comp_algo *cxf;
int adj;
u_int32_t result;
cxf = sw->sw_cxf;
/* The (de)compression routines need the whole buffer at once,
* so copy the data out of the request (which may be an mbuf
* chain or uio) into a contiguous temporary buffer.
*/
data = malloc(crd->crd_len, M_CRYPTO_DATA, M_NOWAIT);
if (data == NULL)
return (EINVAL);
crypto_copydata(flags, buf, crd->crd_skip, crd->crd_len, data);
if (crd->crd_flags & CRD_F_COMP)
result = cxf->compress(data, crd->crd_len, &out);
else
result = cxf->decompress(data, crd->crd_len, &out);
free(data, M_CRYPTO_DATA);
if (result == 0)
return EINVAL;
/* Copy back the (de)compressed data; m_copyback extends
* the mbuf as necessary.
*/
sw->sw_size = result;
/* Check the compressed size when doing compression */
if (crd->crd_flags & CRD_F_COMP) {
if (result >= crd->crd_len) {
/* Compression was useless, we lost time */
free(out, M_CRYPTO_DATA);
return 0;
}
}
crypto_copyback(flags, buf, crd->crd_skip, result, out);
if (result < crd->crd_len) {
adj = result - crd->crd_len;
if (flags & CRYPTO_F_IMBUF) {
adj = result - crd->crd_len;
m_adj((struct mbuf *)buf, adj);
} else if (flags & CRYPTO_F_IOV) {
struct uio *uio = (struct uio *)buf;
int ind;
adj = crd->crd_len - result;
ind = uio->uio_iovcnt - 1;
while (adj > 0 && ind >= 0) {
if (adj < uio->uio_iov[ind].iov_len) {
uio->uio_iov[ind].iov_len -= adj;
break;
}
adj -= uio->uio_iov[ind].iov_len;
uio->uio_iov[ind].iov_len = 0;
ind--;
uio->uio_iovcnt--;
}
}
}
free(out, M_CRYPTO_DATA);
return 0;
}
/*
* Generate a new software session.
*/
static int
swcr_newsession(device_t dev, u_int32_t *sid, struct cryptoini *cri)
{
struct swcr_data **swd;
struct auth_hash *axf;
struct enc_xform *txf;
struct comp_algo *cxf;
u_int32_t i;
int len;
int error;
if (sid == NULL || cri == NULL)
return EINVAL;
rw_wlock(&swcr_sessions_lock);
if (swcr_sessions) {
for (i = 1; i < swcr_sesnum; i++)
if (swcr_sessions[i] == NULL)
break;
} else
i = 1; /* NB: to silence compiler warning */
if (swcr_sessions == NULL || i == swcr_sesnum) {
if (swcr_sessions == NULL) {
i = 1; /* We leave swcr_sessions[0] empty */
swcr_sesnum = CRYPTO_SW_SESSIONS;
} else
swcr_sesnum *= 2;
swd = malloc(swcr_sesnum * sizeof(struct swcr_data *),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (swd == NULL) {
/* Reset session number */
if (swcr_sesnum == CRYPTO_SW_SESSIONS)
swcr_sesnum = 0;
else
swcr_sesnum /= 2;
rw_wunlock(&swcr_sessions_lock);
return ENOBUFS;
}
/* Copy existing sessions */
if (swcr_sessions != NULL) {
bcopy(swcr_sessions, swd,
(swcr_sesnum / 2) * sizeof(struct swcr_data *));
free(swcr_sessions, M_CRYPTO_DATA);
}
swcr_sessions = swd;
}
rw_downgrade(&swcr_sessions_lock);
swd = &swcr_sessions[i];
*sid = i;
while (cri) {
*swd = malloc(sizeof(struct swcr_data),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (*swd == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
switch (cri->cri_alg) {
case CRYPTO_DES_CBC:
txf = &enc_xform_des;
goto enccommon;
case CRYPTO_3DES_CBC:
txf = &enc_xform_3des;
goto enccommon;
case CRYPTO_BLF_CBC:
txf = &enc_xform_blf;
goto enccommon;
case CRYPTO_CAST_CBC:
txf = &enc_xform_cast5;
goto enccommon;
case CRYPTO_SKIPJACK_CBC:
txf = &enc_xform_skipjack;
goto enccommon;
case CRYPTO_RIJNDAEL128_CBC:
txf = &enc_xform_rijndael128;
goto enccommon;
case CRYPTO_AES_XTS:
txf = &enc_xform_aes_xts;
goto enccommon;
case CRYPTO_AES_ICM:
txf = &enc_xform_aes_icm;
goto enccommon;
case CRYPTO_AES_NIST_GCM_16:
txf = &enc_xform_aes_nist_gcm;
goto enccommon;
case CRYPTO_AES_NIST_GMAC:
txf = &enc_xform_aes_nist_gmac;
(*swd)->sw_exf = txf;
break;
case CRYPTO_CAMELLIA_CBC:
txf = &enc_xform_camellia;
goto enccommon;
case CRYPTO_NULL_CBC:
txf = &enc_xform_null;
goto enccommon;
enccommon:
if (cri->cri_key != NULL) {
error = txf->setkey(&((*swd)->sw_kschedule),
cri->cri_key, cri->cri_klen / 8);
if (error) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return error;
}
}
(*swd)->sw_exf = txf;
break;
case CRYPTO_MD5_HMAC:
axf = &auth_hash_hmac_md5;
goto authcommon;
case CRYPTO_SHA1_HMAC:
axf = &auth_hash_hmac_sha1;
goto authcommon;
case CRYPTO_SHA2_256_HMAC:
axf = &auth_hash_hmac_sha2_256;
goto authcommon;
case CRYPTO_SHA2_384_HMAC:
axf = &auth_hash_hmac_sha2_384;
goto authcommon;
case CRYPTO_SHA2_512_HMAC:
axf = &auth_hash_hmac_sha2_512;
goto authcommon;
case CRYPTO_NULL_HMAC:
axf = &auth_hash_null;
goto authcommon;
case CRYPTO_RIPEMD160_HMAC:
axf = &auth_hash_hmac_ripemd_160;
authcommon:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
(*swd)->sw_octx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_octx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
if (cri->cri_key != NULL) {
swcr_authprepare(axf, *swd, cri->cri_key,
cri->cri_klen);
}
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
case CRYPTO_MD5_KPDK:
axf = &auth_hash_key_md5;
goto auth2common;
case CRYPTO_SHA1_KPDK:
axf = &auth_hash_key_sha1;
auth2common:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
(*swd)->sw_octx = malloc(cri->cri_klen / 8,
M_CRYPTO_DATA, M_NOWAIT);
if ((*swd)->sw_octx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
/* Store the key so we can "append" it to the payload */
if (cri->cri_key != NULL) {
swcr_authprepare(axf, *swd, cri->cri_key,
cri->cri_klen);
}
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
#ifdef notdef
case CRYPTO_MD5:
axf = &auth_hash_md5;
goto auth3common;
case CRYPTO_SHA1:
axf = &auth_hash_sha1;
auth3common:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
axf->Init((*swd)->sw_ictx);
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
#endif
case CRYPTO_AES_128_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_128;
goto auth4common;
case CRYPTO_AES_192_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_192;
goto auth4common;
case CRYPTO_AES_256_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_256;
auth4common:
len = cri->cri_klen / 8;
if (len != 16 && len != 24 && len != 32) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return EINVAL;
}
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
axf->Init((*swd)->sw_ictx);
axf->Setkey((*swd)->sw_ictx, cri->cri_key, len);
(*swd)->sw_axf = axf;
break;
case CRYPTO_DEFLATE_COMP:
cxf = &comp_algo_deflate;
(*swd)->sw_cxf = cxf;
break;
default:
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return EINVAL;
}
(*swd)->sw_alg = cri->cri_alg;
cri = cri->cri_next;
swd = &((*swd)->sw_next);
}
rw_runlock(&swcr_sessions_lock);
return 0;
}
static int
swcr_freesession(device_t dev, u_int64_t tid)
{
int error;
rw_rlock(&swcr_sessions_lock);
error = swcr_freesession_locked(dev, tid);
rw_runlock(&swcr_sessions_lock);
return error;
}
/*
* Free a session.
*/
static int
swcr_freesession_locked(device_t dev, u_int64_t tid)
{
struct swcr_data *swd;
struct enc_xform *txf;
struct auth_hash *axf;
- struct comp_algo *cxf;
u_int32_t sid = CRYPTO_SESID2LID(tid);
if (sid > swcr_sesnum || swcr_sessions == NULL ||
swcr_sessions[sid] == NULL)
return EINVAL;
/* Silently accept and return */
if (sid == 0)
return 0;
while ((swd = swcr_sessions[sid]) != NULL) {
swcr_sessions[sid] = swd->sw_next;
switch (swd->sw_alg) {
case CRYPTO_DES_CBC:
case CRYPTO_3DES_CBC:
case CRYPTO_BLF_CBC:
case CRYPTO_CAST_CBC:
case CRYPTO_SKIPJACK_CBC:
case CRYPTO_RIJNDAEL128_CBC:
case CRYPTO_AES_XTS:
case CRYPTO_AES_ICM:
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
case CRYPTO_CAMELLIA_CBC:
case CRYPTO_NULL_CBC:
txf = swd->sw_exf;
if (swd->sw_kschedule)
txf->zerokey(&(swd->sw_kschedule));
break;
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
case CRYPTO_NULL_HMAC:
axf = swd->sw_axf;
if (swd->sw_ictx) {
bzero(swd->sw_ictx, axf->ctxsize);
free(swd->sw_ictx, M_CRYPTO_DATA);
}
if (swd->sw_octx) {
bzero(swd->sw_octx, axf->ctxsize);
free(swd->sw_octx, M_CRYPTO_DATA);
}
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
axf = swd->sw_axf;
if (swd->sw_ictx) {
bzero(swd->sw_ictx, axf->ctxsize);
free(swd->sw_ictx, M_CRYPTO_DATA);
}
if (swd->sw_octx) {
bzero(swd->sw_octx, swd->sw_klen);
free(swd->sw_octx, M_CRYPTO_DATA);
}
break;
case CRYPTO_MD5:
case CRYPTO_SHA1:
axf = swd->sw_axf;
if (swd->sw_ictx)
free(swd->sw_ictx, M_CRYPTO_DATA);
break;
case CRYPTO_DEFLATE_COMP:
- cxf = swd->sw_cxf;
+ /* Nothing to do */
break;
}
free(swd, M_CRYPTO_DATA);
}
return 0;
}
/*
* Process a software request.
*/
static int
swcr_process(device_t dev, struct cryptop *crp, int hint)
{
struct cryptodesc *crd;
struct swcr_data *sw;
u_int32_t lid;
/* Sanity check */
if (crp == NULL)
return EINVAL;
if (crp->crp_desc == NULL || crp->crp_buf == NULL) {
crp->crp_etype = EINVAL;
goto done;
}
lid = CRYPTO_SESID2LID(crp->crp_sid);
rw_rlock(&swcr_sessions_lock);
if (swcr_sessions == NULL || lid >= swcr_sesnum || lid == 0 ||
swcr_sessions[lid] == NULL) {
rw_runlock(&swcr_sessions_lock);
crp->crp_etype = ENOENT;
goto done;
}
rw_runlock(&swcr_sessions_lock);
/* Go through crypto descriptors, processing as we go */
for (crd = crp->crp_desc; crd; crd = crd->crd_next) {
/*
* Find the crypto context.
*
* XXX Note that the logic here prevents us from having
* XXX the same algorithm multiple times in a session
* XXX (or rather, we can but it won't give us the right
* XXX results). To do that, we'd need some way of differentiating
* XXX between the various instances of an algorithm (so we can
* XXX locate the correct crypto context).
*/
rw_rlock(&swcr_sessions_lock);
if (swcr_sessions == NULL) {
rw_runlock(&swcr_sessions_lock);
crp->crp_etype = ENOENT;
goto done;
}
for (sw = swcr_sessions[lid];
sw && sw->sw_alg != crd->crd_alg;
sw = sw->sw_next)
;
rw_runlock(&swcr_sessions_lock);
/* No such context ? */
if (sw == NULL) {
crp->crp_etype = EINVAL;
goto done;
}
switch (sw->sw_alg) {
case CRYPTO_DES_CBC:
case CRYPTO_3DES_CBC:
case CRYPTO_BLF_CBC:
case CRYPTO_CAST_CBC:
case CRYPTO_SKIPJACK_CBC:
case CRYPTO_RIJNDAEL128_CBC:
case CRYPTO_AES_XTS:
case CRYPTO_AES_ICM:
case CRYPTO_CAMELLIA_CBC:
if ((crp->crp_etype = swcr_encdec(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
break;
case CRYPTO_NULL_CBC:
crp->crp_etype = 0;
break;
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
case CRYPTO_NULL_HMAC:
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
case CRYPTO_MD5:
case CRYPTO_SHA1:
if ((crp->crp_etype = swcr_authcompute(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
break;
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
crp->crp_etype = swcr_authenc(crp);
goto done;
case CRYPTO_DEFLATE_COMP:
if ((crp->crp_etype = swcr_compdec(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
else
crp->crp_olen = (int)sw->sw_size;
break;
default:
/* Unknown/unsupported algorithm */
crp->crp_etype = EINVAL;
goto done;
}
}
done:
crypto_done(crp);
return 0;
}
static void
swcr_identify(driver_t *drv, device_t parent)
{
/* NB: order 10 is so we get attached after h/w devices */
if (device_find_child(parent, "cryptosoft", -1) == NULL &&
BUS_ADD_CHILD(parent, 10, "cryptosoft", 0) == 0)
panic("cryptosoft: could not attach");
}
static int
swcr_probe(device_t dev)
{
device_set_desc(dev, "software crypto");
return (BUS_PROBE_NOWILDCARD);
}
static int
swcr_attach(device_t dev)
{
rw_init(&swcr_sessions_lock, "swcr_sessions_lock");
memset(hmac_ipad_buffer, HMAC_IPAD_VAL, HMAC_MAX_BLOCK_LEN);
memset(hmac_opad_buffer, HMAC_OPAD_VAL, HMAC_MAX_BLOCK_LEN);
swcr_id = crypto_get_driverid(dev,
CRYPTOCAP_F_SOFTWARE | CRYPTOCAP_F_SYNC);
if (swcr_id < 0) {
device_printf(dev, "cannot initialize!\n");
return ENOMEM;
}
#define REGISTER(alg) \
crypto_register(swcr_id, alg, 0,0)
REGISTER(CRYPTO_DES_CBC);
REGISTER(CRYPTO_3DES_CBC);
REGISTER(CRYPTO_BLF_CBC);
REGISTER(CRYPTO_CAST_CBC);
REGISTER(CRYPTO_SKIPJACK_CBC);
REGISTER(CRYPTO_NULL_CBC);
REGISTER(CRYPTO_MD5_HMAC);
REGISTER(CRYPTO_SHA1_HMAC);
REGISTER(CRYPTO_SHA2_256_HMAC);
REGISTER(CRYPTO_SHA2_384_HMAC);
REGISTER(CRYPTO_SHA2_512_HMAC);
REGISTER(CRYPTO_RIPEMD160_HMAC);
REGISTER(CRYPTO_NULL_HMAC);
REGISTER(CRYPTO_MD5_KPDK);
REGISTER(CRYPTO_SHA1_KPDK);
REGISTER(CRYPTO_MD5);
REGISTER(CRYPTO_SHA1);
REGISTER(CRYPTO_RIJNDAEL128_CBC);
REGISTER(CRYPTO_AES_XTS);
REGISTER(CRYPTO_AES_ICM);
REGISTER(CRYPTO_AES_NIST_GCM_16);
REGISTER(CRYPTO_AES_NIST_GMAC);
REGISTER(CRYPTO_AES_128_NIST_GMAC);
REGISTER(CRYPTO_AES_192_NIST_GMAC);
REGISTER(CRYPTO_AES_256_NIST_GMAC);
REGISTER(CRYPTO_CAMELLIA_CBC);
REGISTER(CRYPTO_DEFLATE_COMP);
#undef REGISTER
return 0;
}
static int
swcr_detach(device_t dev)
{
crypto_unregister_all(swcr_id);
rw_wlock(&swcr_sessions_lock);
free(swcr_sessions, M_CRYPTO_DATA);
swcr_sessions = NULL;
rw_wunlock(&swcr_sessions_lock);
rw_destroy(&swcr_sessions_lock);
return 0;
}
static device_method_t swcr_methods[] = {
DEVMETHOD(device_identify, swcr_identify),
DEVMETHOD(device_probe, swcr_probe),
DEVMETHOD(device_attach, swcr_attach),
DEVMETHOD(device_detach, swcr_detach),
DEVMETHOD(cryptodev_newsession, swcr_newsession),
DEVMETHOD(cryptodev_freesession,swcr_freesession),
DEVMETHOD(cryptodev_process, swcr_process),
{0, 0},
};
static driver_t swcr_driver = {
"cryptosoft",
swcr_methods,
0, /* NB: no softc */
};
static devclass_t swcr_devclass;
/*
* NB: We explicitly reference the crypto module so we
* get the necessary ordering when built as a loadable
* module. This is required because we bundle the crypto
* module code together with the cryptosoft driver (otherwise
* normal module dependencies would handle things).
*/
extern int crypto_modevent(struct module *, int, void *);
/* XXX where to attach */
DRIVER_MODULE(cryptosoft, nexus, swcr_driver, swcr_devclass, crypto_modevent,0);
MODULE_VERSION(cryptosoft, 1);
MODULE_DEPEND(cryptosoft, crypto, 1, 1, 1);
Index: head/sys/rpc/clnt_dg.c
===================================================================
--- head/sys/rpc/clnt_dg.c (revision 327172)
+++ head/sys/rpc/clnt_dg.c (revision 327173)
@@ -1,1155 +1,1151 @@
/* $NetBSD: clnt_dg.c,v 1.4 2000/07/14 08:40:41 fvdl Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009, Sun Microsystems, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of Sun Microsystems, Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1986-1991 by Sun Microsystems Inc.
*/
#if defined(LIBC_SCCS) && !defined(lint)
#ident "@(#)clnt_dg.c 1.23 94/04/22 SMI"
static char sccsid[] = "@(#)clnt_dg.c 1.19 89/03/16 Copyr 1988 Sun Micro";
#endif
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Implements a connectionless client side RPC.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <net/vnet.h>
#include <rpc/rpc.h>
#include <rpc/rpc_com.h>
#ifdef _FREEFALL_CONFIG
/*
* Disable RPC exponential back-off for FreeBSD.org systems.
*/
#define RPC_MAX_BACKOFF 1 /* second */
#else
#define RPC_MAX_BACKOFF 30 /* seconds */
#endif
static bool_t time_not_ok(struct timeval *);
static enum clnt_stat clnt_dg_call(CLIENT *, struct rpc_callextra *,
rpcproc_t, struct mbuf *, struct mbuf **, struct timeval);
static void clnt_dg_geterr(CLIENT *, struct rpc_err *);
static bool_t clnt_dg_freeres(CLIENT *, xdrproc_t, void *);
static void clnt_dg_abort(CLIENT *);
static bool_t clnt_dg_control(CLIENT *, u_int, void *);
static void clnt_dg_close(CLIENT *);
static void clnt_dg_destroy(CLIENT *);
static int clnt_dg_soupcall(struct socket *so, void *arg, int waitflag);
static struct clnt_ops clnt_dg_ops = {
.cl_call = clnt_dg_call,
.cl_abort = clnt_dg_abort,
.cl_geterr = clnt_dg_geterr,
.cl_freeres = clnt_dg_freeres,
.cl_close = clnt_dg_close,
.cl_destroy = clnt_dg_destroy,
.cl_control = clnt_dg_control
};
/*
* A pending RPC request which awaits a reply. Requests which have
* received their reply will have cr_xid set to zero and cr_mrep to
* the mbuf chain of the reply.
*/
struct cu_request {
TAILQ_ENTRY(cu_request) cr_link;
CLIENT *cr_client; /* owner */
uint32_t cr_xid; /* XID of request */
struct mbuf *cr_mrep; /* reply received by upcall */
int cr_error; /* any error from upcall */
char cr_verf[MAX_AUTH_BYTES]; /* reply verf */
};
TAILQ_HEAD(cu_request_list, cu_request);
#define MCALL_MSG_SIZE 24
/*
* This structure is pointed to by the socket buffer's sb_upcallarg
* member. It is separate from the client private data to facilitate
* multiple clients sharing the same socket. The cs_lock mutex is used
* to protect all fields of this structure; the socket's receive
* buffer SOCKBUF_LOCK is used to ensure that exactly one of these
* structures is installed on the socket.
*/
struct cu_socket {
struct mtx cs_lock;
int cs_refs; /* Count of clients */
struct cu_request_list cs_pending; /* Requests awaiting replies */
int cs_upcallrefs; /* Refcnt of upcalls in progress. */
};
static void clnt_dg_upcallsdone(struct socket *, struct cu_socket *);
/*
* Private data kept per client handle
*/
struct cu_data {
int cu_threads; /* # threads in clnt_vc_call */
bool_t cu_closing; /* TRUE if we are closing */
bool_t cu_closed; /* TRUE if we are closed */
struct socket *cu_socket; /* connection socket */
bool_t cu_closeit; /* opened by library */
struct sockaddr_storage cu_raddr; /* remote address */
int cu_rlen;
struct timeval cu_wait; /* retransmit interval */
struct timeval cu_total; /* total time for the call */
struct rpc_err cu_error;
uint32_t cu_xid;
char cu_mcallc[MCALL_MSG_SIZE]; /* marshalled callmsg */
size_t cu_mcalllen;
size_t cu_sendsz; /* send size */
size_t cu_recvsz; /* recv size */
int cu_async;
int cu_connect; /* Use connect(). */
int cu_connected; /* Have done connect(). */
const char *cu_waitchan;
int cu_waitflag;
int cu_cwnd; /* congestion window */
int cu_sent; /* number of in-flight RPCs */
bool_t cu_cwnd_wait;
};
#define CWNDSCALE 256
#define MAXCWND (32 * CWNDSCALE)
/*
* Connectionless client creation returns a client handle with the given parameters.
* Default options are set, which the user can change using clnt_control().
* fd should be open and bound.
* NB: The rpch->cl_auth is initialized to null authentication.
* Caller may wish to set this to something more useful.
*
* sendsz and recvsz are the maximum allowable packet sizes that can be
* sent and received. Normally they are the same, but they can be
* changed to improve the program efficiency and buffer allocation.
* If they are 0, use the transport default.
*
* If svcaddr is NULL, returns NULL.
*/
CLIENT *
clnt_dg_create(
struct socket *so,
struct sockaddr *svcaddr, /* servers address */
rpcprog_t program, /* program number */
rpcvers_t version, /* version number */
size_t sendsz, /* buffer send size */
size_t recvsz) /* buffer recv size */
{
CLIENT *cl = NULL; /* client handle */
struct cu_data *cu = NULL; /* private data */
struct cu_socket *cs = NULL;
struct sockbuf *sb;
struct timeval now;
struct rpc_msg call_msg;
struct __rpc_sockinfo si;
XDR xdrs;
int error;
if (svcaddr == NULL) {
rpc_createerr.cf_stat = RPC_UNKNOWNADDR;
return (NULL);
}
if (!__rpc_socket2sockinfo(so, &si)) {
rpc_createerr.cf_stat = RPC_TLIERROR;
rpc_createerr.cf_error.re_errno = 0;
return (NULL);
}
/*
* Find the receive and the send size
*/
sendsz = __rpc_get_t_size(si.si_af, si.si_proto, (int)sendsz);
recvsz = __rpc_get_t_size(si.si_af, si.si_proto, (int)recvsz);
if ((sendsz == 0) || (recvsz == 0)) {
rpc_createerr.cf_stat = RPC_TLIERROR; /* XXX */
rpc_createerr.cf_error.re_errno = 0;
return (NULL);
}
cl = mem_alloc(sizeof (CLIENT));
/*
* Round up to a multiple of 4 for XDR; rounddown(x + 3, 4)
* rounds x up to the next multiple of 4.
*/
sendsz = rounddown(sendsz + 3, 4);
recvsz = rounddown(recvsz + 3, 4);
cu = mem_alloc(sizeof (*cu));
cu->cu_threads = 0;
cu->cu_closing = FALSE;
cu->cu_closed = FALSE;
(void) memcpy(&cu->cu_raddr, svcaddr, (size_t)svcaddr->sa_len);
cu->cu_rlen = svcaddr->sa_len;
/* Other values can also be set through clnt_control() */
cu->cu_wait.tv_sec = 3; /* heuristically chosen */
cu->cu_wait.tv_usec = 0;
cu->cu_total.tv_sec = -1;
cu->cu_total.tv_usec = -1;
cu->cu_sendsz = sendsz;
cu->cu_recvsz = recvsz;
cu->cu_async = FALSE;
cu->cu_connect = FALSE;
cu->cu_connected = FALSE;
cu->cu_waitchan = "rpcrecv";
cu->cu_waitflag = 0;
cu->cu_cwnd = MAXCWND / 2;
cu->cu_sent = 0;
cu->cu_cwnd_wait = FALSE;
(void) getmicrotime(&now);
cu->cu_xid = __RPC_GETXID(&now);
call_msg.rm_xid = cu->cu_xid;
call_msg.rm_call.cb_prog = program;
call_msg.rm_call.cb_vers = version;
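/*
 * Pre-marshal the static part of the call header into cu_mcallc
 * once; clnt_dg_call() copies it into each request, overwrites the
 * XID in place and then appends the procedure number and credentials.
 */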
xdrmem_create(&xdrs, cu->cu_mcallc, MCALL_MSG_SIZE, XDR_ENCODE);
if (! xdr_callhdr(&xdrs, &call_msg)) {
rpc_createerr.cf_stat = RPC_CANTENCODEARGS; /* XXX */
rpc_createerr.cf_error.re_errno = 0;
goto err2;
}
cu->cu_mcalllen = XDR_GETPOS(&xdrs);
/*
* By default, closeit is always FALSE. It is the user's responsibility
* to close the socket, or the user may use clnt_control to let
* clnt_destroy do it for them.
*/
cu->cu_closeit = FALSE;
cu->cu_socket = so;
error = soreserve(so, (u_long)sendsz, (u_long)recvsz);
if (error != 0) {
rpc_createerr.cf_stat = RPC_FAILED;
rpc_createerr.cf_error.re_errno = error;
goto err2;
}
sb = &so->so_rcv;
SOCKBUF_LOCK(&so->so_rcv);
recheck_socket:
if (sb->sb_upcall) {
if (sb->sb_upcall != clnt_dg_soupcall) {
SOCKBUF_UNLOCK(&so->so_rcv);
printf("clnt_dg_create(): socket already has an incompatible upcall\n");
goto err2;
}
cs = (struct cu_socket *) sb->sb_upcallarg;
mtx_lock(&cs->cs_lock);
cs->cs_refs++;
mtx_unlock(&cs->cs_lock);
} else {
/*
* We are the first on this socket - allocate the
* structure and install it in the socket.
*/
SOCKBUF_UNLOCK(&so->so_rcv);
cs = mem_alloc(sizeof(*cs));
SOCKBUF_LOCK(&so->so_rcv);
if (sb->sb_upcall) {
/*
* We have lost a race with some other client.
*/
mem_free(cs, sizeof(*cs));
goto recheck_socket;
}
mtx_init(&cs->cs_lock, "cs->cs_lock", NULL, MTX_DEF);
cs->cs_refs = 1;
cs->cs_upcallrefs = 0;
TAILQ_INIT(&cs->cs_pending);
soupcall_set(so, SO_RCV, clnt_dg_soupcall, cs);
}
SOCKBUF_UNLOCK(&so->so_rcv);
cl->cl_refs = 1;
cl->cl_ops = &clnt_dg_ops;
cl->cl_private = (caddr_t)(void *)cu;
cl->cl_auth = authnone_create();
cl->cl_tp = NULL;
cl->cl_netid = NULL;
return (cl);
err2:
mem_free(cl, sizeof (CLIENT));
mem_free(cu, sizeof (*cu));
return (NULL);
}
static enum clnt_stat
clnt_dg_call(
CLIENT *cl, /* client handle */
struct rpc_callextra *ext, /* call metadata */
rpcproc_t proc, /* procedure number */
struct mbuf *args, /* pointer to args */
struct mbuf **resultsp, /* pointer to results */
struct timeval utimeout) /* seconds to wait before giving up */
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct rpc_timers *rt;
AUTH *auth;
struct rpc_err *errp;
enum clnt_stat stat;
XDR xdrs;
struct rpc_msg reply_msg;
bool_t ok;
int retrans; /* number of re-transmits so far */
int nrefreshes = 2; /* number of times to refresh cred */
struct timeval *tvp;
int timeout;
int retransmit_time;
int next_sendtime, starttime, rtt, time_waited, tv = 0;
struct sockaddr *sa;
- socklen_t salen;
uint32_t xid = 0;
struct mbuf *mreq = NULL, *results;
struct cu_request *cr;
int error;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
cr = malloc(sizeof(struct cu_request), M_RPC, M_WAITOK);
mtx_lock(&cs->cs_lock);
if (cu->cu_closing || cu->cu_closed) {
mtx_unlock(&cs->cs_lock);
free(cr, M_RPC);
return (RPC_CANTSEND);
}
cu->cu_threads++;
if (ext) {
auth = ext->rc_auth;
errp = &ext->rc_err;
} else {
auth = cl->cl_auth;
errp = &cu->cu_error;
}
cr->cr_client = cl;
cr->cr_mrep = NULL;
cr->cr_error = 0;
if (cu->cu_total.tv_usec == -1) {
tvp = &utimeout; /* use supplied timeout */
} else {
tvp = &cu->cu_total; /* use default timeout */
}
if (tvp->tv_sec || tvp->tv_usec)
timeout = tvtohz(tvp);
else
timeout = 0;
if (cu->cu_connect && !cu->cu_connected) {
mtx_unlock(&cs->cs_lock);
error = soconnect(cu->cu_socket,
(struct sockaddr *)&cu->cu_raddr, curthread);
mtx_lock(&cs->cs_lock);
if (error) {
errp->re_errno = error;
errp->re_status = stat = RPC_CANTSEND;
goto out;
}
cu->cu_connected = 1;
}
- if (cu->cu_connected) {
+ if (cu->cu_connected)
sa = NULL;
- salen = 0;
- } else {
+ else
sa = (struct sockaddr *)&cu->cu_raddr;
- salen = cu->cu_rlen;
- }
time_waited = 0;
retrans = 0;
if (ext && ext->rc_timers) {
rt = ext->rc_timers;
if (!rt->rt_rtxcur)
rt->rt_rtxcur = tvtohz(&cu->cu_wait);
retransmit_time = next_sendtime = rt->rt_rtxcur;
} else {
rt = NULL;
retransmit_time = next_sendtime = tvtohz(&cu->cu_wait);
}
starttime = ticks;
call_again:
mtx_assert(&cs->cs_lock, MA_OWNED);
cu->cu_xid++;
xid = cu->cu_xid;
send_again:
mtx_unlock(&cs->cs_lock);
mreq = m_gethdr(M_WAITOK, MT_DATA);
KASSERT(cu->cu_mcalllen <= MHLEN, ("RPC header too big"));
bcopy(cu->cu_mcallc, mreq->m_data, cu->cu_mcalllen);
mreq->m_len = cu->cu_mcalllen;
/*
* The XID is the first thing in the request.
*/
*mtod(mreq, uint32_t *) = htonl(xid);
xdrmbuf_create(&xdrs, mreq, XDR_ENCODE);
if (cu->cu_async == TRUE && args == NULL)
goto get_reply;
if ((! XDR_PUTINT32(&xdrs, &proc)) ||
(! AUTH_MARSHALL(auth, xid, &xdrs,
m_copym(args, 0, M_COPYALL, M_WAITOK)))) {
errp->re_status = stat = RPC_CANTENCODEARGS;
mtx_lock(&cs->cs_lock);
goto out;
}
mreq->m_pkthdr.len = m_length(mreq, NULL);
cr->cr_xid = xid;
mtx_lock(&cs->cs_lock);
/*
* Try to get a place in the congestion window.
*/
while (cu->cu_sent >= cu->cu_cwnd) {
cu->cu_cwnd_wait = TRUE;
error = msleep(&cu->cu_cwnd_wait, &cs->cs_lock,
cu->cu_waitflag, "rpccwnd", 0);
if (error) {
errp->re_errno = error;
if (error == EINTR || error == ERESTART)
errp->re_status = stat = RPC_INTR;
else
errp->re_status = stat = RPC_CANTSEND;
goto out;
}
}
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending, cr, cr_link);
mtx_unlock(&cs->cs_lock);
/*
* sosend consumes mreq.
*/
error = sosend(cu->cu_socket, sa, NULL, mreq, NULL, 0, curthread);
mreq = NULL;
/*
* sub-optimal code appears here because we have
* some clock time to spare while the packets are in flight.
* (We assume that this is actually only executed once.)
*/
reply_msg.acpted_rply.ar_verf.oa_flavor = AUTH_NULL;
reply_msg.acpted_rply.ar_verf.oa_base = cr->cr_verf;
reply_msg.acpted_rply.ar_verf.oa_length = 0;
reply_msg.acpted_rply.ar_results.where = NULL;
reply_msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void;
mtx_lock(&cs->cs_lock);
if (error) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_errno = error;
errp->re_status = stat = RPC_CANTSEND;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
/*
* Check to see if we got an upcall while waiting for the
* lock.
*/
if (cr->cr_error) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_errno = cr->cr_error;
errp->re_status = stat = RPC_CANTRECV;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
if (cr->cr_mrep) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto got_reply;
}
/*
* Hack to provide rpc-based message passing
*/
if (timeout == 0) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_status = stat = RPC_TIMEDOUT;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
get_reply:
for (;;) {
/* Decide how long to wait. */
if (next_sendtime < timeout)
tv = next_sendtime;
else
tv = timeout;
tv -= time_waited;
if (tv > 0) {
if (cu->cu_closing || cu->cu_closed) {
error = 0;
cr->cr_error = ESHUTDOWN;
} else {
error = msleep(cr, &cs->cs_lock,
cu->cu_waitflag, cu->cu_waitchan, tv);
}
} else {
error = EWOULDBLOCK;
}
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
if (!error) {
/*
* We were woken up by the upcall. If the
* upcall had a receive error, report that,
* otherwise we have a reply.
*/
if (cr->cr_error) {
errp->re_errno = cr->cr_error;
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
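/*
 * A reply arrived: open the congestion window additively.
 * cu_cwnd is kept in CWNDSCALE fixed-point units, so the update
 * below adds roughly one CWNDSCALE per full window of replies,
 * much like TCP congestion avoidance.
 */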
cu->cu_cwnd += (CWNDSCALE * CWNDSCALE
+ cu->cu_cwnd / 2) / cu->cu_cwnd;
if (cu->cu_cwnd > MAXCWND)
cu->cu_cwnd = MAXCWND;
if (rt) {
/*
* Add one to the time since a tick
* count of N means that the actual
* time taken was somewhere between N
* and N+1.
*/
rtt = ticks - starttime + 1;
/*
* Update our estimate of the round
* trip time using roughly the
* algorithm described in RFC
* 2988. Given an RTT sample R:
*
* RTTVAR = (1-beta) * RTTVAR + beta * |SRTT-R|
* SRTT = (1-alpha) * SRTT + alpha * R
*
* where alpha = 0.125 and beta = 0.25.
*
* The initial retransmit timeout is
* SRTT + 4*RTTVAR and doubles on each
* retransmission.
*/
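/*
 * In the code below, error / 8 applies the gain alpha = 1/8 to
 * SRTT (rt_srtt) and error / 4 applies beta = 1/4 to RTTVAR
 * (rt_deviate), with all quantities kept in units of ticks.
 */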
if (rt->rt_srtt == 0) {
rt->rt_srtt = rtt;
rt->rt_deviate = rtt / 2;
} else {
int32_t error = rtt - rt->rt_srtt;
rt->rt_srtt += error / 8;
error = abs(error) - rt->rt_deviate;
rt->rt_deviate += error / 4;
}
rt->rt_rtxcur = rt->rt_srtt + 4*rt->rt_deviate;
}
break;
}
/*
* The sleep returned an error so our request is still
* on the list. If we got EWOULDBLOCK, we may want to
* re-send the request.
*/
if (error != EWOULDBLOCK) {
errp->re_errno = error;
if (error == EINTR || error == ERESTART)
errp->re_status = stat = RPC_INTR;
else
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
time_waited = ticks - starttime;
/* Check for timeout. */
if (time_waited > timeout) {
errp->re_errno = EWOULDBLOCK;
errp->re_status = stat = RPC_TIMEDOUT;
goto out;
}
/* Retransmit if necessary. */
if (time_waited >= next_sendtime) {
cu->cu_cwnd /= 2;
if (cu->cu_cwnd < CWNDSCALE)
cu->cu_cwnd = CWNDSCALE;
if (ext && ext->rc_feedback) {
mtx_unlock(&cs->cs_lock);
if (retrans == 0)
ext->rc_feedback(FEEDBACK_REXMIT1,
proc, ext->rc_feedback_arg);
else
ext->rc_feedback(FEEDBACK_REXMIT2,
proc, ext->rc_feedback_arg);
mtx_lock(&cs->cs_lock);
}
if (cu->cu_closing || cu->cu_closed) {
errp->re_errno = ESHUTDOWN;
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
retrans++;
/* Double the retransmit interval (exponential backoff), capped at RPC_MAX_BACKOFF seconds. */
if (retransmit_time < RPC_MAX_BACKOFF * hz)
retransmit_time = 2 * retransmit_time;
next_sendtime += retransmit_time;
goto send_again;
}
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending, cr, cr_link);
}
got_reply:
/*
* Now decode and validate the response. We need to drop the
* lock since xdr_replymsg may end up sleeping in malloc.
*/
mtx_unlock(&cs->cs_lock);
if (ext && ext->rc_feedback)
ext->rc_feedback(FEEDBACK_OK, proc, ext->rc_feedback_arg);
xdrmbuf_create(&xdrs, cr->cr_mrep, XDR_DECODE);
ok = xdr_replymsg(&xdrs, &reply_msg);
cr->cr_mrep = NULL;
if (ok) {
if ((reply_msg.rm_reply.rp_stat == MSG_ACCEPTED) &&
(reply_msg.acpted_rply.ar_stat == SUCCESS))
errp->re_status = stat = RPC_SUCCESS;
else
stat = _seterr_reply(&reply_msg, &(cu->cu_error));
if (errp->re_status == RPC_SUCCESS) {
results = xdrmbuf_getall(&xdrs);
if (! AUTH_VALIDATE(auth, xid,
&reply_msg.acpted_rply.ar_verf,
&results)) {
errp->re_status = stat = RPC_AUTHERROR;
errp->re_why = AUTH_INVALIDRESP;
if (retrans &&
auth->ah_cred.oa_flavor == RPCSEC_GSS) {
/*
* If we retransmitted, it's
* possible that we will
* receive a reply for one of
* the earlier transmissions
* (which will use an older
* RPCSEC_GSS sequence
* number). In this case, just
* go back and listen for a
* new reply. We could keep a
* record of all the seq
* numbers we have transmitted
* so far so that we could
* accept a reply for any of
* them here.
*/
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending,
cr, cr_link);
cr->cr_mrep = NULL;
goto get_reply;
}
} else {
*resultsp = results;
}
} /* end successful completion */
/*
* If unsuccessful AND error is an authentication error
* then refresh credentials and try again, else break
*/
else if (stat == RPC_AUTHERROR)
/* maybe our credentials need to be refreshed ... */
if (nrefreshes > 0 &&
AUTH_REFRESH(auth, &reply_msg)) {
nrefreshes--;
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
goto call_again;
}
/* end of unsuccessful completion */
} /* end of valid reply message */
else {
errp->re_status = stat = RPC_CANTDECODERES;
}
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
out:
mtx_assert(&cs->cs_lock, MA_OWNED);
if (mreq)
m_freem(mreq);
if (cr->cr_mrep)
m_freem(cr->cr_mrep);
cu->cu_threads--;
if (cu->cu_closing)
wakeup(cu);
mtx_unlock(&cs->cs_lock);
if (auth && stat != RPC_SUCCESS)
AUTH_VALIDATE(auth, xid, NULL, NULL);
free(cr, M_RPC);
return (stat);
}
static void
clnt_dg_geterr(CLIENT *cl, struct rpc_err *errp)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
*errp = cu->cu_error;
}
static bool_t
clnt_dg_freeres(CLIENT *cl, xdrproc_t xdr_res, void *res_ptr)
{
XDR xdrs;
bool_t dummy;
xdrs.x_op = XDR_FREE;
dummy = (*xdr_res)(&xdrs, res_ptr);
return (dummy);
}
/*ARGSUSED*/
static void
clnt_dg_abort(CLIENT *h)
{
}
static bool_t
clnt_dg_control(CLIENT *cl, u_int request, void *info)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct sockaddr *addr;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
mtx_lock(&cs->cs_lock);
switch (request) {
case CLSET_FD_CLOSE:
cu->cu_closeit = TRUE;
mtx_unlock(&cs->cs_lock);
return (TRUE);
case CLSET_FD_NCLOSE:
cu->cu_closeit = FALSE;
mtx_unlock(&cs->cs_lock);
return (TRUE);
}
/* for other requests which use info */
if (info == NULL) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
switch (request) {
case CLSET_TIMEOUT:
if (time_not_ok((struct timeval *)info)) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
cu->cu_total = *(struct timeval *)info;
break;
case CLGET_TIMEOUT:
*(struct timeval *)info = cu->cu_total;
break;
case CLSET_RETRY_TIMEOUT:
if (time_not_ok((struct timeval *)info)) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
cu->cu_wait = *(struct timeval *)info;
break;
case CLGET_RETRY_TIMEOUT:
*(struct timeval *)info = cu->cu_wait;
break;
case CLGET_SVC_ADDR:
/*
* Slightly different semantics to userland - we use
* sockaddr instead of netbuf.
*/
memcpy(info, &cu->cu_raddr, cu->cu_raddr.ss_len);
break;
case CLSET_SVC_ADDR: /* set to new address */
addr = (struct sockaddr *)info;
(void) memcpy(&cu->cu_raddr, addr, addr->sa_len);
break;
case CLGET_XID:
*(uint32_t *)info = cu->cu_xid;
break;
case CLSET_XID:
/* This will set the xid of the NEXT call */
/* decrement by 1 as clnt_dg_call() increments once */
cu->cu_xid = *(uint32_t *)info - 1;
break;
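/*
 * The pre-marshalled call header layout, in XDR units, is:
 * 0 xid, 1 direction, 2 rpcvers, 3 prog, 4 vers; the procedure
 * number is appended separately for each call. CLGET/CLSET_VERS
 * and CLGET/CLSET_PROG below patch the header in place at those
 * offsets.
 */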
case CLGET_VERS:
/*
* This RELIES on the information that, in the call body,
* the version number field is the fifth field from the
* beginning of the RPC header. MUST be changed if the
* call_struct is changed
*/
*(uint32_t *)info =
ntohl(*(uint32_t *)(void *)(cu->cu_mcallc +
4 * BYTES_PER_XDR_UNIT));
break;
case CLSET_VERS:
*(uint32_t *)(void *)(cu->cu_mcallc + 4 * BYTES_PER_XDR_UNIT)
= htonl(*(uint32_t *)info);
break;
case CLGET_PROG:
/*
* This RELIES on the information that, in the call body,
* the program number field is the fourth field from the
* beginning of the RPC header. MUST be changed if the
* call_struct is changed
*/
*(uint32_t *)info =
ntohl(*(uint32_t *)(void *)(cu->cu_mcallc +
3 * BYTES_PER_XDR_UNIT));
break;
case CLSET_PROG:
*(uint32_t *)(void *)(cu->cu_mcallc + 3 * BYTES_PER_XDR_UNIT)
= htonl(*(uint32_t *)info);
break;
case CLSET_ASYNC:
cu->cu_async = *(int *)info;
break;
case CLSET_CONNECT:
cu->cu_connect = *(int *)info;
break;
case CLSET_WAITCHAN:
cu->cu_waitchan = (const char *)info;
break;
case CLGET_WAITCHAN:
*(const char **) info = cu->cu_waitchan;
break;
case CLSET_INTERRUPTIBLE:
if (*(int *) info)
cu->cu_waitflag = PCATCH;
else
cu->cu_waitflag = 0;
break;
case CLGET_INTERRUPTIBLE:
if (cu->cu_waitflag)
*(int *) info = TRUE;
else
*(int *) info = FALSE;
break;
default:
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
mtx_unlock(&cs->cs_lock);
return (TRUE);
}
static void
clnt_dg_close(CLIENT *cl)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct cu_request *cr;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
mtx_lock(&cs->cs_lock);
if (cu->cu_closed) {
mtx_unlock(&cs->cs_lock);
return;
}
if (cu->cu_closing) {
while (cu->cu_closing)
msleep(cu, &cs->cs_lock, 0, "rpcclose", 0);
KASSERT(cu->cu_closed, ("client should be closed"));
mtx_unlock(&cs->cs_lock);
return;
}
/*
* Abort any pending requests and wait until everyone
* has finished with clnt_vc_call.
*/
cu->cu_closing = TRUE;
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
if (cr->cr_client == cl) {
cr->cr_xid = 0;
cr->cr_error = ESHUTDOWN;
wakeup(cr);
}
}
while (cu->cu_threads)
msleep(cu, &cs->cs_lock, 0, "rpcclose", 0);
cu->cu_closing = FALSE;
cu->cu_closed = TRUE;
mtx_unlock(&cs->cs_lock);
wakeup(cu);
}
static void
clnt_dg_destroy(CLIENT *cl)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct socket *so = NULL;
bool_t lastsocketref;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
clnt_dg_close(cl);
SOCKBUF_LOCK(&cu->cu_socket->so_rcv);
mtx_lock(&cs->cs_lock);
cs->cs_refs--;
if (cs->cs_refs == 0) {
mtx_unlock(&cs->cs_lock);
soupcall_clear(cu->cu_socket, SO_RCV);
clnt_dg_upcallsdone(cu->cu_socket, cs);
SOCKBUF_UNLOCK(&cu->cu_socket->so_rcv);
mtx_destroy(&cs->cs_lock);
mem_free(cs, sizeof(*cs));
lastsocketref = TRUE;
} else {
mtx_unlock(&cs->cs_lock);
SOCKBUF_UNLOCK(&cu->cu_socket->so_rcv);
lastsocketref = FALSE;
}
if (cu->cu_closeit && lastsocketref) {
so = cu->cu_socket;
cu->cu_socket = NULL;
}
if (so)
soclose(so);
if (cl->cl_netid && cl->cl_netid[0])
mem_free(cl->cl_netid, strlen(cl->cl_netid) +1);
if (cl->cl_tp && cl->cl_tp[0])
mem_free(cl->cl_tp, strlen(cl->cl_tp) +1);
mem_free(cu, sizeof (*cu));
mem_free(cl, sizeof (CLIENT));
}
/*
* Make sure that the time is not garbage; a value of -1 is allowed.
*/
static bool_t
time_not_ok(struct timeval *t)
{
return (t->tv_sec < -1 || t->tv_sec > 100000000 ||
t->tv_usec < -1 || t->tv_usec > 1000000);
}
int
clnt_dg_soupcall(struct socket *so, void *arg, int waitflag)
{
struct cu_socket *cs = (struct cu_socket *) arg;
struct uio uio;
struct mbuf *m;
struct mbuf *control;
struct cu_request *cr;
int error, rcvflag, foundreq;
uint32_t xid;
cs->cs_upcallrefs++;
uio.uio_resid = 1000000000;
uio.uio_td = curthread;
do {
SOCKBUF_UNLOCK(&so->so_rcv);
m = NULL;
control = NULL;
rcvflag = MSG_DONTWAIT;
error = soreceive(so, NULL, &uio, &m, &control, &rcvflag);
if (control)
m_freem(control);
SOCKBUF_LOCK(&so->so_rcv);
if (error == EWOULDBLOCK)
break;
/*
* If there was an error, wake up all pending
* requests.
*/
if (error) {
mtx_lock(&cs->cs_lock);
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
cr->cr_xid = 0;
cr->cr_error = error;
wakeup(cr);
}
mtx_unlock(&cs->cs_lock);
break;
}
/*
* The XID is in the first uint32_t of the reply.
*/
if (m->m_len < sizeof(xid) && m_length(m, NULL) < sizeof(xid)) {
/*
* Should never happen.
*/
m_freem(m);
continue;
}
m_copydata(m, 0, sizeof(xid), (char *)&xid);
xid = ntohl(xid);
/*
* Attempt to match this reply with a pending request.
*/
mtx_lock(&cs->cs_lock);
foundreq = 0;
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
if (cr->cr_xid == xid) {
/*
* This one matches. We leave the
* reply mbuf in cr->cr_mrep. Set the
* XID to zero so that we will ignore
* any duplicated replies that arrive
* before clnt_dg_call removes it from
* the queue.
*/
cr->cr_xid = 0;
cr->cr_mrep = m;
cr->cr_error = 0;
foundreq = 1;
wakeup(cr);
break;
}
}
mtx_unlock(&cs->cs_lock);
/*
* If we didn't find the matching request, just drop
it - it's probably a repeated reply.
*/
if (!foundreq)
m_freem(m);
} while (m);
cs->cs_upcallrefs--;
if (cs->cs_upcallrefs < 0)
panic("rpcdg upcall refcnt");
if (cs->cs_upcallrefs == 0)
wakeup(&cs->cs_upcallrefs);
return (SU_OK);
}
/*
* Wait for all upcalls in progress to complete.
*/
static void
clnt_dg_upcallsdone(struct socket *so, struct cu_socket *cs)
{
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
while (cs->cs_upcallrefs > 0)
(void) msleep(&cs->cs_upcallrefs, SOCKBUF_MTX(&so->so_rcv), 0,
"rpcdgup", 0);
}
Index: head/sys/security/mac/mac_syscalls.c
===================================================================
--- head/sys/security/mac/mac_syscalls.c (revision 327172)
+++ head/sys/security/mac/mac_syscalls.c (revision 327173)
@@ -1,733 +1,731 @@
/*-
* Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
* Copyright (c) 2001 Ilmar S. Habibulin
* Copyright (c) 2001-2005 Networks Associates Technology, Inc.
* Copyright (c) 2005-2006 SPARTA, Inc.
* Copyright (c) 2008 Apple Inc.
* All rights reserved.
*
* This software was developed by Robert Watson and Ilmar Habibulin for the
* TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* This software was enhanced by SPARTA ISSO under SPAWAR contract
* N66001-04-C-6019 ("SEFOS").
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mac.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/pipe.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
#include <security/mac/mac_internal.h>
#include <security/mac/mac_policy.h>
#ifdef MAC
FEATURE(security_mac, "Mandatory Access Control Framework support");
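/*
 * The __mac_get_*() system calls below share a common shape: copy in
 * the caller's struct mac, validate it, copy in the requested element
 * list, externalize the object's label into a kernel buffer and copy
 * the resulting string back out to userspace.
 */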
int
sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
char *elements, *buffer;
struct mac mac;
struct proc *tproc;
struct ucred *tcred;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
tproc = pfind(uap->pid);
if (tproc == NULL)
return (ESRCH);
tcred = NULL; /* Satisfy gcc. */
error = p_cansee(td, tproc);
if (error == 0)
tcred = crhold(tproc->p_ucred);
PROC_UNLOCK(tproc);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(tcred->cr_label, elements,
buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
int
sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
char *elements, *buffer;
struct mac mac;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(td->td_ucred->cr_label,
elements, buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
struct ucred *newcred, *oldcred;
struct label *intlabel;
struct proc *p;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_CRED))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_cred_label_alloc();
error = mac_cred_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
error = mac_cred_check_relabel(oldcred, intlabel);
if (error) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
setsugid(p);
crcopy(newcred, oldcred);
mac_cred_relabel(newcred, intlabel);
proc_set_cred(p, newcred);
PROC_UNLOCK(p);
crfree(oldcred);
mac_proc_vm_revoke(td);
out:
mac_cred_label_free(intlabel);
return (error);
}
int
sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
char *elements, *buffer;
struct label *intlabel;
struct file *fp;
struct mac mac;
struct vnode *vp;
struct pipe *pipe;
struct socket *so;
cap_rights_t rights;
- short label_type;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_GET), &fp);
if (error)
goto out;
- label_type = fp->f_type;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE)) {
error = EINVAL;
goto out_fdrop;
}
vp = fp->f_vnode;
intlabel = mac_vnode_label_alloc();
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mac_vnode_copy_label(vp->v_label, intlabel);
VOP_UNLOCK(vp, 0);
error = mac_vnode_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE)) {
error = EINVAL;
goto out_fdrop;
}
pipe = fp->f_data;
intlabel = mac_pipe_label_alloc();
PIPE_LOCK(pipe);
mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
PIPE_UNLOCK(pipe);
error = mac_pipe_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
error = EINVAL;
goto out_fdrop;
}
so = fp->f_data;
intlabel = mac_socket_label_alloc(M_WAITOK);
SOCK_LOCK(so);
mac_socket_copy_label(so->so_label, intlabel);
SOCK_UNLOCK(so);
error = mac_socket_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out_fdrop:
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
struct label *intlabel;
struct pipe *pipe;
struct socket *so;
struct file *fp;
struct mount *mp;
struct vnode *vp;
struct mac mac;
cap_rights_t rights;
char *buffer;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_SET), &fp);
if (error)
goto out;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
if (error) {
mac_vnode_label_free(intlabel);
break;
}
vp = fp->f_vnode;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
mac_vnode_label_free(intlabel);
break;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_setlabel(vp, intlabel, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_pipe_label_alloc();
error = mac_pipe_internalize_label(intlabel, buffer);
if (error == 0) {
pipe = fp->f_data;
PIPE_LOCK(pipe);
error = mac_pipe_label_set(td->td_ucred,
pipe->pipe_pair, intlabel);
PIPE_UNLOCK(pipe);
}
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_socket_label_alloc(M_WAITOK);
error = mac_socket_internalize_label(intlabel, buffer);
if (error == 0) {
so = fp->f_data;
error = mac_socket_label_set(td->td_ucred, so,
intlabel);
}
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
out_fdrop:
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
return (error);
}
int
sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
struct mac_policy_conf *mpc;
char target[MAC_MAX_POLICY_NAME];
int error;
error = copyinstr(uap->policy, target, sizeof(target), NULL);
if (error)
return (error);
error = ENOSYS;
LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
goto out;
}
}
if (!LIST_EMPTY(&mac_policy_list)) {
mac_policy_slock_sleep();
LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
break;
}
}
mac_policy_sunlock_sleep();
}
out:
return (error);
}
#else /* !MAC */
int
sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
return (ENOSYS);
}
int
sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
return (ENOSYS);
}
#endif /* !MAC */
Index: head/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- head/sys/ufs/ffs/ffs_alloc.c (revision 327172)
+++ head/sys/ufs/ffs/ffs_alloc.c (revision 327173)
@@ -1,3255 +1,3253 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_quota.h"
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <security/audit/audit.h>
#include <geom/geom.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ffs/softdep.h>
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
int size, int rsize);
static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
static ufs2_daddr_t
ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
struct vnode *, ufs2_daddr_t, long, ino_t,
struct workhead *);
static void ffs_blkfree_trim_completed(struct bio *);
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
int, int);
static ufs2_daddr_t ffs_hashalloc
(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
static void ffs_ckhash_cg(struct buf *);
/*
* Allocate a block in the filesystem.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*/
int
ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
struct inode *ip;
ufs2_daddr_t lbn, bpref;
int size, flags;
struct ucred *cred;
ufs2_daddr_t *bnp;
{
struct fs *fs;
struct ufsmount *ump;
ufs2_daddr_t bno;
u_int cg, reclaimed;
static struct timeval lastfail;
static int curfail;
int64_t delta;
#ifdef QUOTA
int error;
#endif
*bnp = 0;
ump = ITOUMP(ip);
fs = ump->um_fs;
mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, size,
fs->fs_fsmnt);
panic("ffs_alloc: bad size");
}
if (cred == NOCRED)
panic("ffs_alloc: missing credential");
#endif /* INVARIANTS */
reclaimed = 0;
retry:
#ifdef QUOTA
UFS_UNLOCK(ump);
error = chkdq(ip, btodb(size), cred, 0);
if (error)
return (error);
UFS_LOCK(ump);
#endif
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
goto nospace;
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
if (bno > 0) {
delta = btodb(size);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
nospace:
#ifdef QUOTA
UFS_UNLOCK(ump);
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(size), cred, FORCE);
UFS_LOCK(ump);
#endif
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, ip->i_number, "filesystem full");
uprintf("\n%s: write failed, filesystem is full\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*/
int
ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
struct inode *ip;
ufs2_daddr_t lbprev;
ufs2_daddr_t bprev;
ufs2_daddr_t bpref;
int osize, nsize, flags;
struct ucred *cred;
struct buf **bpp;
{
struct vnode *vp;
struct fs *fs;
struct buf *bp;
struct ufsmount *ump;
u_int cg, request, reclaimed;
int error, gbflags;
ufs2_daddr_t bno;
static struct timeval lastfail;
static int curfail;
int64_t delta;
vp = ITOV(ip);
ump = ITOUMP(ip);
fs = ump->um_fs;
bp = NULL;
gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
panic("ffs_realloccg: allocation on suspended filesystem");
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
printf(
"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
nsize, fs->fs_fsmnt);
panic("ffs_realloccg: bad size");
}
if (cred == NOCRED)
panic("ffs_realloccg: missing credential");
#endif /* INVARIANTS */
reclaimed = 0;
retry:
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
goto nospace;
}
if (bprev == 0) {
printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
fs->fs_fsmnt);
panic("ffs_realloccg: bad bprev");
}
UFS_UNLOCK(ump);
/*
* Allocate the extra space in the buffer.
*/
error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
if (error) {
brelse(bp);
return (error);
}
if (bp->b_blkno == bp->b_lblkno) {
if (lbprev >= UFS_NDADDR)
panic("ffs_realloccg: lbprev out of range");
bp->b_blkno = fsbtodb(fs, bprev);
}
#ifdef QUOTA
error = chkdq(ip, btodb(nsize - osize), cred, 0);
if (error) {
brelse(bp);
return (error);
}
#endif
/*
* Check for extension in the existing location.
*/
*bpp = NULL;
cg = dtog(fs, bprev);
UFS_LOCK(ump);
bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
if (bno) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("ffs_realloccg: bad blockno");
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
vfs_bio_bzero_buf(bp, osize, nsize - osize);
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
vfs_bio_set_valid(bp, osize, nsize - osize);
*bpp = bp;
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
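/*
* Illustrative numbers for the optimization switch below (hypothetical,
* not from this code): with fs_dsize = 1,000,000 fragments and
* fs_minfree = 8%, SPACE flips to TIME once cs_nffree falls to
* 1,000,000 * 8 / 200 = 40,000 free fragments or fewer, and TIME flips
* back to SPACE once cs_nffree reaches
* 1,000,000 * (8 - 2) / 100 = 60,000.
*/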
switch ((int)fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree <= 5 ||
fs->fs_cstotal.cs_nffree >
(off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
fs->fs_fsmnt);
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
(off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
fs->fs_fsmnt);
fs->fs_optim = FS_OPTSPACE;
break;
default:
printf("dev = %s, optim = %ld, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
ip->i_number, vp->v_type, NULL);
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
vfs_bio_bzero_buf(bp, osize, nsize - osize);
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
vfs_bio_set_valid(bp, osize, nsize - osize);
*bpp = bp;
return (0);
}
#ifdef QUOTA
UFS_UNLOCK(ump);
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
UFS_LOCK(ump);
#endif
nospace:
/*
* no space available
*/
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
UFS_UNLOCK(ump);
if (bp) {
brelse(bp);
bp = NULL;
}
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (bp)
brelse(bp);
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, ip->i_number, "filesystem full");
uprintf("\n%s: write failed, filesystem is full\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Reallocate a sequence of blocks into a contiguous sequence of blocks.
*
* The vnode and an array of buffer pointers for a range of sequential
* logical blocks to be made contiguous is given. The allocator attempts
* to find a range of sequential blocks starting as close as possible
* from the end of the allocation for the logical block immediately
* preceding the current range. If successful, the physical block numbers
* in the buffer pointers and in the inode are changed to reflect the new
* allocation. If unsuccessful, the allocation is left unchanged. The
* success in doing the reallocation is returned. Note that the error
* return is not reflected back to the user. Rather the previous block
* allocation will be used.
*/
SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem");
static int doasyncfree = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
"do not force synchronous writes when blocks are reallocated");
static int doreallocblks = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
"enable block reallocation");
static int maxclustersearch = 10;
SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
0, "max number of cylinder group to search for contigous blocks");
#ifdef DEBUG
static volatile int prtrealloc = 0;
#endif
int
ffs_reallocblks(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct ufsmount *ump;
/*
* If the underlying device can do deletes, then skip reallocating
* the blocks of this file into contiguous sequences. Devices that
* benefit from BIO_DELETE also benefit from not moving the data.
* These devices are flash and therefore work less well with this
* optimization. Also skip if reallocblks has been disabled globally.
*/
ump = ap->a_vp->v_mount->mnt_data;
if (ump->um_candelete || doreallocblks == 0)
return (ENOSPC);
/*
* We can't wait in softdep prealloc as it may fsync and recurse
* here. Instead we simply fail to reallocate blocks if this
* rare condition arises.
*/
if (DOINGSOFTDEP(ap->a_vp))
if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
return (ENOSPC);
if (ump->um_fstype == UFS1)
return (ffs_reallocblks_ufs1(ap));
return (ffs_reallocblks_ufs2(ap));
}
static int
ffs_reallocblks_ufs1(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
ufs1_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
ufs_lbn_t start_lbn, end_lbn;
ufs1_daddr_t soff, newblk, blkno;
ufs2_daddr_t pref;
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
int i, cg, len, start_lvl, end_lvl, ssize;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ITOUMP(ip);
fs = ump->um_fs;
/*
* If we are not tracking block clusters or if we have less than 4%
* free blocks left, then do not attempt to cluster. Running with
* less than 5% free block reserve is not recommended and those that
* choose to do so do not expect to have good file layout.
*/
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
return (ENOSPC);
buflist = ap->a_buflist;
len = buflist->bs_nchildren;
start_lbn = buflist->bs_children[0]->b_lblkno;
end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
for (i = 0; i < len; i++)
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 1");
for (i = 1; i < len; i++)
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
panic("ffs_reallocblks: non-logical cluster");
blkno = buflist->bs_children[0]->b_blkno;
ssize = fsbtodb(fs, fs->fs_frag);
for (i = 1; i < len - 1; i++)
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
/*
* If the cluster crosses the boundary for the first indirect
* block, leave space for the indirect block. Indirect blocks
* are initially laid out in a position after the last direct
* block. Block reallocation would usually destroy locality by
* moving the indirect block out of the way to make room for
* data blocks if we didn't compensate here. We should also do
* this for other indirect block boundaries, but it is only
* important for the first one.
*/
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
return (ENOSPC);
/*
* If the latest allocation is in a new cylinder group, assume that
* the filesystem has decided to move and do not force it back to
* the previous cylinder group.
*/
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
return (ENOSPC);
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
return (ENOSPC);
/*
* Get the starting offset and block map for the first block.
*/
if (start_lvl == 0) {
sbap = &ip->i_din1->di_db[0];
soff = start_lbn;
} else {
idp = &start_ap[start_lvl - 1];
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
brelse(sbp);
return (ENOSPC);
}
sbap = (ufs1_daddr_t *)sbp->b_data;
soff = idp->in_off;
}
/*
* If the block range spans two block maps, get the second map.
*/
ebap = NULL;
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
ssize = len;
} else {
#ifdef INVARIANTS
if (start_lvl > 0 &&
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
panic("ffs_reallocblk: start == end");
#endif
ssize = len - (idp->in_off + 1);
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
goto fail;
ebap = (ufs1_daddr_t *)ebp->b_data;
}
/*
* Find the preferred location for the cluster. If we have not
* previously failed at this endeavor, then follow our standard
* preference calculation. If we have failed at it, then pick up
* where we last ended our search.
*/
UFS_LOCK(ump);
if (ip->i_nextclustercg == -1)
pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
else
pref = cgdata(fs, ip->i_nextclustercg);
/*
* Search the block map looking for an allocation of the desired size.
* To avoid wasting too much time, we limit the number of cylinder
* groups that we will search.
*/
cg = dtog(fs, pref);
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
break;
cg += 1;
if (cg >= fs->fs_ncg)
cg = 0;
}
/*
* If we have failed in our search, record where we gave up for
* next time. Otherwise, fall back to our usual search criterion.
*/
if (newblk == 0) {
ip->i_nextclustercg = cg;
UFS_UNLOCK(ump);
goto fail;
}
ip->i_nextclustercg = -1;
/*
* We have found a new contiguous block.
*
* First we have to replace the old block pointers with the new
* block pointers in the inode and indirect blocks associated
* with the file.
*/
#ifdef DEBUG
if (prtrealloc)
printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
(uintmax_t)ip->i_number,
(intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
blkno = newblk;
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
if (i == ssize) {
bap = ebap;
soff = -i;
}
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 2");
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %d,", *bap);
#endif
if (DOINGSOFTDEP(vp)) {
if (sbap == &ip->i_din1->di_db[0] && i < ssize)
softdep_setup_allocdirect(ip, start_lbn + i,
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
buflist->bs_children[i]);
else
softdep_setup_allocindir_page(ip, start_lbn + i,
i < ssize ? sbp : ebp, soff + i, blkno,
*bap, buflist->bs_children[i]);
}
*bap++ = blkno;
}
/*
* Next we must write out the modified inode and indirect blocks.
* For strict correctness, the writes should be synchronous since
* the old block values may have been written to disk. In practice
* they are almost never written, but if we are concerned about
* strict correctness, the `doasyncfree' flag should be set to zero.
*
* The test on `doasyncfree' should be changed to test a flag
* that shows whether the associated buffers and inodes have
* been written. The flag should be set when the cluster is
* started and cleared whenever the buffer or inode is flushed.
* We can then check below to see if it is set, and do the
* synchronous write only when it has been cleared.
*/
if (sbap != &ip->i_din1->di_db[0]) {
if (doasyncfree)
bdwrite(sbp);
else
bwrite(sbp);
} else {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (!doasyncfree)
ffs_update(vp, 1);
}
if (ssize < len) {
if (doasyncfree)
bdwrite(ebp);
else
bwrite(ebp);
}
/*
* Last, free the old blocks and assign the new blocks to the buffers.
*/
#ifdef DEBUG
if (prtrealloc)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %d,", blkno);
#endif
}
#ifdef DEBUG
if (prtrealloc) {
prtrealloc--;
printf("\n");
}
#endif
return (0);
fail:
if (ssize < len)
brelse(ebp);
if (sbap != &ip->i_din1->di_db[0])
brelse(sbp);
return (ENOSPC);
}
static int
ffs_reallocblks_ufs2(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
ufs2_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
ufs_lbn_t start_lbn, end_lbn;
ufs2_daddr_t soff, newblk, blkno, pref;
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
int i, cg, len, start_lvl, end_lvl, ssize;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ITOUMP(ip);
fs = ump->um_fs;
/*
* If we are not tracking block clusters or if we have less than 4%
* free blocks left, then do not attempt to cluster. Running with
* less than 5% free block reserve is not recommended and those that
* choose to do so do not expect to have good file layout.
*/
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
return (ENOSPC);
buflist = ap->a_buflist;
len = buflist->bs_nchildren;
start_lbn = buflist->bs_children[0]->b_lblkno;
end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
for (i = 0; i < len; i++)
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 1");
for (i = 1; i < len; i++)
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
panic("ffs_reallocblks: non-logical cluster");
blkno = buflist->bs_children[0]->b_blkno;
ssize = fsbtodb(fs, fs->fs_frag);
for (i = 1; i < len - 1; i++)
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
/*
* If the cluster crosses the boundary for the first indirect
* block, do not move anything in it. Indirect blocks are
* usually initially laid out in a position between the data
* blocks. Block reallocation would usually destroy locality by
* moving the indirect block out of the way to make room for
* data blocks if we didn't compensate here. We should also do
* this for other indirect block boundaries, but it is only
* important for the first one.
*/
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
return (ENOSPC);
/*
* If the latest allocation is in a new cylinder group, assume that
* the filesystem has decided to move and do not force it back to
* the previous cylinder group.
*/
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
return (ENOSPC);
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
return (ENOSPC);
/*
* Get the starting offset and block map for the first block.
*/
if (start_lvl == 0) {
sbap = &ip->i_din2->di_db[0];
soff = start_lbn;
} else {
idp = &start_ap[start_lvl - 1];
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
brelse(sbp);
return (ENOSPC);
}
sbap = (ufs2_daddr_t *)sbp->b_data;
soff = idp->in_off;
}
/*
* If the block range spans two block maps, get the second map.
*/
ebap = NULL;
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
ssize = len;
} else {
#ifdef INVARIANTS
if (start_lvl > 0 &&
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
panic("ffs_reallocblk: start == end");
#endif
ssize = len - (idp->in_off + 1);
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
goto fail;
ebap = (ufs2_daddr_t *)ebp->b_data;
}
/*
* Find the preferred location for the cluster. If we have not
* previously failed at this endeavor, then follow our standard
* preference calculation. If we have failed at it, then pick up
* where we last ended our search.
*/
UFS_LOCK(ump);
if (ip->i_nextclustercg == -1)
pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
else
pref = cgdata(fs, ip->i_nextclustercg);
/*
* Search the block map looking for an allocation of the desired size.
* To avoid wasting too much time, we limit the number of cylinder
* groups that we will search.
*/
cg = dtog(fs, pref);
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
break;
cg += 1;
if (cg >= fs->fs_ncg)
cg = 0;
}
/*
* If we have failed in our search, record where we gave up for
* next time. Otherwise, fall back to our usual search criterion.
*/
if (newblk == 0) {
ip->i_nextclustercg = cg;
UFS_UNLOCK(ump);
goto fail;
}
ip->i_nextclustercg = -1;
/*
* We have found a new contiguous block.
*
* First we have to replace the old block pointers with the new
* block pointers in the inode and indirect blocks associated
* with the file.
*/
#ifdef DEBUG
if (prtrealloc)
printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
(intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
blkno = newblk;
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
if (i == ssize) {
bap = ebap;
soff = -i;
}
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 2");
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %jd,", (intmax_t)*bap);
#endif
if (DOINGSOFTDEP(vp)) {
if (sbap == &ip->i_din2->di_db[0] && i < ssize)
softdep_setup_allocdirect(ip, start_lbn + i,
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
buflist->bs_children[i]);
else
softdep_setup_allocindir_page(ip, start_lbn + i,
i < ssize ? sbp : ebp, soff + i, blkno,
*bap, buflist->bs_children[i]);
}
*bap++ = blkno;
}
/*
* Next we must write out the modified inode and indirect blocks.
* For strict correctness, the writes should be synchronous since
* the old block values may have been written to disk. In practice
* they are almost never written, but if we are concerned about
* strict correctness, the `doasyncfree' flag should be set to zero.
*
* The test on `doasyncfree' should be changed to test a flag
* that shows whether the associated buffers and inodes have
* been written. The flag should be set when the cluster is
* started and cleared whenever the buffer or inode is flushed.
* We can then check below to see if it is set, and do the
* synchronous write only when it has been cleared.
*/
if (sbap != &ip->i_din2->di_db[0]) {
if (doasyncfree)
bdwrite(sbp);
else
bwrite(sbp);
} else {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (!doasyncfree)
ffs_update(vp, 1);
}
if (ssize < len) {
if (doasyncfree)
bdwrite(ebp);
else
bwrite(ebp);
}
/*
* Last, free the old blocks and assign the new blocks to the buffers.
*/
#ifdef DEBUG
if (prtrealloc)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %jd,", (intmax_t)blkno);
#endif
}
#ifdef DEBUG
if (prtrealloc) {
prtrealloc--;
printf("\n");
}
#endif
return (0);
fail:
if (ssize < len)
brelse(ebp);
if (sbap != &ip->i_din2->di_db[0])
brelse(sbp);
return (ENOSPC);
}
/*
* Allocate an inode in the filesystem.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*/
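/*
* Note: inode allocation reuses ffs_hashalloc() with ffs_nodealloccg()
* as the per-cg allocator; in that case the "size" argument carries the
* file mode rather than a byte count (see the parameter comments on
* ffs_hashalloc() below).
*/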
int
ffs_valloc(pvp, mode, cred, vpp)
struct vnode *pvp;
int mode;
struct ucred *cred;
struct vnode **vpp;
{
struct inode *pip;
struct fs *fs;
struct inode *ip;
struct timespec ts;
struct ufsmount *ump;
ino_t ino, ipref;
u_int cg;
int error, error1, reclaimed;
static struct timeval lastfail;
static int curfail;
*vpp = NULL;
pip = VTOI(pvp);
ump = ITOUMP(pip);
fs = ump->um_fs;
UFS_LOCK(ump);
reclaimed = 0;
retry:
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening files.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
(allocfcn_t *)ffs_nodealloccg);
if (ino == 0)
goto noinodes;
error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
if (error) {
error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
FFSV_FORCEINSMQ);
ffs_vfree(pvp, ino, mode);
if (error1 == 0) {
ip = VTOI(*vpp);
if (ip->i_mode)
goto dup_alloc;
ip->i_flag |= IN_MODIFIED;
vput(*vpp);
}
return (error);
}
ip = VTOI(*vpp);
if (ip->i_mode) {
dup_alloc:
printf("mode = 0%o, inum = %ju, fs = %s\n",
ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
panic("ffs_valloc: dup alloc");
}
if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */
printf("free inode %s/%lu had %ld blocks\n",
fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
DIP_SET(ip, i_blocks, 0);
}
ip->i_flags = 0;
DIP_SET(ip, i_flags, 0);
/*
* Set up a new generation number for this inode.
*/
while (ip->i_gen == 0 || ++ip->i_gen == 0)
ip->i_gen = arc4random();
DIP_SET(ip, i_gen, ip->i_gen);
if (fs->fs_magic == FS_UFS2_MAGIC) {
vfs_timestamp(&ts);
ip->i_din2->di_birthtime = ts.tv_sec;
ip->i_din2->di_birthnsec = ts.tv_nsec;
}
ufs_prepare_reclaim(*vpp);
ip->i_flag = 0;
(*vpp)->v_vflag = 0;
(*vpp)->v_type = VNON;
if (fs->fs_magic == FS_UFS2_MAGIC) {
(*vpp)->v_op = &ffs_vnodeops2;
ip->i_flag |= IN_UFS2;
} else {
(*vpp)->v_op = &ffs_vnodeops1;
}
return (0);
noinodes:
if (reclaimed == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, pip->i_number, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
static ino_t
ffs_dirpref(pip)
struct inode *pip;
{
struct fs *fs;
int cg, prefcg, dirsize, cgsize;
u_int avgifree, avgbfree, avgndir, curdirsize;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
fs = ITOFS(pip);
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = arc4random() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
/*
* Count various limits used for
* optimal allocation of a directory inode.
*/
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - avgbfree / 4;
if (minbfree < 1)
minbfree = 1;
cgsize = fs->fs_fsize * fs->fs_fpg;
dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
if (dirsize < curdirsize)
dirsize = curdirsize;
if (dirsize <= 0)
maxcontigdirs = 0; /* dirsize overflowed */
else
maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
if (fs->fs_avgfpdir > 0)
maxcontigdirs = min(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
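/*
* Illustrative (hypothetical) numbers: with avgbfree = 1000 blocks of
* fs_bsize = 32768 bytes and the common newfs defaults
* fs_avgfilesize = 16384 and fs_avgfpdir = 64, dirsize is 1,048,576
* bytes and maxcontigdirs = min(1000 * 32768 / 1048576, 255) = 31, so
* roughly 31 directories may be created back to back in one cylinder
* group before the allocator moves on.
*/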
/*
* Limit the number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*
* We are trying to find a suitable cylinder group nearby
* our preferred cylinder group to place a new directory.
* We scan from our preferred cylinder group forward looking
* for a cylinder group that meets our criterion. If we get
* to the final cylinder group and do not find anything,
* we start scanning forwards from the beginning of the
* filesystem. While it might seem sensible to start scanning
* backwards or even to alternate looking forward and backward,
* this approach fails badly when the filesystem is nearly full.
* Specifically, we first search all the areas that have no space
* and finally try the one preceding that. We repeat this on
* every request and in the case of the final block end up
* searching the entire filesystem. By jumping to the front
* of the filesystem, our future forward searches always look
* in new cylinder groups and so find every possible block after
* one pass over the filesystem.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* This is a backstop when we have a deficit in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
return ((ino_t)(fs->fs_ipg * cg));
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
break;
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks and the next fs_maxbpg blocks. Each additional section
* contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. The first indirect is allocated immediately following the last
* direct block and the data blocks for the first indirect immediately
* follow it.
*
* If no blocks have been allocated in any other section, the indirect
* block(s) are allocated in the same cylinder group as its inode in an
* area reserved immediately following the inode blocks. The policy for
* the data blocks is to place them in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block or the previous block is a hole, then the information on
* the previous allocation is unavailable; here a best guess is made based
* on the logical block number being allocated.
*
* If a section is already partially allocated, the policy is to
* allocate blocks contiguously within the section if possible.
*/
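/*
* Illustrative example (assuming the usual UFS_NDADDR of 12): the first
* data block mapped by the single indirect is lbn 12; the code below
* tries to place it in the fragments immediately following the indirect
* block itself whenever that indirect was laid out in the data area of
* the inode's cylinder group.
*/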
ufs2_daddr_t
ffs_blkpref_ufs1(ip, lbn, indx, bap)
struct inode *ip;
ufs_lbn_t lbn;
int indx;
ufs1_daddr_t *bap;
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
ufs2_daddr_t pref;
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
fs = ITOFS(ip);
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
ip->i_din1->di_db[UFS_NDADDR - 1] != 0)
pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == UFS_NDADDR) {
pref = ip->i_din1->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((ip->i_mode & IFMT) == IFDIR)
return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < UFS_NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1]) + 1;
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
for (cg = 0; cg <= startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
/*
* Same as above, but for UFS2
*/
ufs2_daddr_t
ffs_blkpref_ufs2(ip, lbn, indx, bap)
struct inode *ip;
ufs_lbn_t lbn;
int indx;
ufs2_daddr_t *bap;
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
ufs2_daddr_t pref;
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
fs = ITOFS(ip);
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
ip->i_din2->di_db[UFS_NDADDR - 1] != 0)
pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == UFS_NDADDR) {
pref = ip->i_din2->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((ip->i_mode & IFMT) == IFDIR)
return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < UFS_NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1]) + 1;
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
for (cg = 0; cg <= startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*
* Must be called with the UFS lock held. Will release the lock on success
* and return with it held on failure.
*/
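/*
* Illustrative probe order (hypothetical fs_ncg = 32, preferred cg 5):
* after cg 5 itself is tried, the quadratic rehash visits cgs 6, 8, 12,
* 20 and 4 (incrementing by 1, 2, 4, 8 and 16, wrapping modulo 32), and
* the brute force pass then starts at (5 + 2) % 32 = 7 and walks the
* remaining groups.
*/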
/*VARARGS5*/
static ufs2_daddr_t
ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
struct inode *ip;
u_int cg;
ufs2_daddr_t pref;
int size; /* Search size for data blocks, mode for inodes */
int rsize; /* Real allocated size. */
allocfcn_t *allocator;
{
struct fs *fs;
ufs2_daddr_t result;
u_int i, icg = cg;
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
#ifdef INVARIANTS
if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
panic("ffs_hashalloc: allocation on suspended filesystem");
#endif
fs = ITOFS(ip);
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size, rsize);
if (result)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*/
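/*
* Illustrative example (hypothetical fs_frag = 8): a fragment run that
* starts at offset 6 within its block (bbase = 6) can grow to at most
* 2 fragments; asking for 3 makes fragnum(fs, bprev + frags - 1) wrap
* to 0, which the block boundary check below rejects.
*/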
static ufs2_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bprev;
int osize, nsize;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
int nffree;
long bno;
int frags, bbase;
int i, error;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
return (0);
frags = numfrags(fs, nsize);
bbase = fragnum(fs, bprev);
if (bbase > fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0)
goto fail;
bno = dtogd(fs, bprev);
blksfree = cg_blksfree(cgp);
for (i = numfrags(fs, osize); i < frags; i++)
if (isclr(blksfree, bno + i))
goto fail;
/*
* The current fragment can be extended:
* deduct the count for the fragment size being extended into,
* increase the count for the remaining fragment size (if any),
* and allocate the extended piece.
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
if (isclr(blksfree, bno + i))
break;
cgp->cg_frsum[i - numfrags(fs, osize)]--;
if (i != frags)
cgp->cg_frsum[i - frags]++;
for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
clrbit(blksfree, bno + i);
cgp->cg_cs.cs_nffree--;
nffree++;
}
UFS_LOCK(ump);
fs->fs_cstotal.cs_nffree -= nffree;
fs->fs_cs(fs, cg).cs_nffree -= nffree;
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
frags, numfrags(fs, osize));
bdwrite(bp);
return (bprev);
fail:
brelse(bp);
UFS_LOCK(ump);
return (0);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
static ufs2_daddr_t
ffs_alloccg(ip, cg, bpref, size, rsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int size;
int rsize;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
int i, allocsiz, error, frags;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0 ||
(cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
goto fail;
if (size == fs->fs_bsize) {
UFS_LOCK(ump);
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
blksfree = cg_blksfree(cgp);
frags = numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
UFS_LOCK(ump);
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
KASSERT(size == rsize,
("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0)
goto fail;
for (i = 0; i < frags; i++)
clrbit(blksfree, bno + i);
cgp->cg_cs.cs_nffree -= frags;
cgp->cg_frsum[allocsiz]--;
if (frags != allocsiz)
cgp->cg_frsum[allocsiz - frags]++;
UFS_LOCK(ump);
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
blkno = cgbase(fs, cg) + bno;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
bdwrite(bp);
return (blkno);
fail:
brelse(bp);
UFS_LOCK(ump);
return (0);
}
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static ufs2_daddr_t
ffs_alloccgblk(ip, bp, bpref, size)
struct inode *ip;
struct buf *bp;
ufs2_daddr_t bpref;
int size;
{
struct fs *fs;
struct cg *cgp;
struct ufsmount *ump;
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
u_int8_t *blksfree;
int i, cgbpref;
ump = ITOUMP(ip);
fs = ump->um_fs;
mtx_assert(UFS_MTX(ump), MA_OWNED);
cgp = (struct cg *)bp->b_data;
blksfree = cg_blksfree(cgp);
if (bpref == 0) {
bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
/* map bpref to correct zone in this cg */
if (bpref < cgdata(fs, cgbpref))
bpref = cgmeta(fs, cgp->cg_cgx);
else
bpref = cgdata(fs, cgp->cg_cgx);
}
/*
* if the requested block is available, use it
*/
bno = dtogd(fs, blknum(fs, bpref));
if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
goto gotit;
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
if (bno < 0)
return (0);
/* Update cg_rotor only if allocated from the data zone */
if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
cgp->cg_rotor = bno;
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, (long)blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
/*
* If the caller didn't want the whole block free the frags here.
*/
size = numfrags(fs, size);
if (size != fs->fs_frag) {
bno = dtogd(fs, blkno);
for (i = size; i < fs->fs_frag; i++)
setbit(blksfree, bno + i);
i = fs->fs_frag - size;
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
fs->fs_fmod = 1;
cgp->cg_frsum[i]++;
}
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
size, 0);
UFS_LOCK(ump);
return (blkno);
}
/*
* Determine whether a cluster can be allocated.
*
* We do not currently check for optimal rotational layout if there
* are multiple choices in the same cylinder group. Instead we just
* take the first one that we find following bpref.
*/
static ufs2_daddr_t
ffs_clusteralloc(ip, cg, bpref, len)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int len;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
int i, run, bit, map, got, error;
ufs2_daddr_t bno;
u_char *mapp;
int32_t *lp;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_maxcluster[cg] < len)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
UFS_LOCK(ump);
return (0);
}
/*
* Check to see if a cluster of the needed size (or bigger) is
* available in this cylinder group.
*/
lp = &cg_clustersum(cgp)[len];
for (i = len; i <= fs->fs_contigsumsize; i++)
if (*lp++ > 0)
break;
if (i > fs->fs_contigsumsize) {
/*
* This is the first time looking for a cluster in this
* cylinder group. Update the cluster summary information
* to reflect the true maximum sized cluster so that
* future cluster allocation requests can avoid reading
* the cylinder group map only to find no clusters.
*/
lp = &cg_clustersum(cgp)[len - 1];
for (i = len - 1; i > 0; i--)
if (*lp-- > 0)
break;
UFS_LOCK(ump);
fs->fs_maxcluster[cg] = i;
brelse(bp);
return (0);
}
/*
* Search the cluster map to find a big enough cluster.
* We take the first one that we find, even if it is larger
* than we need as we prefer to get one close to the previous
* block allocation. We do not search before the current
* preference point as we do not want to allocate a block
* that is allocated before the previous one (as we will
* then have to wait for another pass of the elevator
* algorithm before it will be read). We prefer to fail and
* be recalled to try an allocation in the next cylinder group.
*/
if (dtog(fs, bpref) != cg)
bpref = cgdata(fs, cg);
else
bpref = blknum(fs, bpref);
bpref = fragstoblks(fs, dtogd(fs, bpref));
mapp = &cg_clustersfree(cgp)[bpref / NBBY];
map = *mapp++;
bit = 1 << (bpref % NBBY);
for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
if ((map & bit) == 0) {
run = 0;
} else {
run++;
if (run == len)
break;
}
if ((got & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
if (got >= cgp->cg_nclusterblks) {
UFS_LOCK(ump);
brelse(bp);
return (0);
}
/*
* Allocate the cluster that we have found.
*/
blksfree = cg_blksfree(cgp);
for (i = 1; i <= len; i++)
if (!ffs_isblock(fs, blksfree, got - run + i))
panic("ffs_clusteralloc: map mismatch");
bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
if (dtog(fs, bno) != cg)
panic("ffs_clusteralloc: allocated out of group");
len = blkstofrags(fs, len);
UFS_LOCK(ump);
for (i = 0; i < len; i += fs->fs_frag)
if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
panic("ffs_clusteralloc: lost block");
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (bno);
}
static inline struct buf *
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
{
struct fs *fs;
fs = ITOFS(ip);
return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
gbflags));
}
/*
* Synchronous inode initialization is needed only when barrier writes do not
* work as advertised, and will impose a heavy cost on file creation in a newly
* created filesystem.
*/
static int doasyncinodeinit = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
&doasyncinodeinit, 0,
"Perform inode block initialization using asynchronous writes");
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static ufs2_daddr_t
ffs_nodealloccg(ip, cg, ipref, mode, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t ipref;
int mode;
int unused;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp, *ibp;
struct ufsmount *ump;
u_int8_t *inosused, *loc;
struct ufs2_dinode *dp2;
int error, start, len, i;
u_int32_t old_initediblk;
ump = ITOUMP(ip);
fs = ump->um_fs;
check_nifree:
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
UFS_LOCK(ump);
return (0);
}
restart:
if (cgp->cg_cs.cs_nifree == 0) {
brelse(bp);
UFS_LOCK(ump);
return (0);
}
inosused = cg_inosused(cgp);
if (ipref) {
ipref %= fs->fs_ipg;
if (isclr(inosused, ipref))
goto gotit;
}
start = cgp->cg_irotor / NBBY;
len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
loc = memcchr(&inosused[start], 0xff, len);
if (loc == NULL) {
len = start + 1;
start = 0;
loc = memcchr(&inosused[start], 0xff, len);
if (loc == NULL) {
printf("cg = %d, irotor = %ld, fs = %s\n",
cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
panic("ffs_nodealloccg: map corrupted");
/* NOTREACHED */
}
}
ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
gotit:
/*
* Check to see if we need to initialize more inodes.
*/
if (fs->fs_magic == FS_UFS2_MAGIC &&
ipref + INOPB(fs) > cgp->cg_initediblk &&
cgp->cg_initediblk < cgp->cg_niblk) {
old_initediblk = cgp->cg_initediblk;
/*
* Free the cylinder group lock before writing the
* initialized inode block. Entering babarrierwrite()
* while holding the cylinder group lock causes a lock
* order violation between that lock and snaplk.
*
* Another thread can decide to initialize the same
* inode block, but whichever thread first gets the
* cylinder group lock after writing the newly
* allocated inode block will update it and the other
* will realize that it has lost and leave the
* cylinder group unchanged.
*/
ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
brelse(bp);
if (ibp == NULL) {
/*
* The inode block buffer is already owned by
* another thread, which must initialize it.
* Wait on the buffer, with the cg buffer lock
* dropped, to let the other thread finish its
* updates, then retry.
*/
ibp = getinobuf(ip, cg, old_initediblk, 0);
brelse(ibp);
UFS_LOCK(ump);
goto check_nifree;
}
bzero(ibp->b_data, (int)fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < INOPB(fs); i++) {
while (dp2->di_gen == 0)
dp2->di_gen = arc4random();
dp2++;
}
/*
* Rather than adding a soft updates dependency to ensure
* that the new inode block is written before it is claimed
* by the cylinder group map, we just do a barrier write
* here. The barrier write will ensure that the inode block
* gets written before the updated cylinder group map can be
* written. The barrier write should only slow down bulk
* loading of newly created filesystems.
*/
if (doasyncinodeinit)
babarrierwrite(ibp);
else
bwrite(ibp);
/*
* After the inode block is written, try to update the
* cg initediblk pointer. If another thread beat us
* to it, then leave it unchanged as the other thread
* has already set it correctly.
*/
error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp);
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (error != 0)
return (error);
if (cgp->cg_initediblk == old_initediblk)
cgp->cg_initediblk += INOPB(fs);
goto restart;
}
cgp->cg_irotor = ipref;
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
setbit(inosused, ipref);
cgp->cg_cs.cs_nifree--;
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1;
if ((mode & IFMT) == IFDIR) {
cgp->cg_cs.cs_ndir++;
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
bdwrite(bp);
return ((ino_t)(cg * fs->fs_ipg + ipref));
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*/
static void
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
struct workhead *dephd;
{
struct mount *mp;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t fragno, cgbno;
int i, blk, frags, bbase, error;
u_int cg;
u_int8_t *blksfree;
struct cdev *dev;
cg = dtog(fs, bno);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
MPASS(devvp->v_mount->mnt_data == ump);
dev = ump->um_devvp->v_rdev;
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
dev = devvp->v_rdev;
ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
} else
return;
#ifdef INVARIANTS
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
size, fs->fs_fsmnt);
panic("ffs_blkfree_cg: bad size");
}
#endif
if ((u_int)bno >= fs->fs_size) {
printf("bad block %jd, ino %lu\n", (intmax_t)bno,
(u_long)inum);
ffs_fserr(fs, inum, "bad block");
return;
}
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return;
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp);
UFS_LOCK(ump);
if (size == fs->fs_bsize) {
fragno = fragstoblks(fs, cgbno);
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
if (devvp->v_type == VREG) {
UFS_UNLOCK(ump);
/* devvp is a snapshot */
brelse(bp);
return;
}
printf("dev = %s, block = %jd, fs = %s\n",
devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
panic("ffs_blkfree_cg: freeing free block");
}
ffs_setblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
} else {
bbase = cgbno - fragnum(fs, cgbno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
/*
* deallocate the fragment
*/
frags = numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isset(blksfree, cgbno + i)) {
printf("dev = %s, block = %jd, fs = %s\n",
devtoname(dev), (intmax_t)(bno + i),
fs->fs_fsmnt);
panic("ffs_blkfree_cg: freeing free frag");
}
setbit(blksfree, cgbno + i);
}
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
/*
* if a complete block has been reassembled, account for it
*/
fragno = fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
}
}
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
mp = UFSTOVFS(ump);
if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
numfrags(fs, size), dephd);
bdwrite(bp);
}
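/*
* Context handed from ffs_blkfree() to the trim taskqueue: it carries
* everything needed to perform the deferred cylinder group free once
* the BIO_DELETE (TRIM) request for the block range has completed.
*/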
struct ffs_blkfree_trim_params {
struct task task;
struct ufsmount *ump;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
struct workhead *pdephd;
struct workhead dephd;
};
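/*
* Taskqueue handler: perform the cylinder group free that was deferred
* until the TRIM completed, then drop the secondary write reference and
* the in-flight trim count taken in ffs_blkfree() and free the context.
*/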
static void
ffs_blkfree_trim_task(ctx, pending)
void *ctx;
int pending;
{
struct ffs_blkfree_trim_params *tp;
tp = ctx;
ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size,
tp->inum, tp->pdephd);
vn_finished_secondary_write(UFSTOVFS(tp->ump));
atomic_add_int(&tp->ump->um_trim_inflight, -1);
free(tp, M_TEMP);
}
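/*
* BIO_DELETE completion callback: release the bio and hand the deferred
* free off to the per-mount trim taskqueue for processing.
*/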
static void
ffs_blkfree_trim_completed(bip)
struct bio *bip;
{
struct ffs_blkfree_trim_params *tp;
tp = bip->bio_caller2;
g_destroy_bio(bip);
TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
}
void
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
enum vtype vtype;
struct workhead *dephd;
{
struct mount *mp;
struct bio *bip;
struct ffs_blkfree_trim_params *tp;
/*
* Check to see if a snapshot wants to claim the block.
* The block is passed to a snapshot only when devvp is a normal
* disk device (not itself a snapshot), it has one or more
* snapshots associated with it, and one of those snapshots
* claims the block.
*/
if (devvp->v_type == VCHR &&
(devvp->v_vflag & VV_COPYONWRITE) &&
ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
return;
}
/*
* Nothing to delay if TRIM is disabled, or the operation is
* performed on the snapshot.
*/
if (!ump->um_candelete || devvp->v_type == VREG) {
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
return;
}
/*
* Postpone the set of the free bit in the cg bitmap until the
* BIO_DELETE is completed. Otherwise, due to disk queue
* reordering, TRIM might be issued after we reuse the block
* and write some new data into it.
*/
atomic_add_int(&ump->um_trim_inflight, 1);
tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
tp->ump = ump;
tp->devvp = devvp;
tp->bno = bno;
tp->size = size;
tp->inum = inum;
if (dephd != NULL) {
LIST_INIT(&tp->dephd);
LIST_SWAP(dephd, &tp->dephd, worklist, wk_list);
tp->pdephd = &tp->dephd;
} else
tp->pdephd = NULL;
bip = g_alloc_bio();
bip->bio_cmd = BIO_DELETE;
bip->bio_offset = dbtob(fsbtodb(fs, bno));
bip->bio_done = ffs_blkfree_trim_completed;
bip->bio_length = size;
bip->bio_caller2 = tp;
mp = UFSTOVFS(ump);
vn_start_secondary_write(NULL, &mp, 0);
g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private);
}
#ifdef INVARIANTS
/*
* Verify allocation of a block or fragment. Returns true if block or
* fragment is allocated, false if it is free.
*/
static int
ffs_checkblk(ip, bno, size)
struct inode *ip;
ufs2_daddr_t bno;
long size;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t cgbno;
int i, error, frags, free;
u_int8_t *blksfree;
fs = ITOFS(ip);
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("bsize = %ld, size = %ld, fs = %s\n",
(long)fs->fs_bsize, size, fs->fs_fsmnt);
panic("ffs_checkblk: bad size");
}
if ((u_int)bno >= fs->fs_size)
panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), &bp, &cgp);
if (error)
panic("ffs_checkblk: cylinder group read failed");
blksfree = cg_blksfree(cgp);
cgbno = dtogd(fs, bno);
if (size == fs->fs_bsize) {
free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
} else {
frags = numfrags(fs, size);
for (free = 0, i = 0; i < frags; i++)
if (isset(blksfree, cgbno + i))
free++;
if (free != 0 && free != frags)
panic("ffs_checkblk: partially free fragment");
}
brelse(bp);
return (!free);
}
#endif /* INVARIANTS */
/*
* Free an inode.
*/
int
ffs_vfree(pvp, ino, mode)
struct vnode *pvp;
ino_t ino;
int mode;
{
struct ufsmount *ump;
- struct inode *ip;
if (DOINGSOFTDEP(pvp)) {
softdep_freefile(pvp, ino, mode);
return (0);
}
- ip = VTOI(pvp);
ump = VFSTOUFS(pvp->v_mount);
return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL));
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*/
int
ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ino_t ino;
int mode;
struct workhead *wkhd;
{
struct cg *cgp;
struct buf *bp;
ufs2_daddr_t cgbno;
int error;
u_int cg;
u_int8_t *inosused;
struct cdev *dev;
cg = ino_to_cg(fs, ino);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
MPASS(devvp->v_mount->mnt_data == ump);
dev = ump->um_devvp->v_rdev;
cgbno = fragstoblks(fs, cgtod(fs, cg));
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
dev = devvp->v_rdev;
cgbno = fsbtodb(fs, cgtod(fs, cg));
} else {
bp = NULL;
return (0);
}
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return (error);
inosused = cg_inosused(cgp);
ino %= fs->fs_ipg;
if (isclr(inosused, ino)) {
printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
(uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("ffs_freefile: freeing free inode");
}
clrbit(inosused, ino);
if (ino < cgp->cg_irotor)
cgp->cg_irotor = ino;
cgp->cg_cs.cs_nifree++;
UFS_LOCK(ump);
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
cgp->cg_cs.cs_ndir--;
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
softdep_setup_inofree(UFSTOVFS(ump), bp,
ino + cg * fs->fs_ipg, wkhd);
bdwrite(bp);
return (0);
}
/*
* Check to see if a file is free.
* Used to check for allocated files in snapshots.
*/
int
ffs_checkfreefile(fs, devvp, ino)
struct fs *fs;
struct vnode *devvp;
ino_t ino;
{
struct cg *cgp;
struct buf *bp;
ufs2_daddr_t cgbno;
int ret, error;
u_int cg;
u_int8_t *inosused;
cg = ino_to_cg(fs, ino);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
cgbno = fragstoblks(fs, cgtod(fs, cg));
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
cgbno = fsbtodb(fs, cgtod(fs, cg));
} else {
return (1);
}
if (ino >= fs->fs_ipg * fs->fs_ncg)
return (1);
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return (1);
inosused = cg_inosused(cgp);
ino %= fs->fs_ipg;
ret = isclr(inosused, ino);
brelse(bp);
return (ret);
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block when none
* are available.
*/
static ufs1_daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
struct fs *fs;
struct cg *cgp;
ufs2_daddr_t bpref;
int allocsiz;
{
ufs1_daddr_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
u_int8_t *blksfree;
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = cgp->cg_frotor / NBBY;
blksfree = cg_blksfree(cgp);
len = howmany(fs->fs_fpg, NBBY) - start;
loc = scanc((u_int)len, (u_char *)&blksfree[start],
fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len, (u_char *)&blksfree[0],
fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
printf("start = %d, len = %d, fs = %s\n",
start, len, fs->fs_fsmnt);
panic("ffs_alloccg: map corrupted");
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = bno;
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
blk = blkmap(fs, blksfree, bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
panic("ffs_alloccg: block not in map");
return (-1);
}
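/*
* Return the mount point statistics for the filesystem backed by the
* given device vnode.  For a snapshot (regular file) vnode, recurse on
* the device vnode of the filesystem that contains the snapshot.
*/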
static const struct statfs *
ffs_getmntstat(struct vnode *devvp)
{
if (devvp->v_type == VCHR)
return (&devvp->v_rdev->si_mountpt->mnt_stat);
return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp));
}
/*
* Fetch and verify a cylinder group.
*/
int
ffs_getcg(fs, devvp, cg, bpp, cgpp)
struct fs *fs;
struct vnode *devvp;
u_int cg;
struct buf **bpp;
struct cg **cgpp;
{
struct buf *bp;
struct cg *cgp;
const struct statfs *sfs;
int flags, error;
*bpp = NULL;
*cgpp = NULL;
flags = 0;
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
flags |= GB_CKHASH;
error = breadn_flags(devvp, devvp->v_type == VREG ?
fragstoblks(fs, cgtod(fs, cg)) : fsbtodb(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags,
ffs_ckhash_cg, &bp);
if (error != 0)
return (error);
cgp = (struct cg *)bp->b_data;
if (((fs->fs_metackhash & CK_CYLGRP) != 0 &&
(bp->b_flags & B_CKHASH) != 0 &&
cgp->cg_ckhash != bp->b_ckhash) ||
!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
sfs = ffs_getmntstat(devvp);
printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: "
"0x%x != bp: 0x%jx\n",
devvp->v_type == VCHR ? "" : "snapshot of ",
sfs->f_mntfromname, sfs->f_mntonname,
cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
bp->b_flags &= ~B_CKHASH;
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
return (EIO);
}
bp->b_flags &= ~B_CKHASH;
bp->b_xflags |= BX_BKGRDWRITE;
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
bp->b_xflags |= BX_CYLGRP;
cgp->cg_old_time = cgp->cg_time = time_second;
*bpp = bp;
*cgpp = cgp;
return (0);
}
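/*
* Compute the check hash of a cylinder group buffer.  The stored
* cg_ckhash field is zeroed while the CRC32C is computed, so that the
* result does not depend on the value currently recorded there, and is
* restored afterwards.
*/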
static void
ffs_ckhash_cg(bp)
struct buf *bp;
{
uint32_t ckhash;
struct cg *cgp;
cgp = (struct cg *)bp->b_data;
ckhash = cgp->cg_ckhash;
cgp->cg_ckhash = 0;
bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
cgp->cg_ckhash = ckhash;
}
/*
* Fserr prints the name of a filesystem with an error diagnostic.
*
* The form of the error message is:
* fs: error message
*/
void
ffs_fserr(fs, inum, cp)
struct fs *fs;
ino_t inum;
char *cp;
{
struct thread *td = curthread; /* XXX */
struct proc *p = td->td_proc;
log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
fs->fs_fsmnt, cp);
}
/*
* This function provides the capability for the fsck program to
* update an active filesystem. Fourteen operations are provided:
*
* adjrefcnt(inode, amt) - adjusts the reference count on the
* specified inode by the specified amount. Under normal
* operation the count should always go down. Decrementing
* the count to zero will cause the inode to be freed.
* adjblkcnt(inode, amt) - adjust the number of blocks used by the
* inode by the specified amount.
* adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
* adjust the superblock summary.
* freedirs(inode, count) - directory inodes [inode..inode + count - 1]
* are marked as free. Inodes should never have to be marked
* as in use.
* freefiles(inode, count) - file inodes [inode..inode + count - 1]
* are marked as free. Inodes should never have to be marked
* as in use.
* freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
* are marked as free. Blocks should never have to be marked
* as in use.
* setflags(flags, set/clear) - the fs_flags field has the specified
* flags set (second parameter +1) or cleared (second parameter -1).
* setcwd(dirinode) - set the current directory to dirinode in the
* filesystem associated with the snapshot.
* setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
* in the current directory is oldvalue then change it to newvalue.
* unlink(nameptr, oldvalue) - Verify that the inode number associated
* with nameptr in the current directory is oldvalue then unlink it.
*
* The following functions may only be used on a quiescent filesystem
* by the soft updates journal. They are not safe to be run on an active
* filesystem.
*
* setinode(inode, dip) - the specified disk inode is replaced with the
* contents pointed to by dip.
* setbufoutput(fd, flags) - output associated with the specified file
* descriptor (which must reference the character device supporting
* the filesystem) switches from using physio to running through the
* buffer cache when flags is set to 1. The descriptor reverts to
* physio for output when flags is set to zero.
*/
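/*
* Illustrative userland sketch (not part of this file) of how a tool
* such as fsck_ffs(8) might submit one of the requests above; the
* mount path "/mnt" and the inode number are placeholders, and the
* caller needs <sys/sysctl.h>, <ufs/ffs/fs.h>, <fcntl.h>, and <err.h>:
*
*	struct fsck_cmd cmd = { .version = FFS_CMD_VERSION };
*
*	cmd.handle = open("/mnt", O_RDONLY);	// any fd on the target fs
*	cmd.value = ino;			// inode whose count is adjusted
*	cmd.size = -1;				// decrement link count by one
*	if (sysctlbyname("vfs.ffs.adjrefcnt", NULL, NULL,
*	    &cmd, sizeof(cmd)) == -1)
*		err(1, "adjrefcnt");
*
* If the filesystem is mounted read-only, only the process registered
* as its fsck (um_fsckpid) is allowed to issue these requests; see the
* MNT_RDONLY check in sysctl_ffs_fsck() below.
*/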
static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of directories");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free blocks");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free frags");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free clusters");
static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of Directory Inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of File Inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of Blocks");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR,
sysctl_ffs_fsck, "Change Filesystem Flags");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR,
sysctl_ffs_fsck, "Set Current Working Directory");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR,
sysctl_ffs_fsck, "Change Value of .. Entry");
static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR,
sysctl_ffs_fsck, "Unlink a Duplicate Name");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR,
sysctl_ffs_fsck, "Update an On-Disk Inode");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR,
sysctl_ffs_fsck, "Set Buffered Writing for Descriptor");
#define DEBUG 1
#ifdef DEBUG
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
#endif /* DEBUG */
static int buffered_write(struct file *, struct uio *, struct ucred *,
int, struct thread *);
static int
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
{
struct thread *td = curthread;
struct fsck_cmd cmd;
struct ufsmount *ump;
struct vnode *vp, *dvp, *fdvp;
struct inode *ip, *dp;
struct mount *mp;
struct fs *fs;
ufs2_daddr_t blkno;
long blkcnt, blksize;
struct file *fp, *vfp;
cap_rights_t rights;
int filetype, error;
static struct fileops *origops, bufferedops;
if (req->newlen > sizeof cmd)
return (EBADRPC);
if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
return (error);
if (cmd.version != FFS_CMD_VERSION)
return (ERPCMISMATCH);
if ((error = getvnode(td, cmd.handle,
cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
return (error);
vp = fp->f_data;
if (vp->v_type != VREG && vp->v_type != VDIR) {
fdrop(fp, td);
return (EINVAL);
}
vn_start_write(vp, &mp, V_WAIT);
if (mp == NULL ||
strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
vn_finished_write(mp);
fdrop(fp, td);
return (EINVAL);
}
ump = VFSTOUFS(mp);
if ((mp->mnt_flag & MNT_RDONLY) &&
ump->um_fsckpid != td->td_proc->p_pid) {
vn_finished_write(mp);
fdrop(fp, td);
return (EROFS);
}
fs = ump->um_fs;
filetype = IFREG;
switch (oidp->oid_number) {
case FFS_SET_FLAGS:
#ifdef DEBUG
if (fsckcmds)
printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
cmd.size > 0 ? "set" : "clear");
#endif /* DEBUG */
if (cmd.size > 0)
fs->fs_flags |= (long)cmd.value;
else
fs->fs_flags &= ~(long)cmd.value;
break;
case FFS_ADJ_REFCNT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust inode %jd link count by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
ip = VTOI(vp);
ip->i_nlink += cmd.size;
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_effnlink += cmd.size;
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
if (DOINGSOFTDEP(vp))
softdep_change_linkcnt(ip);
vput(vp);
break;
case FFS_ADJ_BLKCNT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust inode %jd block count by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
ip = VTOI(vp);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
vput(vp);
break;
case FFS_DIR_FREE:
filetype = IFDIR;
/* fall through */
case FFS_FILE_FREE:
#ifdef DEBUG
if (fsckcmds) {
if (cmd.size == 1)
printf("%s: free %s inode %ju\n",
mp->mnt_stat.f_mntonname,
filetype == IFDIR ? "directory" : "file",
(uintmax_t)cmd.value);
else
printf("%s: free %s inodes %ju-%ju\n",
mp->mnt_stat.f_mntonname,
filetype == IFDIR ? "directory" : "file",
(uintmax_t)cmd.value,
(uintmax_t)(cmd.value + cmd.size - 1));
}
#endif /* DEBUG */
while (cmd.size > 0) {
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
cmd.value, filetype, NULL)))
break;
cmd.size -= 1;
cmd.value += 1;
}
break;
case FFS_BLK_FREE:
#ifdef DEBUG
if (fsckcmds) {
if (cmd.size == 1)
printf("%s: free block %jd\n",
mp->mnt_stat.f_mntonname,
(intmax_t)cmd.value);
else
printf("%s: free blocks %jd-%jd\n",
mp->mnt_stat.f_mntonname,
(intmax_t)cmd.value,
(intmax_t)cmd.value + cmd.size - 1);
}
#endif /* DEBUG */
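/*
* Free the requested range of fragments in chunks: the first chunk is
* clamped so that it ends on a filesystem block boundary, and later
* chunks cover a full block each (the final one may be shorter).
*/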
blkno = cmd.value;
blkcnt = cmd.size;
blksize = fs->fs_frag - (blkno % fs->fs_frag);
while (blkcnt > 0) {
if (blksize > blkcnt)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
}
break;
/*
* Adjust superblock summaries. fsck(8) is expected to
* submit deltas when necessary.
*/
case FFS_ADJ_NDIR:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of directories by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_ndir += cmd.value;
break;
case FFS_ADJ_NBFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free blocks by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nbfree += cmd.value;
break;
case FFS_ADJ_NIFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free inodes by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nifree += cmd.value;
break;
case FFS_ADJ_NFFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free frags by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nffree += cmd.value;
break;
case FFS_ADJ_NUMCLUSTERS:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free clusters by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_numclusters += cmd.value;
break;
case FFS_SET_CWD:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: set current directory to inode %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
break;
AUDIT_ARG_VNODE1(vp);
if ((error = change_dir(vp, td)) != 0) {
vput(vp);
break;
}
VOP_UNLOCK(vp, 0);
pwd_chdir(td, vp);
break;
case FFS_SET_DOTDOT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: change .. in cwd from %jd to %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
/*
* First we have to get and lock the parent directory
* to which ".." points.
*/
error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
if (error)
break;
/*
* Now we get and lock the child directory containing "..".
*/
FILEDESC_SLOCK(td->td_proc->p_fd);
dvp = td->td_proc->p_fd->fd_cdir;
FILEDESC_SUNLOCK(td->td_proc->p_fd);
if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
vput(fdvp);
break;
}
dp = VTOI(dvp);
dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */
error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
DT_DIR, 0);
cache_purge(fdvp);
cache_purge(dvp);
vput(dvp);
vput(fdvp);
break;
case FFS_UNLINK:
#ifdef DEBUG
if (fsckcmds) {
char buf[32];
if (copyinstr((char *)(intptr_t)cmd.value, buf, 32, NULL))
strncpy(buf, "Name_too_long", 32);
printf("%s: unlink %s (inode %jd)\n",
mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
}
#endif /* DEBUG */
/*
* kern_unlinkat will do its own start/finish writes and
* they do not nest, so drop ours here. Setting mp == NULL
* indicates that vn_finished_write is not needed down below.
*/
vn_finished_write(mp);
mp = NULL;
error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value,
UIO_USERSPACE, (ino_t)cmd.size);
break;
case FFS_SET_INODE:
if (ump->um_fsckpid != td->td_proc->p_pid) {
error = EPERM;
break;
}
#ifdef DEBUG
if (fsckcmds) {
printf("%s: update inode %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
AUDIT_ARG_VNODE1(vp);
ip = VTOI(vp);
if (I_IS_UFS1(ip))
error = copyin((void *)(intptr_t)cmd.size, ip->i_din1,
sizeof(struct ufs1_dinode));
else
error = copyin((void *)(intptr_t)cmd.size, ip->i_din2,
sizeof(struct ufs2_dinode));
if (error) {
vput(vp);
break;
}
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
vput(vp);
break;
case FFS_SET_BUFOUTPUT:
if (ump->um_fsckpid != td->td_proc->p_pid) {
error = EPERM;
break;
}
if (ITOUMP(VTOI(vp)) != ump) {
error = EINVAL;
break;
}
#ifdef DEBUG
if (fsckcmds) {
printf("%s: %s buffered output for descriptor %jd\n",
mp->mnt_stat.f_mntonname,
cmd.size == 1 ? "enable" : "disable",
(intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = getvnode(td, cmd.value,
cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0)
break;
if (vfp->f_vnode->v_type != VCHR) {
fdrop(vfp, td);
error = EINVAL;
break;
}
if (origops == NULL) {
origops = vfp->f_ops;
bcopy((void *)origops, (void *)&bufferedops,
sizeof(bufferedops));
bufferedops.fo_write = buffered_write;
}
if (cmd.size == 1)
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
(uintptr_t)&bufferedops);
else
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
(uintptr_t)origops);
fdrop(vfp, td);
break;
default:
#ifdef DEBUG
if (fsckcmds) {
printf("Invalid request %d from fsck\n",
oidp->oid_number);
}
#endif /* DEBUG */
error = EINVAL;
break;
}
fdrop(fp, td);
vn_finished_write(mp);
return (error);
}
/*
* Function to switch a descriptor to use the buffer cache to stage
* its I/O. This is needed so that writes to the filesystem device
* will give snapshots a chance to copy the modified blocks that
* they need to retain.
*/
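/*
* Hypothetical caller-side sketch (not part of this file): the process
* registered as the filesystem's fsck opens the backing character
* device, makes its current directory live on that filesystem (so that
* buffered_write() below can find it), and then enables buffering for
* the device descriptor.  Paths and sizes here are placeholders:
*
*	devfd = open("/dev/da0p2", O_RDWR);	// device backing the fs
*	dirfd = open("/mnt", O_RDONLY);		// any file or dir on the fs
*	chdir("/mnt");
*	cmd.version = FFS_CMD_VERSION;
*	cmd.handle = dirfd;
*	cmd.value = devfd;
*	cmd.size = 1;				// 1 = enable, 0 = disable
*	sysctlbyname("vfs.ffs.setbufoutput", NULL, NULL, &cmd, sizeof(cmd));
*	pwrite(devfd, buf, bsize, offset);	// now staged via buffer cache
*
* Writes must be fragment aligned, a multiple of the fragment size, and
* contained within a single filesystem block, as checked below.
*/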
static int
buffered_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *devvp, *vp;
struct inode *ip;
struct buf *bp;
struct fs *fs;
struct filedesc *fdp;
int error;
daddr_t lbn;
/*
* The devvp is associated with the /dev filesystem. To discover
* the filesystem with which the device is associated, we depend
* on the application setting the current directory to a location
* within the filesystem being written. Yes, this is an ugly hack.
*/
devvp = fp->f_vnode;
if (!vn_isdisk(devvp, NULL))
return (EINVAL);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
vp = fdp->fd_cdir;
vref(vp);
FILEDESC_SUNLOCK(fdp);
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Check that the current directory vnode indeed belongs to
* UFS before trying to dereference UFS-specific v_data fields.
*/
if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) {
vput(vp);
return (EINVAL);
}
ip = VTOI(vp);
if (ITODEVVP(ip) != devvp) {
vput(vp);
return (EINVAL);
}
fs = ITOFS(ip);
vput(vp);
foffset_lock_uio(fp, uio, flags);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
if (fsckcmds) {
printf("%s: buffered write for block %jd\n",
fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
}
#endif /* DEBUG */
/*
* All I/O must be contained within a filesystem block, start on
* a fragment boundary, and be a multiple of fragments in length.
*/
if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
fragoff(fs, uio->uio_offset) != 0 ||
fragoff(fs, uio->uio_resid) != 0) {
error = EINVAL;
goto out;
}
lbn = numfrags(fs, uio->uio_offset);
bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
bp->b_flags |= B_RELBUF;
if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
brelse(bp);
goto out;
}
error = bwrite(bp);
out:
VOP_UNLOCK(devvp, 0);
foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
return (error);
}
Index: head/sys/ufs/ffs/ffs_vnops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vnops.c (revision 327172)
+++ head/sys/ufs/ffs/ffs_vnops.c (revision 327173)
@@ -1,1728 +1,1726 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
* from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
* @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"
#define ALIGNED_TO(ptr, s) \
(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
#ifdef DIRECTIO
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t ffs_fdatasync;
static vop_fsync_t ffs_fsync;
static vop_getpages_t ffs_getpages;
static vop_lock1_t ffs_lock;
static vop_read_t ffs_read;
static vop_write_t ffs_write;
static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred);
static vop_strategy_t ffsext_strategy;
static vop_closeextattr_t ffs_closeextattr;
static vop_deleteextattr_t ffs_deleteextattr;
static vop_getextattr_t ffs_getextattr;
static vop_listextattr_t ffs_listextattr;
static vop_openextattr_t ffs_openextattr;
static vop_setextattr_t ffs_setextattr;
static vop_vptofh_t ffs_vptofh;
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_getpages = ffs_getpages,
.vop_getpages_async = vnode_pager_local_getpages_async,
.vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
.vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops1 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */
.vop_vptofh = ffs_vptofh,
};
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_getpages = ffs_getpages,
.vop_getpages_async = vnode_pager_local_getpages_async,
.vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
.vop_closeextattr = ffs_closeextattr,
.vop_deleteextattr = ffs_deleteextattr,
.vop_getextattr = ffs_getextattr,
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
.vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops2 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_lock1 = ffs_lock,
.vop_reallocblks = ffs_reallocblks,
.vop_strategy = ffsext_strategy,
.vop_closeextattr = ffs_closeextattr,
.vop_deleteextattr = ffs_deleteextattr,
.vop_getextattr = ffs_getextattr,
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
.vop_vptofh = ffs_vptofh,
};
/*
* Synch an open file.
*/
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
struct vnode *vp;
struct bufobj *bo;
int error;
vp = ap->a_vp;
bo = &vp->v_bufobj;
retry:
error = ffs_syncvnode(vp, ap->a_waitfor, 0);
if (error)
return (error);
if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
error = softdep_fsync(vp);
if (error)
return (error);
/*
* The softdep_fsync() function may drop vp lock,
* allowing for dirty buffers to reappear on the
* bo_dirty list. Recheck and resync as needed.
*/
BO_LOCK(bo);
if ((vp->v_type == VREG || vp->v_type == VDIR) &&
(bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
BO_UNLOCK(bo);
goto retry;
}
BO_UNLOCK(bo);
}
return (0);
}
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
struct inode *ip;
struct bufobj *bo;
struct buf *bp, *nbp;
ufs_lbn_t lbn;
int error, passes;
bool still_dirty, wait;
ip = VTOI(vp);
ip->i_flag &= ~IN_NEEDSYNC;
bo = &vp->v_bufobj;
/*
* When doing MNT_WAIT we must first flush all dependencies
* on the inode.
*/
if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
(error = softdep_sync_metadata(vp)) != 0)
return (error);
/*
* Flush all dirty buffers associated with a vnode.
*/
error = 0;
passes = 0;
wait = false; /* Always do an async pass first. */
lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
BO_LOCK(bo);
loop:
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
bp->b_vflags &= ~BV_SCANNED;
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
/*
* Reasons to skip this buffer: it has already been considered
* on this pass, the buffer has dependencies that will cause
* it to be redirtied and it has not already been deferred,
* or it is already being written.
*/
if ((bp->b_vflags & BV_SCANNED) != 0)
continue;
bp->b_vflags |= BV_SCANNED;
/*
* Flush indirects in order, if requested.
*
* Note that if only datasync is requested, we can
* skip indirect blocks when softupdates are not
* active. Otherwise we must flush them with data,
* since dependencies prevent data block writes.
*/
if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
(lbn_level(bp->b_lblkno) >= passes ||
((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
continue;
if (bp->b_lblkno > lbn)
panic("ffs_syncvnode: syncing truncated data.");
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
BO_UNLOCK(bo);
} else if (wait) {
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) != 0) {
bp->b_vflags &= ~BV_SCANNED;
goto next;
}
} else
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty");
/*
* Check for dependencies and potentially complete them.
*/
if (!LIST_EMPTY(&bp->b_dep) &&
(error = softdep_sync_buf(vp, bp,
wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
/* I/O error. */
if (error != EBUSY) {
BUF_UNLOCK(bp);
return (error);
}
/* If we deferred once, don't defer again. */
if ((bp->b_flags & B_DEFERRED) == 0) {
bp->b_flags |= B_DEFERRED;
BUF_UNLOCK(bp);
goto next;
}
}
if (wait) {
bremfree(bp);
if ((error = bwrite(bp)) != 0)
return (error);
} else if ((bp->b_flags & B_CLUSTEROK)) {
(void) vfs_bio_awrite(bp);
} else {
bremfree(bp);
(void) bawrite(bp);
}
next:
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
*/
BO_LOCK(bo);
nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
}
if (waitfor != MNT_WAIT) {
BO_UNLOCK(bo);
if ((flags & NO_INO_UPDT) != 0)
return (0);
else
return (ffs_update(vp, 0));
}
/* Drain IO to see if we're done. */
bufobj_wwait(bo, 0, 0);
/*
* Block devices associated with filesystems may have new I/O
* requests posted for them even if the vnode is locked, so no
* amount of trying will get them clean. We make several passes
* as a best effort.
*
* Regular files may need multiple passes to flush all dependency
* work as it is possible that we must write once per indirect
* level, once for the leaf, and once for the inode and each of
* these will be done with one sync and one async pass.
*/
if (bo->bo_dirty.bv_cnt > 0) {
if ((flags & DATA_ONLY) == 0) {
still_dirty = true;
} else {
/*
* For data-only sync, dirty indirect buffers
* are ignored.
*/
still_dirty = false;
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
if (bp->b_lblkno > -UFS_NDADDR) {
still_dirty = true;
break;
}
}
}
if (still_dirty) {
/* Write the inode after sync passes to flush deps. */
if (wait && DOINGSOFTDEP(vp) &&
(flags & NO_INO_UPDT) == 0) {
BO_UNLOCK(bo);
ffs_update(vp, 1);
BO_LOCK(bo);
}
/* switch between sync/async. */
wait = !wait;
if (wait || ++passes < UFS_NIADDR + 2)
goto loop;
#ifdef INVARIANTS
if (!vn_isdisk(vp, NULL))
vn_printf(vp, "ffs_fsync: dirty ");
#endif
}
}
BO_UNLOCK(bo);
error = 0;
if ((flags & DATA_ONLY) == 0) {
if ((flags & NO_INO_UPDT) == 0)
error = ffs_update(vp, 1);
if (DOINGSUJ(vp))
softdep_journal_fsync(VTOI(vp));
}
return (error);
}
static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{
return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}
static int
ffs_lock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
struct thread *a_td;
char *file;
int line;
} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
struct vnode *vp;
int flags;
struct lock *lkp;
int result;
switch (ap->a_flags & LK_TYPE_MASK) {
case LK_SHARED:
case LK_UPGRADE:
case LK_EXCLUSIVE:
vp = ap->a_vp;
flags = ap->a_flags;
for (;;) {
#ifdef DEBUG_VFS_LOCKS
KASSERT(vp->v_holdcnt != 0,
("ffs_lock %p: zero hold count", vp));
#endif
lkp = vp->v_vnlock;
result = _lockmgr_args(lkp, flags, VI_MTX(vp),
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
ap->a_file, ap->a_line);
if (lkp == vp->v_vnlock || result != 0)
break;
/*
* Apparent success, except that the vnode
* mutated between snapshot file vnode and
* regular file vnode while this process
* slept. The lock currently held is not the
* right lock. Release it, and try to get the
* new lock.
*/
(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
ap->a_file, ap->a_line);
if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
(LK_INTERLOCK | LK_NOWAIT))
return (EBUSY);
if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
flags &= ~LK_INTERLOCK;
}
break;
default:
result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
}
return (result);
#else
return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}
/*
* Vnode op for reading.
*/
static int
ffs_read(ap)
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
ssize_t orig_resid;
int error;
int seqcount;
int ioflag;
vp = ap->a_vp;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
if (ap->a_ioflag & IO_EXT)
#ifdef notyet
return (ffs_extread(vp, uio, ioflag));
#else
panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
if ((ioflag & IO_DIRECT) != 0) {
int workdone;
error = ffs_rawread(vp, uio, &workdone);
if (error != 0 || workdone != 0)
return error;
}
#endif
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ)
panic("ffs_read: mode");
if (vp->v_type == VLNK) {
if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
panic("ffs_read: short symlink");
} else if (vp->v_type != VREG && vp->v_type != VDIR)
panic("ffs_read: type %d", vp->v_type);
#endif
orig_resid = uio->uio_resid;
KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
if (orig_resid == 0)
return (0);
KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
fs = ITOFS(ip);
if (uio->uio_offset < ip->i_size &&
uio->uio_offset >= fs->fs_maxfilesize)
return (EOVERFLOW);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
/*
* Size of buffer. The buffer representing the
* end of the file is rounded up to the size of
* the block type (fragment or full block, as
* appropriate).
*/
size = blksize(fs, ip, lbn);
blkoffset = blkoff(fs, uio->uio_offset);
/*
* The amount we want to transfer in this iteration is
* one FS block less the amount of the data before
* our startpoint (duh!)
*/
xfersize = fs->fs_bsize - blkoffset;
/*
* But if we actually want less than the block,
* or the file doesn't have a whole block more of data,
* then use the lesser number.
*/
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
if (lblktosize(fs, nextlbn) >= ip->i_size) {
/*
* Don't do readahead if this is the end of the file.
*/
error = bread_gb(vp, lbn, size, NOCRED,
GB_UNMAPPED, &bp);
} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
/*
* Otherwise if we are allowed to cluster,
* grab as much as we can.
*
* XXX This may not be a win if we are not
* doing sequential access.
*/
error = cluster_read(vp, ip->i_size, lbn,
size, NOCRED, blkoffset + uio->uio_resid,
seqcount, GB_UNMAPPED, &bp);
} else if (seqcount > 1) {
/*
* If we are NOT allowed to cluster, then
* if we appear to be acting sequentially,
* fire off a request for a readahead
* as well as a read. Note that the 4th and 5th
* arguments point to arrays of the size specified in
* the 6th argument.
*/
u_int nextsize = blksize(fs, ip, nextlbn);
error = breadn_flags(vp, lbn, size, &nextlbn,
&nextsize, 1, NOCRED, GB_UNMAPPED, NULL, &bp);
} else {
/*
* Failing all of the above, just read what the
* user asked for. Interestingly, the same as
* the first option above.
*/
error = bread_gb(vp, lbn, size, NOCRED,
GB_UNMAPPED, &bp);
}
if (error) {
brelse(bp);
bp = NULL;
break;
}
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
if (buf_mapped(bp)) {
error = vn_io_fault_uiomove((char *)bp->b_data +
blkoffset, (int)xfersize, uio);
} else {
error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
(int)xfersize, uio);
}
if (error)
break;
vfs_bio_brelse(bp, ioflag);
}
/*
* This can only happen in the case of an error,
* because the loop above resets bp to NULL on each iteration
* and never sets a new value into it on normal completion,
* so it must have come from a 'break' statement.
*/
if (bp != NULL)
vfs_bio_brelse(bp, ioflag);
if ((error == 0 || uio->uio_resid != orig_resid) &&
(vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
(ip->i_flag & IN_ACCESS) == 0) {
VI_LOCK(vp);
ip->i_flag |= IN_ACCESS;
VI_UNLOCK(vp);
}
return (error);
}
/*
* Vnode op for writing.
*/
static int
ffs_write(ap)
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
struct vnode *vp;
struct uio *uio;
struct inode *ip;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn;
off_t osize;
ssize_t resid;
int seqcount;
int blkoffset, error, flags, ioflag, size, xfersize;
vp = ap->a_vp;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
if (ap->a_ioflag & IO_EXT)
#ifdef notyet
return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
panic("ffs_write+IO_EXT");
#endif
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE)
panic("ffs_write: mode");
#endif
switch (vp->v_type) {
case VREG:
if (ioflag & IO_APPEND)
uio->uio_offset = ip->i_size;
if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
return (EPERM);
/* FALLTHROUGH */
case VLNK:
break;
case VDIR:
panic("ffs_write: dir write");
break;
default:
panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
(int)uio->uio_offset,
(int)uio->uio_resid
);
}
KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
fs = ITOFS(ip);
if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
return (EFBIG);
/*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, I don't think it matters.
*/
if (vn_rlimit_fsize(vp, uio, uio->uio_td))
return (EFBIG);
resid = uio->uio_resid;
osize = ip->i_size;
if (seqcount > BA_SEQMAX)
flags = BA_SEQMAX << BA_SEQSHIFT;
else
flags = seqcount << BA_SEQSHIFT;
if (ioflag & IO_SYNC)
flags |= IO_SYNC;
flags |= BA_UNMAPPED;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (uio->uio_offset + xfersize > ip->i_size)
vnode_pager_setsize(vp, uio->uio_offset + xfersize);
/*
* We must perform a read-before-write if the transfer size
* does not cover the entire buffer.
*/
if (fs->fs_bsize > xfersize)
flags |= BA_CLRBUF;
else
flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
ap->a_cred, flags, &bp);
if (error != 0) {
vnode_pager_setsize(vp, ip->i_size);
break;
}
if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
bp->b_flags |= B_NOCACHE;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
DIP_SET(ip, i_size, ip->i_size);
}
size = blksize(fs, ip, lbn) - bp->b_resid;
if (size < xfersize)
xfersize = size;
if (buf_mapped(bp)) {
error = vn_io_fault_uiomove((char *)bp->b_data +
blkoffset, (int)xfersize, uio);
} else {
error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
(int)xfersize, uio);
}
/*
* If the buffer is not already filled and we encounter an
* error while trying to fill it, we have to clear out any
* garbage data from the pages instantiated for the buffer.
* If we do not, a failed uiomove() during a write can leave
* the prior contents of the pages exposed to a userland mmap.
*
* Note that we need only clear buffers with a transfer size
* equal to the block size because buffers with a shorter
* transfer size were cleared above by the call to UFS_BALLOC()
* with the BA_CLRBUF flag set.
*
* If the source region for uiomove identically mmaps the
* buffer, uiomove() performed the NOP copy, and the buffer
* content remains valid because the page fault handler
* validated the pages.
*/
if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
fs->fs_bsize == xfersize)
vfs_bio_clrbuf(bp);
vfs_bio_set_flags(bp, ioflag);
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
(ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->fs_bsize) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
cluster_write(vp, bp, ip->i_size, seqcount,
GB_UNMAPPED);
} else {
bawrite(bp);
}
} else if (ioflag & IO_DIRECT) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
}
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
ap->a_cred) {
if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
ip->i_mode &= ~(ISUID | ISGID);
DIP_SET(ip, i_mode, ip->i_mode);
}
}
if (error) {
if (ioflag & IO_UNIT) {
(void)ffs_truncate(vp, osize,
IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
}
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
error = ffs_update(vp, 1);
return (error);
}
/*
* Extended attribute area reading.
*/
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
ssize_t orig_resid;
int error;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_extread: mode");
#endif
orig_resid = uio->uio_resid;
KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
if (orig_resid == 0)
return (0);
KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
/*
* Size of buffer. The buffer representing the
* end of the file is rounded up to the size of
* the block type (fragment or full block, as
* appropriate).
*/
size = sblksize(fs, dp->di_extsize, lbn);
blkoffset = blkoff(fs, uio->uio_offset);
/*
* The amount we want to transfer in this iteration is
* one FS block less the amount of the data before
* our startpoint (duh!)
*/
xfersize = fs->fs_bsize - blkoffset;
/*
* But if we actually want less than the block,
* or the file doesn't have a whole block more of data,
* then use the lesser number.
*/
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
/*
* Don't do readahead if this is the end of the extended attribute data.
*/
error = bread(vp, -1 - lbn, size, NOCRED, &bp);
} else {
/*
* If we have a second block, then
* fire off a request for a readahead
* as well as a read. Note that the 4th and 5th
* arguments point to arrays of the size specified in
* the 6th argument.
*/
u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
nextlbn = -1 - nextlbn;
error = breadn(vp, -1 - lbn,
size, &nextlbn, &nextsize, 1, NOCRED, &bp);
}
if (error) {
brelse(bp);
bp = NULL;
break;
}
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset,
(int)xfersize, uio);
if (error)
break;
vfs_bio_brelse(bp, ioflag);
}
/*
* This can only happen in the case of an error,
* because the loop above resets bp to NULL on each iteration
* and never sets a new value into it on normal completion,
* so it must have come from a 'break' statement.
*/
if (bp != NULL)
vfs_bio_brelse(bp, ioflag);
return (error);
}
/*
* Extended attribute area writing.
*/
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn;
off_t osize;
ssize_t resid;
int blkoffset, error, flags, size, xfersize;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_extwrite: mode");
#endif
if (ioflag & IO_APPEND)
uio->uio_offset = dp->di_extsize;
KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
if ((uoff_t)uio->uio_offset + uio->uio_resid >
UFS_NXADDR * fs->fs_bsize)
return (EFBIG);
resid = uio->uio_resid;
osize = dp->di_extsize;
flags = IO_EXT;
if (ioflag & IO_SYNC)
flags |= IO_SYNC;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
/*
* We must perform a read-before-write if the transfer size
* does not cover the entire buffer.
*/
if (fs->fs_bsize > xfersize)
flags |= BA_CLRBUF;
else
flags &= ~BA_CLRBUF;
error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
ucred, flags, &bp);
if (error != 0)
break;
/*
* If the buffer is not valid we have to clear out any
* garbage data from the pages instantiated for the buffer.
* If we do not, a failed uiomove() during a write can leave
* the prior contents of the pages exposed to a userland
* mmap(). XXX deal with uiomove() errors a better way.
*/
if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
vfs_bio_clrbuf(bp);
if (uio->uio_offset + xfersize > dp->di_extsize)
dp->di_extsize = uio->uio_offset + xfersize;
size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
if (size < xfersize)
xfersize = size;
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
vfs_bio_set_flags(bp, ioflag);
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
xfersize + blkoffset == fs->fs_bsize ||
(ioflag & (IO_ASYNC | IO_DIRECT)))
bawrite(bp);
else
bdwrite(bp);
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
ip->i_mode &= ~(ISUID | ISGID);
dp->di_mode = ip->i_mode;
}
}
if (error) {
if (ioflag & IO_UNIT) {
(void)ffs_truncate(vp, osize,
IO_EXT | (ioflag&IO_SYNC), ucred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
}
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
error = ffs_update(vp, 1);
return (error);
}
/*
* Vnode operation to retrieve a named extended attribute.
*
* Locate a particular EA (nspace:name) in the area (ptr:length), and return
* the length of the EA, and possibly the pointer to the entry and to the data.
*/
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
struct extattr **eapp, u_char **eac)
{
struct extattr *eap, *eaend;
size_t nlen;
nlen = strlen(name);
KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
eap = (struct extattr *)ptr;
eaend = (struct extattr *)(ptr + length);
for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
/* make sure this entry is complete */
if (EXTATTR_NEXT(eap) > eaend)
break;
if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
|| memcmp(eap->ea_name, name, nlen) != 0)
continue;
if (eapp != NULL)
*eapp = eap;
if (eac != NULL)
*eac = EXTATTR_CONTENT(eap);
return (EXTATTR_CONTENT_SIZE(eap));
}
return (-1);
}
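/*
* Read the entire extended attribute area of the vnode into a newly
* allocated buffer, leaving "extra" bytes of slack at the end for
* callers that intend to grow it.  On success the buffer is returned
* through *p and must be freed by the caller.
*/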
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct uio luio;
struct iovec liovec;
u_int easize;
int error;
u_char *eae;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
easize = dp->di_extsize;
if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
return (EFBIG);
eae = malloc(easize + extra, M_TEMP, M_WAITOK);
liovec.iov_base = eae;
liovec.iov_len = easize;
luio.uio_iov = &liovec;
luio.uio_iovcnt = 1;
luio.uio_offset = 0;
luio.uio_resid = easize;
luio.uio_segflg = UIO_SYSSPACE;
luio.uio_rw = UIO_READ;
luio.uio_td = td;
error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
if (error) {
free(eae, M_TEMP);
return(error);
}
*p = eae;
return (0);
}
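/*
* Serialize access to the in-memory extended attribute area.  The
* IN_EA_LOCKED flag acts as the lock; waiters set IN_EA_LOCKWAIT and
* sleep on i_ea_refs under the vnode interlock until it is released.
*/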
static void
ffs_lock_ea(struct vnode *vp)
{
struct inode *ip;
ip = VTOI(vp);
VI_LOCK(vp);
while (ip->i_flag & IN_EA_LOCKED) {
ip->i_flag |= IN_EA_LOCKWAIT;
msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
0);
}
ip->i_flag |= IN_EA_LOCKED;
VI_UNLOCK(vp);
}
static void
ffs_unlock_ea(struct vnode *vp)
{
struct inode *ip;
ip = VTOI(vp);
VI_LOCK(vp);
if (ip->i_flag & IN_EA_LOCKWAIT)
wakeup(&ip->i_ea_refs);
ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
VI_UNLOCK(vp);
}
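/*
* Open a reference to the extended attribute area: load it into memory
* on first use, or just bump i_ea_refs if another consumer already has
* it loaded.
*/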
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
struct inode *ip;
struct ufs2_dinode *dp;
int error;
ip = VTOI(vp);
ffs_lock_ea(vp);
if (ip->i_ea_area != NULL) {
ip->i_ea_refs++;
ffs_unlock_ea(vp);
return (0);
}
dp = ip->i_din2;
error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
if (error) {
ffs_unlock_ea(vp);
return (error);
}
ip->i_ea_len = dp->di_extsize;
ip->i_ea_error = 0;
ip->i_ea_refs++;
ffs_unlock_ea(vp);
return (0);
}
/*
* Vnode extattr transaction commit/abort
*/
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
struct inode *ip;
struct uio luio;
struct iovec liovec;
int error;
struct ufs2_dinode *dp;
ip = VTOI(vp);
ffs_lock_ea(vp);
if (ip->i_ea_area == NULL) {
ffs_unlock_ea(vp);
return (EINVAL);
}
dp = ip->i_din2;
error = ip->i_ea_error;
if (commit && error == 0) {
ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
if (cred == NOCRED)
cred = vp->v_mount->mnt_cred;
liovec.iov_base = ip->i_ea_area;
liovec.iov_len = ip->i_ea_len;
luio.uio_iov = &liovec;
luio.uio_iovcnt = 1;
luio.uio_offset = 0;
luio.uio_resid = ip->i_ea_len;
luio.uio_segflg = UIO_SYSSPACE;
luio.uio_rw = UIO_WRITE;
luio.uio_td = td;
/* XXX: I'm not happy about truncating to zero size */
if (ip->i_ea_len < dp->di_extsize)
error = ffs_truncate(vp, 0, IO_EXT, cred);
error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
}
if (--ip->i_ea_refs == 0) {
free(ip->i_ea_area, M_TEMP);
ip->i_ea_area = NULL;
ip->i_ea_len = 0;
ip->i_ea_error = 0;
}
ffs_unlock_ea(vp);
return (error);
}
/*
* Vnode extattr strategy routine for fifos.
*
* Buffers with negative logical block numbers address the UFS2 external
* attribute area and must be passed to the regular UFS strategy routine;
* ordinary fifo I/O falls through to the fifo vnode operations.
*/
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct buf *a_bp;
};
*/
{
struct vnode *vp;
daddr_t lbn;
vp = ap->a_vp;
lbn = ap->a_bp->b_lblkno;
if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
if (vp->v_type == VFIFO)
return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
panic("spec nodes went here");
}
/*
* Vnode extattr transaction start.
*/
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}
/*
* Vnode extattr transaction commit/abort
*/
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_commit;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
return (EROFS);
return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
/*
* Vnode operation to remove a named attribute.
*/
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
- struct fs *fs;
struct extattr *eap;
uint32_t ul;
int olen, error, i, easize;
u_char *eae;
void *tmp;
ip = VTOI(ap->a_vp);
- fs = ITOFS(ip);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (strlen(ap->a_name) == 0)
return (EINVAL);
if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error) {
/*
* ffs_lock_ea is not needed here, because the vnode
* must be exclusively locked.
*/
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
/* CEM: delete could be done in-place instead */
eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
bcopy(ip->i_ea_area, eae, ip->i_ea_len);
easize = ip->i_ea_len;
olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
&eap, NULL);
if (olen == -1) {
/* requested attribute does not exist */
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (ENOATTR);
}
ul = eap->ea_length;
i = (u_char *)EXTATTR_NEXT(eap) - eae;
bcopy(EXTATTR_NEXT(eap), eap, easize - i);
easize -= ul;
tmp = ip->i_ea_area;
ip->i_ea_area = eae;
ip->i_ea_len = easize;
free(tmp, M_TEMP);
error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
return (error);
}
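/*
* The deletion above compacts the scratch copy in place: with the doomed
* record occupying ul bytes and ending at byte offset i, the remaining
* easize - i bytes are slid down over it and the area shrinks by ul, e.g.
* removing a 32-byte record simply moves the tail down 32 bytes.
*/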
/*
* Vnode operation to retrieve a named extended attribute.
*/
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
u_char *eae, *p;
unsigned easize;
int error, ealen;
ip = VTOI(ap->a_vp);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error)
return (error);
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
eae = ip->i_ea_area;
easize = ip->i_ea_len;
ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
NULL, &p);
if (ealen >= 0) {
error = 0;
if (ap->a_size != NULL)
*ap->a_size = ealen;
else if (ap->a_uio != NULL)
error = uiomove(p, ealen, ap->a_uio);
} else
error = ENOATTR;
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (error);
}
/*
* Vnode operation to list extended attribute names on a vnode.
*/
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
struct extattr *eap, *eaend;
int error, ealen;
ip = VTOI(ap->a_vp);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error)
return (error);
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
error = 0;
if (ap->a_size != NULL)
*ap->a_size = 0;
KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
eap = (struct extattr *)ip->i_ea_area;
eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
/* make sure this entry is complete */
if (EXTATTR_NEXT(eap) > eaend)
break;
if (eap->ea_namespace != ap->a_attrnamespace)
continue;
ealen = eap->ea_namelength;
if (ap->a_size != NULL)
*ap->a_size += ealen + 1;
else if (ap->a_uio != NULL)
error = uiomove(&eap->ea_namelength, ealen + 1,
ap->a_uio);
}
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (error);
}
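#if 0	/* Illustrative userspace sketch; not part of the kernel sources. */
/*
* Consuming the list format produced by ffs_listextattr(): each entry is a
* single length byte followed by that many name bytes, with no NUL
* terminator.  extattr_list_file(2) returns such a buffer; "/tmp/file" and
* the buffer size are hypothetical.
*/
#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	char buf[1024];
	ssize_t len, pos;

	len = extattr_list_file("/tmp/file", EXTATTR_NAMESPACE_USER,
	    buf, sizeof(buf));
	if (len == -1)
		err(1, "extattr_list_file");
	for (pos = 0; pos < len; pos += 1 + (unsigned char)buf[pos])
		printf("%.*s\n", (int)(unsigned char)buf[pos], buf + pos + 1);
	return (0);
}
#endif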
/*
* Vnode operation to set a named attribute.
*/
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
struct fs *fs;
struct extattr *eap;
uint32_t ealength, ul;
ssize_t ealen;
int olen, eapad1, eapad2, error, i, easize;
u_char *eae;
void *tmp;
ip = VTOI(ap->a_vp);
fs = ITOFS(ip);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (strlen(ap->a_name) == 0)
return (EINVAL);
/* XXX: deleting EAs by passing a NULL uio is no longer supported. */
if (ap->a_uio == NULL)
return (EOPNOTSUPP);
if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
ealen = ap->a_uio->uio_resid;
if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
return (EINVAL);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error) {
/*
* ffs_lock_ea is not needed here, because the vnode
* must be exclusively locked.
*/
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
eapad1 = roundup2(ealength, 8) - ealength;
eapad2 = roundup2(ealen, 8) - ealen;
ealength += eapad1 + ealen + eapad2;
/*
* CEM: rewrites of the same size or smaller could be done in-place
* instead. (We don't acquire any fine-grained locks in here either,
* so we could also do bigger writes in-place.)
*/
eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
bcopy(ip->i_ea_area, eae, ip->i_ea_len);
easize = ip->i_ea_len;
olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
&eap, NULL);
if (olen == -1) {
/* new, append at end */
KASSERT(ALIGNED_TO(eae + easize, struct extattr),
("unaligned"));
eap = (struct extattr *)(eae + easize);
easize += ealength;
} else {
ul = eap->ea_length;
i = (u_char *)EXTATTR_NEXT(eap) - eae;
if (ul != ealength) {
bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
easize - i);
easize += (ealength - ul);
}
}
if (easize > lblktosize(fs, UFS_NXADDR)) {
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = ENOSPC;
return (ENOSPC);
}
eap->ea_length = ealength;
eap->ea_namespace = ap->a_attrnamespace;
eap->ea_contentpadlen = eapad2;
eap->ea_namelength = strlen(ap->a_name);
memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
if (error) {
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
tmp = ip->i_ea_area;
ip->i_ea_area = eae;
ip->i_ea_len = easize;
free(tmp, M_TEMP);
error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
return (error);
}
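#if 0	/* Illustrative sketch; not part of the kernel sources. */
/*
* The space consumed by one attribute, as computed at the top of
* ffs_setextattr(), is roundup8(7 + namelen) + roundup8(contentlen): e.g. a
* 4-byte name with 10 bytes of content occupies 16 + 16 = 32 bytes of the
* EA area.
*/
#include <stddef.h>

static size_t
ea_record_size(size_t namelen, size_t contentlen)
{
	size_t hdr = 4 + 3 + namelen;	/* ea_length word, 3 header bytes, name */

	return (((hdr + 7) & ~(size_t)7) + ((contentlen + 7) & ~(size_t)7));
}
#endif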
/*
* Vnode pointer to File handle
*/
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
IN struct vnode *a_vp;
IN struct fid *a_fhp;
};
*/
{
struct inode *ip;
struct ufid *ufhp;
ip = VTOI(ap->a_vp);
ufhp = (struct ufid *)ap->a_fhp;
ufhp->ufid_len = sizeof(struct ufid);
ufhp->ufid_ino = ip->i_number;
ufhp->ufid_gen = ip->i_gen;
return (0);
}
SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
"Always use buffer pager instead of bmap");
static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{
return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}
static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{
return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}
static int
ffs_getpages(struct vop_getpages_args *ap)
{
struct vnode *vp;
struct ufsmount *um;
vp = ap->a_vp;
um = VFSTOUFS(vp->v_mount);
if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
ap->a_rbehind, ap->a_rahead, NULL, NULL));
return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}
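/*
* The knob above is exposed as vfs.ffs.use_buf_pager; CTLFLAG_RWTUN makes it
* settable both at run time and from the loader, e.g.:
*
*	sysctl vfs.ffs.use_buf_pager=0
*	echo 'vfs.ffs.use_buf_pager="0"' >> /boot/loader.conf
*/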
