Index: head/sys/arm/samsung/exynos/chrome_kb.c
===================================================================
--- head/sys/arm/samsung/exynos/chrome_kb.c (revision 356654)
+++ head/sys/arm/samsung/exynos/chrome_kb.c (revision 356655)
@@ -1,918 +1,918 @@
/*-
* Copyright (c) 2014 Ruslan Bukin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Samsung Chromebook Keyboard
*/
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "gpio_if.h"
#include
#include
#include
#include
/* Driver state is serialized with the Giant mutex. */
#define CKB_LOCK() mtx_lock(&Giant)
#define CKB_UNLOCK() mtx_unlock(&Giant)
#ifdef INVARIANTS
/*
* Assert that the lock is held in all contexts
* where the code can be executed.
*/
#define CKB_LOCK_ASSERT() mtx_assert(&Giant, MA_OWNED)
/*
* Assert that the lock is held in the contexts
* where it really has to be so.
*/
#define CKB_CTX_LOCK_ASSERT() \
do { \
- if (!kdb_active && panicstr == NULL) \
+ if (!kdb_active && !KERNEL_PANICKED()) \
mtx_assert(&Giant, MA_OWNED); \
} while (0)
#else
#define CKB_LOCK_ASSERT() (void)0
#define CKB_CTX_LOCK_ASSERT() (void)0
#endif
/*
* Define a stub keyboard driver in case one hasn't been
* compiled into the kernel
*/
#include
#include
#include
#define CKB_NFKEY 12
#define CKB_FLAG_COMPOSE 0x1
#define CKB_FLAG_POLLING 0x2
#define KBD_DRIVER_NAME "ckbd"
/* Per-device state of the Chrome EC keyboard driver. */
struct ckb_softc {
/* generic keyboard instance handed to kbd_register() */
keyboard_t sc_kbd;
/* private copies of the translation tables given to kbd_set_maps() */
keymap_t sc_keymap;
accentmap_t sc_accmap;
fkeytab_t sc_fkeymap[CKB_NFKEY];
/* NOTE(review): the next three fields are not referenced in the visible code */
struct resource* sc_mem_res;
struct resource* sc_irq_res;
void* sc_intr_hl;
int sc_mode; /* input mode (K_XLATE,K_RAW,K_CODE) */
int sc_state; /* shift/lock key state */
int sc_accents; /* accent key index (> 0) */
int sc_flags; /* flags */
/* software key repeat: timer, key code to repeat (-1 = none), pending flag */
struct callout sc_repeat_callout;
int sc_repeat_key;
int sc_repeating;
/* NOTE(review): 'flag' is not referenced in the visible code */
int flag;
/* matrix geometry and interrupt GPIO pin, all read from the FDT node */
int rows;
int cols;
int gpio;
device_t dev;
/* GPIO controller whose pin is sampled while in polling mode */
device_t gpio_dev;
/* thread that last switched polling mode on */
struct thread *sc_poll_thread;
/* rows * cols table mapping a matrix position to a key code */
uint16_t *keymap;
/* per-column key state already reported to the consumer */
uint8_t *scan_local;
/* per-column key state most recently fetched from the EC */
uint8_t *scan;
};
/* prototypes */
static int ckb_set_typematic(keyboard_t *, int);
static uint32_t ckb_read_char(keyboard_t *, int);
static void ckb_clear_state(keyboard_t *);
static int ckb_ioctl(keyboard_t *, u_long, caddr_t);
static int ckb_enable(keyboard_t *);
static int ckb_disable(keyboard_t *);
/*
 * Key-repeat callout handler: marks a repeat as pending and asks the
 * keyboard consumer to come and fetch it.
 */
static void
ckb_repeat(void *arg)
{
	struct ckb_softc *softc = arg;

	/* Nobody is listening. */
	if (!KBD_IS_ACTIVE(&softc->sc_kbd) || !KBD_IS_BUSY(&softc->sc_kbd))
		return;
	/* No key is currently held for repeating. */
	if (softc->sc_repeat_key == -1)
		return;

	softc->sc_repeating = 1;
	softc->sc_kbd.kb_callback.kc_func(&softc->sc_kbd,
	    KBDIO_KEYINPUT, softc->sc_kbd.kb_callback.kc_arg);
}
/* Keyboard detection hook; unused by this driver, always fails. */
static int
ckb__probe(int unit, void *arg, int flags)
{
	(void)unit;
	(void)arg;
	(void)flags;

	return (ENXIO);
}
/* Reset/initialize hook; unused by this driver, always fails. */
static int
ckb_init(int unit, keyboard_t **kbdp, void *arg, int flags)
{
	(void)unit;
	(void)kbdp;
	(void)arg;
	(void)flags;

	return (ENXIO);
}
/* Interface self-test hook; unused, always reports success. */
static int
ckb_test_if(keyboard_t *kbd)
{
	(void)kbd;

	return (0);
}
/* Termination hook; unused by this driver, always fails. */
static int
ckb_term(keyboard_t *kbd)
{
	(void)kbd;

	return (ENXIO);
}
/* Generic keyboard interrupt hook; unused, does nothing. */
static int
ckb_intr(keyboard_t *kbd, void *arg)
{
	(void)kbd;
	(void)arg;

	return (0);
}
/* Keyboard access lock hook; unused, always returns 1. */
static int
ckb_lock(keyboard_t *kbd, int lock)
{
	(void)kbd;
	(void)lock;

	return (1);
}
/*
 * Reset the translation state of the keyboard: drops the compose and
 * polling flags and any pending accent, keeping only the lock-key bits.
 */
static void
ckb_clear_state(keyboard_t *kbd)
{
	struct ckb_softc *softc = kbd->kb_data;

	CKB_CTX_LOCK_ASSERT();

	softc->sc_accents = 0;
	softc->sc_state &= LOCK_MASK;	/* preserve locking key state */
	softc->sc_flags &= ~CKB_FLAG_COMPOSE;
	softc->sc_flags &= ~CKB_FLAG_POLLING;
}
/*
 * State-save hook; unused.  Returns 1 for a zero-length request and
 * -1 for anything else.
 */
static int
ckb_get_state(keyboard_t *kbd, void *buf, size_t len)
{
	(void)kbd;
	(void)buf;

	if (len == 0)
		return (1);
	return (-1);
}
/* State-restore hook; unused, always rejected with EINVAL. */
static int
ckb_set_state(keyboard_t *kbd, void *buf, size_t len)
{
	(void)kbd;
	(void)buf;
	(void)len;

	return (EINVAL);
}
/*
 * Tell whether input may be waiting.
 *
 * Returns 0 for an inactive keyboard.  Otherwise returns 1 when polling
 * mode is on, when the freshly scanned matrix differs from the state
 * already reported, or when a key repeat is pending; 0 if none apply.
 */
static int
ckb_check(keyboard_t *kbd)
{
	struct ckb_softc *softc = kbd->kb_data;
	int col;

	CKB_CTX_LOCK_ASSERT();

	if (!KBD_IS_ACTIVE(kbd))
		return (0);
	if ((softc->sc_flags & CKB_FLAG_POLLING) != 0)
		return (1);

	col = 0;
	while (col < softc->cols) {
		if (softc->scan_local[col] != softc->scan[col])
			return (1);
		col++;
	}

	return (softc->sc_repeating ? 1 : 0);
}
/* Lock-held variant of the "is a character waiting" query. */
static int
ckb_check_char_locked(keyboard_t *kbd)
{
	CKB_CTX_LOCK_ASSERT();

	if (KBD_IS_ACTIVE(kbd))
		return (ckb_check(kbd));
	return (0);
}
/* Takes the driver lock and asks whether a character is waiting. */
static int
ckb_check_char(keyboard_t *kbd)
{
	int pending;

	CKB_LOCK();
	pending = ckb_check_char_locked(kbd);
	CKB_UNLOCK();

	return (pending);
}
/*
 * Raw single-byte read; currently unused and not implemented.
 * Returns -1 for an inactive keyboard, otherwise logs a reminder
 * and returns 0.
 */
static int
ckb_read(keyboard_t *kbd, int wait)
{
	(void)wait;

	CKB_CTX_LOCK_ASSERT();

	if (KBD_IS_ACTIVE(kbd)) {
		printf("Implement ME: %s\n", __func__);
		return (0);
	}
	return (-1);
}
/*
 * Look up the key code stored for matrix position (col, row).
 * Positions outside the matrix yield 0.
 */
static uint16_t
keymap_read(struct ckb_softc *sc, int col, int row)
{
	KASSERT(sc->keymap != NULL, ("keymap_read: no keymap"));

	if (col < 0 || col >= sc->cols)
		return (0);
	if (row < 0 || row >= sc->rows)
		return (0);

	return (sc->keymap[row * sc->cols + col]);
}
/*
 * Store 'key' for matrix position (col, row).
 * Returns 0 on success and -1 when the position is outside the matrix.
 */
static int
keymap_write(struct ckb_softc *sc, int col, int row, uint16_t key)
{
	KASSERT(sc->keymap != NULL, ("keymap_write: no keymap"));

	if (col < 0 || col >= sc->cols)
		return (-1);
	if (row < 0 || row >= sc->rows)
		return (-1);

	sc->keymap[row * sc->cols + col] = key;
	return (0);
}
/*
 * Read the next key event from the keyboard (lock held by the caller).
 *
 * A pending key repeat is delivered first.  In polling mode the interrupt
 * GPIO is sampled and, once it reads 0, a fresh matrix scan is fetched
 * from the EC; 'wait' selects whether to spin until that happens.  The
 * fresh scan is then compared bit by bit with the state already reported
 * and the first mapped change is returned: the key code on press, the key
 * code with 0x80 set on release.
 *
 * Returns NOKEY when the keyboard is inactive, no change is found, or the
 * EC/GPIO access fails.
 */
static uint32_t
ckb_read_char_locked(keyboard_t *kbd, int wait)
{
	struct ckb_softc *sc;
	int i, j;
	uint16_t key;
	int oldbit;
	int newbit;
	int status;

	sc = kbd->kb_data;

	CKB_CTX_LOCK_ASSERT();

	if (!KBD_IS_ACTIVE(kbd))
		return (NOKEY);

	if (sc->sc_repeating) {
		sc->sc_repeating = 0;
		callout_reset(&sc->sc_repeat_callout, hz / 10,
		    ckb_repeat, sc);
		return (sc->sc_repeat_key);
	}

	if (sc->sc_flags & CKB_FLAG_POLLING) {
		for (;;) {
			/*
			 * Never act on an uninitialized pin value: a failed
			 * read is treated like "no data pending".
			 */
			status = 1;
			if (GPIO_PIN_GET(sc->gpio_dev, sc->gpio, &status) != 0)
				status = 1;

			if (status == 0) {
				if (ec_command(EC_CMD_MKBP_STATE, sc->scan,
				    sc->cols,
				    sc->scan, sc->cols)) {
					return (NOKEY);
				}
				break;
			}

			if (!wait) {
				return (NOKEY);
			}

			DELAY(1000);
		}
	}

	for (i = 0; i < sc->cols; i++) {
		for (j = 0; j < sc->rows; j++) {
			oldbit = (sc->scan_local[i] & (1 << j));
			newbit = (sc->scan[i] & (1 << j));

			if (oldbit == newbit)
				continue;

			/*
			 * Record the transition even when the position has
			 * no key code, otherwise ckb_check() would keep
			 * reporting pending input for as long as an
			 * unmapped key is held.
			 */
			if (newbit != 0)
				sc->scan_local[i] |= (1 << j);
			else
				sc->scan_local[i] &= ~(1 << j);

			key = keymap_read(sc, i, j);
			if (key == 0) {
				continue;
			}

			if (newbit != 0) {
				/* key pressed: arm the repeat timer */
				sc->sc_repeat_key = key;
				callout_reset(&sc->sc_repeat_callout,
				    hz / 2, ckb_repeat, sc);
			} else {
				/* key released: flag it and cancel repeat */
				key |= 0x80;
				sc->sc_repeat_key = -1;
				callout_stop(&sc->sc_repeat_callout);
			}

			return (key);
		}
	}

	return (NOKEY);
}
/*
 * Locking wrapper around ckb_read_char_locked().
 * Currently wait is always false.
 */
static uint32_t
ckb_read_char(keyboard_t *kbd, int wait)
{
	uint32_t event;

	CKB_LOCK();
	event = ckb_read_char_locked(kbd, wait);
	CKB_UNLOCK();

	return (event);
}
/* some useful control functions */
/*
 * Handle a keyboard ioctl; the lock is asserted via CKB_LOCK_ASSERT().
 *
 * 'arg' is interpreted as a pointer to int (two ints for KDSETREPEAT).
 * Returns 0 on success, EINVAL for a rejected argument, or whatever the
 * delegated handler (ckb_ioctl, ckb_set_typematic, genkbd_commonioctl)
 * returns.
 */
static int
ckb_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg)
{
struct ckb_softc *sc;
int i;
sc = kbd->kb_data;
CKB_LOCK_ASSERT();
switch (cmd) {
case KDGKBMODE: /* get keyboard mode */
*(int *)arg = sc->sc_mode;
break;
case KDSKBMODE: /* set keyboard mode */
switch (*(int *)arg) {
case K_XLATE:
if (sc->sc_mode != K_XLATE) {
/* make lock key state and LED state match */
sc->sc_state &= ~LOCK_MASK;
sc->sc_state |= KBD_LED_VAL(kbd);
}
/* FALLTHROUGH */
case K_RAW:
case K_CODE:
/* on an actual mode change, reset state unless polling is on */
if (sc->sc_mode != *(int *)arg) {
if ((sc->sc_flags & CKB_FLAG_POLLING) == 0)
ckb_clear_state(kbd);
sc->sc_mode = *(int *)arg;
}
break;
default:
return (EINVAL);
}
break;
case KDGETLED: /* get keyboard LED */
*(int *)arg = KBD_LED_VAL(kbd);
break;
case KDSETLED: /* set keyboard LED */
/* NOTE: lock key state in "sc_state" won't be changed */
if (*(int *)arg & ~LOCK_MASK)
return (EINVAL);
i = *(int *)arg;
/* replace CAPS LED with ALTGR LED for ALTGR keyboards */
if (sc->sc_mode == K_XLATE &&
kbd->kb_keymap->n_keys > ALTGR_OFFSET) {
if (i & ALKED)
i |= CLKED;
else
i &= ~CLKED;
}
/*
 * NOTE(review): the adjusted value in 'i' is never consumed because
 * the hardware LED update below is still an empty placeholder.
 */
if (KBD_HAS_DEVICE(kbd)) {
/* Configure LED */
}
KBD_LED_VAL(kbd) = *(int *)arg;
break;
case KDGKBSTATE: /* get lock key state */
*(int *)arg = sc->sc_state & LOCK_MASK;
break;
case KDSKBSTATE: /* set lock key state */
if (*(int *)arg & ~LOCK_MASK) {
return (EINVAL);
}
sc->sc_state &= ~LOCK_MASK;
sc->sc_state |= *(int *)arg;
/* set LEDs and quit */
/* goes through ckb_ioctl(), so CKB_LOCK() is taken again on top */
return (ckb_ioctl(kbd, KDSETLED, arg));
case KDSETREPEAT: /* set keyboard repeat rate (new
* interface) */
if (!KBD_HAS_DEVICE(kbd)) {
return (0);
}
if (((int *)arg)[1] < 0) {
return (EINVAL);
}
if (((int *)arg)[0] < 0) {
return (EINVAL);
}
/* arg[0] becomes kb_delay1 (clamped to >= 200), arg[1] kb_delay2 */
if (((int *)arg)[0] < 200) /* fastest possible value */
kbd->kb_delay1 = 200;
else
kbd->kb_delay1 = ((int *)arg)[0];
kbd->kb_delay2 = ((int *)arg)[1];
return (0);
case KDSETRAD: /* set keyboard repeat rate (old
* interface) */
return (ckb_set_typematic(kbd, *(int *)arg));
case PIO_KEYMAP: /* set keyboard translation table */
case OPIO_KEYMAP: /* set keyboard translation table
* (compat) */
case PIO_KEYMAPENT: /* set keyboard translation table
* entry */
case PIO_DEADKEYMAP: /* set accent key translation table */
/* a table is being replaced: forget any pending accent first */
sc->sc_accents = 0;
/* FALLTHROUGH */
default:
return (genkbd_commonioctl(kbd, cmd, arg));
}
return (0);
}
/*
 * Locking front end for ckb_ioctl_locked().
 *
 * Returns EDEADLK for the state/LED ioctls when Giant cannot be safely
 * acquired, otherwise the result of ckb_ioctl_locked().
 */
static int
ckb_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg)
{
	int error;

	/*
	 * XXX KDGKBSTATE, KDSKBSTATE and KDSETLED can be called from any
	 * context where printf(9) can be called, which among other things
	 * includes interrupt filters and threads with any kinds of locks
	 * already held.  For this reason it would be dangerous to acquire
	 * the Giant here unconditionally.  On the other hand we have to
	 * have it to handle the ioctl.
	 * So we make our best effort to auto-detect whether we can grab
	 * the Giant or not.  Blame syscons(4) for this.
	 */
	if (cmd == KDGKBSTATE || cmd == KDSKBSTATE || cmd == KDSETLED) {
		if (!mtx_owned(&Giant) && !SCHEDULER_STOPPED())
			return (EDEADLK);	/* best I could come up with */
	}

	CKB_LOCK();
	error = ckb_ioctl_locked(kbd, cmd, arg);
	CKB_UNLOCK();

	return (error);
}
/*
 * Activate the keyboard.  Until this has been called the read paths
 * treat the keyboard as inactive and report no input.
 */
static int
ckb_enable(keyboard_t *kbd)
{

	CKB_LOCK();
	KBD_ACTIVATE(kbd);
	CKB_UNLOCK();

	return (0);
}
/* Deactivate the keyboard; the read paths then report no input. */
static int
ckb_disable(keyboard_t *kbd)
{

	CKB_LOCK();
	KBD_DEACTIVATE(kbd);
	CKB_UNLOCK();

	return (0);
}
/* local functions */
/*
 * Decode a 7-bit typematic code: bits 5-6 select kb_delay1 from the
 * delay table, bits 0-4 select kb_delay2 from the rate table.
 * Returns EINVAL when any higher bit is set, 0 otherwise.
 */
static int
ckb_set_typematic(keyboard_t *kbd, int code)
{
	static const int delay_tab[] = {250, 500, 750, 1000};
	static const int rate_tab[] = {34, 38, 42, 46, 50, 55, 59, 63,
	    68, 76, 84, 92, 100, 110, 118, 126,
	    136, 152, 168, 184, 200, 220, 236, 252,
	    272, 304, 336, 368, 400, 440, 472, 504};
	int delay_idx;
	int rate_idx;

	if ((code & ~0x7f) != 0)
		return (EINVAL);

	delay_idx = (code >> 5) & 3;
	rate_idx = code & 0x1f;

	kbd->kb_delay1 = delay_tab[delay_idx];
	kbd->kb_delay2 = rate_tab[rate_idx];
	return (0);
}
/*
 * Switch polling mode on or off.  Turning it on also records the
 * calling thread in the softc.
 */
static int
ckb_poll(keyboard_t *kbd, int on)
{
	struct ckb_softc *softc = kbd->kb_data;

	CKB_LOCK();
	if (!on) {
		softc->sc_flags &= ~CKB_FLAG_POLLING;
	} else {
		softc->sc_flags |= CKB_FLAG_POLLING;
		softc->sc_poll_thread = curthread;
	}
	CKB_UNLOCK();

	return (0);
}
/* local functions */
static int dummy_kbd_configure(int flags);
/* Method table through which the generic keyboard layer calls this driver. */
keyboard_switch_t ckbdsw = {
	/* lifecycle hooks (stubs in this driver) */
	.probe = ckb__probe,
	.init = ckb_init,
	.term = ckb_term,
	.intr = ckb_intr,
	.test_if = ckb_test_if,
	/* activation */
	.enable = ckb_enable,
	.disable = ckb_disable,
	/* input */
	.read = ckb_read,
	.check = ckb_check,
	.read_char = ckb_read_char,
	.check_char = ckb_check_char,
	/* control and state */
	.ioctl = ckb_ioctl,
	.lock = ckb_lock,
	.clear_state = ckb_clear_state,
	.get_state = ckb_get_state,
	.set_state = ckb_set_state,
	.poll = ckb_poll,
};
/* Configure hook passed to KEYBOARD_DRIVER(); nothing to do, returns 0. */
static int
dummy_kbd_configure(int flags)
{
	(void)flags;

	return (0);
}
KEYBOARD_DRIVER(ckbd, ckbdsw, dummy_kbd_configure);
/*
 * Builds sc->keymap from the 'len' entries of 'keymap'.
 * sc->cols and sc->rows must already be set.
 *
 * Returns 0 on success or ENOMEM if the table cannot be allocated.
 */
static int
parse_keymap(struct ckb_softc *sc, pcell_t *keymap, size_t len)
{
	pcell_t entry;
	int idx;

	sc->keymap = malloc(sc->cols * sc->rows * sizeof(sc->keymap[0]),
	    M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->keymap == NULL)
		return (ENOMEM);

	/*
	 * Each entry is laid out as 0xRRCCKKKK:
	 * RR - row number, CC - column number, KKKK - key code.
	 * Entries that fall outside the configured matrix are dropped on
	 * purpose, hence the ignored return value of keymap_write().
	 */
	for (idx = 0; idx < len; idx++) {
		entry = keymap[idx];
		keymap_write(sc, (entry >> 16) & 0xff,
		    (entry >> 24) & 0xff,
		    entry & 0xffff);
	}

	return (0);
}
/*
 * Reads the FDT property 'prop' of 'node' into a newly allocated array.
 *
 * On success returns 0, stores the array in '*keymap' (the caller frees
 * it with free(..., M_DEVBUF)) and its size in BYTES in '*len'.
 * On failure returns ENXIO (property missing, empty, not a whole number
 * of cells, or unreadable) or ENOMEM; nothing is left allocated and
 * '*keymap' is NULL.
 */
static int
read_keymap(phandle_t node, const char *prop, pcell_t **keymap, size_t *len)
{
	ssize_t proplen;
	pcell_t *buf;

	*keymap = NULL;
	*len = 0;

	/*
	 * Keep the length signed until it has been validated; stored into
	 * a size_t first, a negative error value would pass a "<= 0" test.
	 */
	proplen = OF_getproplen(node, prop);
	if (proplen <= 0 || ((size_t)proplen % sizeof(pcell_t)) != 0)
		return (ENXIO);

	buf = malloc(proplen, M_DEVBUF, M_NOWAIT);
	if (buf == NULL)
		return (ENOMEM);

	if (OF_getencprop(node, prop, buf, proplen) != proplen) {
		/* do not leak the buffer when the read fails */
		free(buf, M_DEVBUF);
		return (ENXIO);
	}

	*keymap = buf;
	*len = (size_t)proplen;
	return (0);
}
/*
 * Reads one cell-sized FDT property into 'value'.
 * Never copies more than sizeof(*value) bytes, whatever the property size.
 * Returns 0 on success, ENXIO if the property is missing or too short.
 */
static int
parse_dts_cell(phandle_t node, const char *prop, pcell_t *value)
{

	if (OF_getproplen(node, prop) < (ssize_t)sizeof(*value))
		return (ENXIO);
	if (OF_getencprop(node, prop, value, sizeof(*value)) <= 0)
		return (ENXIO);
	return (0);
}

/*
 * Fills the softc from the device's FDT node: matrix geometry
 * ("google,key-rows", "google,key-columns"), the interrupt GPIO
 * ("freebsd,intr-gpio") and the keymap ("freebsd,keymap", else
 * "linux,keymap", else the built-in default).
 *
 * Returns 0 on success.  On failure returns an errno value; the geometry
 * is validated before the keymap is built, so no keymap is left
 * allocated by a failure detected here.
 */
static int
parse_dts(struct ckb_softc *sc)
{
	phandle_t node;
	pcell_t dts_value;
	pcell_t *keymap;
	size_t keymap_len;
	int ret;
	const char *keymap_prop = NULL;

	if ((node = ofw_bus_get_node(sc->dev)) == -1)
		return (ENXIO);

	if (parse_dts_cell(node, "google,key-rows", &dts_value) != 0)
		return (ENXIO);
	sc->rows = dts_value;

	if (parse_dts_cell(node, "google,key-columns", &dts_value) != 0)
		return (ENXIO);
	sc->cols = dts_value;

	if (parse_dts_cell(node, "freebsd,intr-gpio", &dts_value) != 0)
		return (ENXIO);
	sc->gpio = dts_value;

	/* Reject unusable values before anything is sized by them. */
	if ((sc->rows <= 0) || (sc->cols <= 0) || (sc->gpio == 0))
		return (ENXIO);

	if (OF_hasprop(node, "freebsd,keymap")) {
		keymap_prop = "freebsd,keymap";
		device_printf(sc->dev, "using FreeBSD-specific keymap from FDT\n");
	} else if (OF_hasprop(node, "linux,keymap")) {
		keymap_prop = "linux,keymap";
		device_printf(sc->dev, "using Linux keymap from FDT\n");
	} else {
		device_printf(sc->dev, "using built-in keymap\n");
	}

	if (keymap_prop != NULL) {
		if ((ret = read_keymap(node, keymap_prop, &keymap,
		    &keymap_len))) {
			device_printf(sc->dev,
			    "failed to read keymap from FDT: %d\n", ret);
			return (ret);
		}
		/*
		 * read_keymap() reports the size in bytes while
		 * parse_keymap() indexes whole entries.
		 */
		ret = parse_keymap(sc, keymap,
		    keymap_len / sizeof(keymap[0]));
		free(keymap, M_DEVBUF);
		if (ret) {
			return (ret);
		}
	} else {
		if ((ret = parse_keymap(sc, default_keymap, KEYMAP_LEN))) {
			return (ret);
		}
	}

	return (0);
}
/*
 * EC interrupt handler ('arg' is the softc): fetches a fresh matrix scan
 * from the EC into sc->scan and notifies the keyboard consumer.
 *
 * Does nothing in polling mode, where ckb_read_char_locked() fetches the
 * scan itself.
 */
void
ckb_ec_intr(void *arg)
{
	struct ckb_softc *sc;

	sc = arg;

	if (sc->sc_flags & CKB_FLAG_POLLING)
		return;

	/* A failed transfer leaves nothing trustworthy to report. */
	if (ec_command(EC_CMD_MKBP_STATE, sc->scan, sc->cols,
	    sc->scan, sc->cols) != 0)
		return;

	/*
	 * Only call back when somebody is attached to the keyboard, the
	 * same guard ckb_repeat() applies.  The scan stays in sc->scan, so
	 * ckb_check() still sees the change later.
	 */
	if (KBD_IS_ACTIVE(&sc->sc_kbd) && KBD_IS_BUSY(&sc->sc_kbd)) {
		(sc->sc_kbd.kb_callback.kc_func) (&sc->sc_kbd,
		    KBDIO_KEYINPUT, sc->sc_kbd.kb_callback.kc_arg);
	}
}
/*
 * Attach the Chrome EC keyboard: parse the FDT node, allocate the scan
 * buffers, register the keyboard with the generic keyboard layer and,
 * only once everything is in place, hook the EC interrupt.
 *
 * Returns 0 on success or an errno value; on failure every buffer
 * allocated so far is released and the interrupt is not hooked.
 */
static int
chrome_kb_attach(device_t dev)
{
	struct ckb_softc *sc;
	keyboard_t *kbd;
	int error;
	int i;

	sc = device_get_softc(dev);
	sc->dev = dev;
	sc->keymap = NULL;
	sc->scan_local = NULL;
	sc->scan = NULL;

	if ((error = parse_dts(sc)) != 0)
		goto fail;

	sc->gpio_dev = devclass_get_device(devclass_find("gpio"), 0);
	if (sc->gpio_dev == NULL) {
		device_printf(sc->dev, "Can't find gpio device.\n");
		error = ENXIO;
		goto fail;
	}

#if 0
	device_printf(sc->dev, "Keyboard matrix [%dx%d]\n",
	    sc->cols, sc->rows);
#endif

	/* M_NOWAIT may fail; M_ZERO replaces the manual clearing loop. */
	sc->scan_local = malloc(sc->cols, M_DEVBUF, M_NOWAIT | M_ZERO);
	sc->scan = malloc(sc->cols, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->scan_local == NULL || sc->scan == NULL) {
		error = ENOMEM;
		goto fail;
	}

	/* The read path arms this callout, so set it up before any input. */
	callout_init(&sc->sc_repeat_callout, 0);

	kbd = &sc->sc_kbd;
	kbd_init_struct(kbd, KBD_DRIVER_NAME, KB_OTHER,
	    device_get_unit(dev), 0, 0, 0);
	kbd->kb_data = (void *)sc;

	sc->sc_keymap = key_map;
	sc->sc_accmap = accent_map;
	for (i = 0; i < CKB_NFKEY; i++) {
		sc->sc_fkeymap[i] = fkey_tab[i];
	}

	kbd_set_maps(kbd, &sc->sc_keymap, &sc->sc_accmap,
	    sc->sc_fkeymap, CKB_NFKEY);

	KBD_FOUND_DEVICE(kbd);
	ckb_clear_state(kbd);
	KBD_PROBE_DONE(kbd);

	KBD_INIT_DONE(kbd);
	if (kbd_register(kbd) < 0) {
		error = ENXIO;
		goto fail;
	}
	KBD_CONFIG_DONE(kbd);

	/*
	 * Hook the interrupt last: its handler writes into sc->scan and
	 * calls into the keyboard layer, so both must exist first.
	 */
	pad_setup_intr(sc->gpio, ckb_ec_intr, sc);

	return (0);

fail:
	/* free(9) accepts NULL, so partially initialized state is fine. */
	free(sc->scan, M_DEVBUF);
	free(sc->scan_local, M_DEVBUF);
	free(sc->keymap, M_DEVBUF);
	sc->scan = NULL;
	sc->scan_local = NULL;
	sc->keymap = NULL;
	return (error);
}
/*
 * Match FDT nodes that are enabled and compatible with
 * "google,cros-ec-keyb" or "google,mkbp-keyb".
 */
static int
chrome_kb_probe(device_t dev)
{

	if (!ofw_bus_status_okay(dev))
		return (ENXIO);

	if (!ofw_bus_is_compatible(dev, "google,cros-ec-keyb") &&
	    !ofw_bus_is_compatible(dev, "google,mkbp-keyb"))
		return (ENXIO);

	device_set_desc(dev, "Chrome EC Keyboard");
	return (BUS_PROBE_DEFAULT);
}
/*
 * Detach: releases the keymap table.
 * NOTE(review): the scan buffers, the repeat callout and the keyboard
 * registration are not torn down here -- confirm whether detach is
 * expected to be reachable at all.
 */
static int
chrome_kb_detach(device_t dev)
{
	struct ckb_softc *softc = device_get_softc(dev);

	if (softc->keymap == NULL)
		return 0;

	free(softc->keymap, M_DEVBUF);
	return 0;
}
/* Device interface methods implemented by the Chrome EC keyboard. */
static device_method_t chrome_kb_methods[] = {
DEVMETHOD(device_probe, chrome_kb_probe),
DEVMETHOD(device_attach, chrome_kb_attach),
DEVMETHOD(device_detach, chrome_kb_detach),
/* all-zero entry terminates the table */
{ 0, 0 }
};
/* Driver description: name, method table and per-device softc size. */
static driver_t chrome_kb_driver = {
"chrome_kb",
chrome_kb_methods,
sizeof(struct ckb_softc),
};
static devclass_t chrome_kb_devclass;
DRIVER_MODULE(chrome_kb, simplebus, chrome_kb_driver,
chrome_kb_devclass, 0, 0);
Index: head/sys/arm/versatile/pl050.c
===================================================================
--- head/sys/arm/versatile/pl050.c (revision 356654)
+++ head/sys/arm/versatile/pl050.c (revision 356655)
@@ -1,742 +1,742 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Oleksandr Tymoshenko
* All rights reserved.
*
* Based on dev/usb/input/ukbd.c
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Driver state is serialized with the Giant mutex. */
#define KMI_LOCK() mtx_lock(&Giant)
#define KMI_UNLOCK() mtx_unlock(&Giant)
#ifdef INVARIANTS
/*
* Assert that the lock is held in all contexts
* where the code can be executed.
*/
#define KMI_LOCK_ASSERT() mtx_assert(&Giant, MA_OWNED)
/*
* Assert that the lock is held in the contexts
* where it really has to be so.
*/
#define KMI_CTX_LOCK_ASSERT() \
do { \
- if (!kdb_active && panicstr == NULL) \
+ if (!kdb_active && !KERNEL_PANICKED()) \
mtx_assert(&Giant, MA_OWNED); \
} while (0)
#else
#define KMI_LOCK_ASSERT() (void)0
#define KMI_CTX_LOCK_ASSERT() (void)0
#endif
#define KMICR 0x00
#define KMICR_TYPE_NONPS2 (1 << 5)
#define KMICR_RXINTREN (1 << 4)
#define KMICR_TXINTREN (1 << 3)
#define KMICR_EN (1 << 2)
#define KMICR_FKMID (1 << 1)
#define KMICR_FKMIC (1 << 0)
#define KMISTAT 0x04
#define KMISTAT_TXEMPTY (1 << 6)
#define KMISTAT_TXBUSY (1 << 5)
#define KMISTAT_RXFULL (1 << 4)
#define KMISTAT_RXBUSY (1 << 3)
#define KMISTAT_RXPARITY (1 << 2)
#define KMISTAT_KMIC (1 << 1)
#define KMISTAT_KMID (1 << 0)
#define KMIDATA 0x08
#define KMICLKDIV 0x0C
#define KMIIR 0x10
#define KMIIR_TXINTR (1 << 1)
#define KMIIR_RXINTR (1 << 0)
#define KMI_DRIVER_NAME "kmi"
#define KMI_NFKEY (sizeof(fkey_tab)/sizeof(fkey_tab[0])) /* units */
#define SET_SCANCODE_SET 0xf0
/* Per-device state of the PL050 KMI keyboard driver. */
struct kmi_softc {
device_t sc_dev;
/* generic keyboard instance handed to kbd_register() */
keyboard_t sc_kbd;
/* private copies of the translation tables given to kbd_set_maps() */
keymap_t sc_keymap;
accentmap_t sc_accmap;
fkeytab_t sc_fkeymap[KMI_NFKEY];
/* register window, interrupt line and interrupt handler cookie */
struct resource* sc_mem_res;
struct resource* sc_irq_res;
void* sc_intr_hl;
int sc_mode; /* input mode (K_XLATE,K_RAW,K_CODE) */
int sc_state; /* shift/lock key state */
int sc_accents; /* accent key index (> 0) */
uint32_t sc_flags; /* flags */
#define KMI_FLAG_COMPOSE 0x00000001
#define KMI_FLAG_POLLING 0x00000002
/* thread that last switched polling mode on */
struct thread *sc_poll_thread;
};
/* Read/write helpers for the PL050 KMI registers */
#define pl050_kmi_read_4(sc, reg) \
bus_read_4((sc)->sc_mem_res, (reg))
#define pl050_kmi_write_4(sc, reg, val) \
bus_write_4((sc)->sc_mem_res, (reg), (val))
/* prototypes */
static void kmi_set_leds(struct kmi_softc *, uint8_t);
static int kmi_set_typematic(keyboard_t *, int);
static uint32_t kmi_read_char(keyboard_t *, int);
static void kmi_clear_state(keyboard_t *);
static int kmi_ioctl(keyboard_t *, u_long, caddr_t);
static int kmi_enable(keyboard_t *);
static int kmi_disable(keyboard_t *);
static int kmi_attached = 0;
/* Early keyboard configuration hook; not supported, returns 0. */
static int
kmi_configure(int flags)
{
	(void)flags;

	return (0);
}
/* Keyboard detection hook; unused by this driver, always fails. */
static int
kmi_probe(int unit, void *arg, int flags)
{
	(void)unit;
	(void)arg;
	(void)flags;

	return (ENXIO);
}
/* Reset/initialize hook; unused by this driver, always fails. */
static int
kmi_init(int unit, keyboard_t **kbdp, void *arg, int flags)
{
	(void)unit;
	(void)kbdp;
	(void)arg;
	(void)flags;

	return (ENXIO);
}
/* Interface self-test hook; unused, always reports success. */
static int
kmi_test_if(keyboard_t *kbd)
{
	(void)kbd;

	return (0);
}
/* Termination hook; unused by this driver, always fails. */
static int
kmi_term(keyboard_t *kbd)
{
	(void)kbd;

	return (ENXIO);
}
/* Generic keyboard interrupt hook; unused, does nothing. */
static int
kmi_intr(keyboard_t *kbd, void *arg)
{
	(void)kbd;
	(void)arg;

	return (0);
}
/* Keyboard access lock hook; unused, always returns 1. */
static int
kmi_lock(keyboard_t *kbd, int lock)
{
	(void)kbd;
	(void)lock;

	return (1);
}
/*
 * Activate the keyboard.  Until this has been called the read paths
 * treat the keyboard as inactive and report no input.
 */
static int
kmi_enable(keyboard_t *kbd)
{

	KMI_LOCK();
	KBD_ACTIVATE(kbd);
	KMI_UNLOCK();

	return (0);
}
/* Deactivate the keyboard; the read paths then report no input. */
static int
kmi_disable(keyboard_t *kbd)
{

	KMI_LOCK();
	KBD_DEACTIVATE(kbd);
	KMI_UNLOCK();

	return (0);
}
/*
 * Tell whether received data is waiting: returns the RX interrupt bit of
 * the KMIIR register, or 0 for an inactive keyboard.
 */
static int
kmi_check(keyboard_t *kbd)
{
	struct kmi_softc *softc = kbd->kb_data;

	KMI_CTX_LOCK_ASSERT();

	if (!KBD_IS_ACTIVE(kbd))
		return (0);

	return (pl050_kmi_read_4(softc, KMIIR) & KMIIR_RXINTR);
}
/* Lock-held variant of the "is a character waiting" query. */
static int
kmi_check_char_locked(keyboard_t *kbd)
{
	KMI_CTX_LOCK_ASSERT();

	if (KBD_IS_ACTIVE(kbd))
		return (kmi_check(kbd));
	return (0);
}
/* Takes the driver lock and asks whether a character is waiting. */
static int
kmi_check_char(keyboard_t *kbd)
{
	int pending;

	KMI_LOCK();
	pending = kmi_check_char_locked(kbd);
	KMI_UNLOCK();

	return (pending);
}
/*
 * Raw single-byte read; currently unused and not implemented.
 * Returns -1 for an inactive keyboard; otherwise bumps kb_count, logs a
 * reminder and returns 0.
 */
static int
kmi_read(keyboard_t *kbd, int wait)
{
	(void)wait;

	KMI_CTX_LOCK_ASSERT();

	if (!KBD_IS_ACTIVE(kbd))
		return (-1);

	kbd->kb_count++;
	printf("Implement ME: %s\n", __func__);
	return (0);
}
/*
 * Read one value from the KMI data register if the RX interrupt bit is
 * set (lock held by the caller).  Returns NOKEY when the keyboard is
 * inactive or nothing has been received.
 */
static uint32_t
kmi_read_char_locked(keyboard_t *kbd, int wait)
{
	struct kmi_softc *softc = kbd->kb_data;

	(void)wait;

	KMI_CTX_LOCK_ASSERT();

	if (!KBD_IS_ACTIVE(kbd))
		return (NOKEY);

	if ((pl050_kmi_read_4(softc, KMIIR) & KMIIR_RXINTR) == 0) {
		/*
		 * NOTE(review): kb_count is bumped only when nothing was
		 * read; presumably it was meant to count delivered input --
		 * confirm against the keyboard_t definition.
		 */
		++kbd->kb_count;
		return (NOKEY);
	}

	return (pl050_kmi_read_4(softc, KMIDATA));
}
/*
 * Locking wrapper around kmi_read_char_locked().
 * Currently wait is always false.
 */
static uint32_t
kmi_read_char(keyboard_t *kbd, int wait)
{
	uint32_t event;

	KMI_LOCK();
	event = kmi_read_char_locked(kbd, wait);
	KMI_UNLOCK();

	return (event);
}
/* some useful control functions */
/*
 * Handle a keyboard ioctl; the lock is asserted via KMI_LOCK_ASSERT().
 *
 * 'arg' is interpreted as a pointer to int (two ints for KDSETREPEAT).
 * When the COMPAT_* options are defined, the legacy _IO('K', n) commands
 * carry their integer in the argument itself: it is copied into 'ival',
 * 'arg' is redirected to it, and control falls into the current handler.
 *
 * Returns 0 on success, EINVAL for a rejected argument, or whatever the
 * delegated handler (kmi_ioctl, kmi_set_typematic, genkbd_commonioctl)
 * returns.
 */
static int
kmi_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg)
{
struct kmi_softc *sc = kbd->kb_data;
int i;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
int ival;
#endif
KMI_LOCK_ASSERT();
switch (cmd) {
case KDGKBMODE: /* get keyboard mode */
*(int *)arg = sc->sc_mode;
break;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 7):
ival = IOCPARM_IVAL(arg);
arg = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBMODE: /* set keyboard mode */
switch (*(int *)arg) {
case K_XLATE:
if (sc->sc_mode != K_XLATE) {
/* make lock key state and LED state match */
sc->sc_state &= ~LOCK_MASK;
sc->sc_state |= KBD_LED_VAL(kbd);
}
/* FALLTHROUGH */
case K_RAW:
case K_CODE:
/* on an actual mode change, reset state unless polling is on */
if (sc->sc_mode != *(int *)arg) {
if ((sc->sc_flags & KMI_FLAG_POLLING) == 0)
kmi_clear_state(kbd);
sc->sc_mode = *(int *)arg;
}
break;
default:
return (EINVAL);
}
break;
case KDGETLED: /* get keyboard LED */
*(int *)arg = KBD_LED_VAL(kbd);
break;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 66):
ival = IOCPARM_IVAL(arg);
arg = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETLED: /* set keyboard LED */
/* NOTE: lock key state in "sc_state" won't be changed */
if (*(int *)arg & ~LOCK_MASK)
return (EINVAL);
i = *(int *)arg;
/* replace CAPS LED with ALTGR LED for ALTGR keyboards */
if (sc->sc_mode == K_XLATE &&
kbd->kb_keymap->n_keys > ALTGR_OFFSET) {
if (i & ALKED)
i |= CLKED;
else
i &= ~CLKED;
}
/* the adjusted value goes to the device, the requested one is stored */
if (KBD_HAS_DEVICE(kbd))
kmi_set_leds(sc, i);
KBD_LED_VAL(kbd) = *(int *)arg;
break;
case KDGKBSTATE: /* get lock key state */
*(int *)arg = sc->sc_state & LOCK_MASK;
break;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 20):
ival = IOCPARM_IVAL(arg);
arg = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBSTATE: /* set lock key state */
if (*(int *)arg & ~LOCK_MASK) {
return (EINVAL);
}
sc->sc_state &= ~LOCK_MASK;
sc->sc_state |= *(int *)arg;
/* set LEDs and quit */
/* goes through kmi_ioctl(), so KMI_LOCK() is taken again on top */
return (kmi_ioctl(kbd, KDSETLED, arg));
case KDSETREPEAT: /* set keyboard repeat rate (new
* interface) */
if (!KBD_HAS_DEVICE(kbd)) {
return (0);
}
if (((int *)arg)[1] < 0) {
return (EINVAL);
}
if (((int *)arg)[0] < 0) {
return (EINVAL);
}
/* arg[0] becomes kb_delay1 (clamped to >= 200), arg[1] kb_delay2 */
if (((int *)arg)[0] < 200) /* fastest possible value */
kbd->kb_delay1 = 200;
else
kbd->kb_delay1 = ((int *)arg)[0];
kbd->kb_delay2 = ((int *)arg)[1];
return (0);
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 67):
ival = IOCPARM_IVAL(arg);
arg = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETRAD: /* set keyboard repeat rate (old
* interface) */
return (kmi_set_typematic(kbd, *(int *)arg));
case PIO_KEYMAP: /* set keyboard translation table */
case OPIO_KEYMAP: /* set keyboard translation table
* (compat) */
case PIO_KEYMAPENT: /* set keyboard translation table
* entry */
case PIO_DEADKEYMAP: /* set accent key translation table */
/* a table is being replaced: forget any pending accent first */
sc->sc_accents = 0;
/* FALLTHROUGH */
default:
return (genkbd_commonioctl(kbd, cmd, arg));
}
return (0);
}
/*
 * Locking front end for kmi_ioctl_locked().
 *
 * Returns EDEADLK for the state/LED ioctls when Giant cannot be safely
 * acquired, otherwise the result of kmi_ioctl_locked().
 */
static int
kmi_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg)
{
	int error;

	/*
	 * XXX KDGKBSTATE, KDSKBSTATE and KDSETLED can be called from any
	 * context where printf(9) can be called, which among other things
	 * includes interrupt filters and threads with any kinds of locks
	 * already held.  For this reason it would be dangerous to acquire
	 * the Giant here unconditionally.  On the other hand we have to
	 * have it to handle the ioctl.
	 * So we make our best effort to auto-detect whether we can grab
	 * the Giant or not.  Blame syscons(4) for this.
	 */
	if (cmd == KDGKBSTATE || cmd == KDSKBSTATE || cmd == KDSETLED) {
		if (!mtx_owned(&Giant) && !SCHEDULER_STOPPED())
			return (EDEADLK);	/* best I could come up with */
	}

	KMI_LOCK();
	error = kmi_ioctl_locked(kbd, cmd, arg);
	KMI_UNLOCK();

	return (error);
}
/*
 * Reset the translation state of the keyboard: drops the compose and
 * polling flags and any pending accent, keeping only the lock-key bits.
 */
static void
kmi_clear_state(keyboard_t *kbd)
{
	struct kmi_softc *softc = kbd->kb_data;

	KMI_CTX_LOCK_ASSERT();

	softc->sc_accents = 0;
	softc->sc_state &= LOCK_MASK;	/* preserve locking key state */
	softc->sc_flags &= ~KMI_FLAG_COMPOSE;
	softc->sc_flags &= ~KMI_FLAG_POLLING;
}
/*
 * State-save hook; unused.  Returns 1 for a zero-length request and
 * -1 for anything else.
 */
static int
kmi_get_state(keyboard_t *kbd, void *buf, size_t len)
{
	(void)kbd;
	(void)buf;

	if (len == 0)
		return (1);
	return (-1);
}
/* State-restore hook; unused, always rejected with EINVAL. */
static int
kmi_set_state(keyboard_t *kbd, void *buf, size_t len)
{
	(void)kbd;
	(void)buf;
	(void)len;

	return (EINVAL);
}
/*
 * Switch polling mode on or off.  Turning it on also records the
 * calling thread in the softc.
 */
static int
kmi_poll(keyboard_t *kbd, int on)
{
	struct kmi_softc *softc = kbd->kb_data;

	KMI_LOCK();
	if (!on) {
		softc->sc_flags &= ~KMI_FLAG_POLLING;
	} else {
		softc->sc_flags |= KMI_FLAG_POLLING;
		softc->sc_poll_thread = curthread;
	}
	KMI_UNLOCK();

	return (0);
}
/* local functions */
/*
 * Update the keyboard LEDs.  Not implemented yet: the request is only
 * announced on the console and 'leds' is ignored.
 */
static void
kmi_set_leds(struct kmi_softc *sc, uint8_t leds)
{
	(void)sc;
	(void)leds;

	KMI_LOCK_ASSERT();

	printf("Implement me: %s\n", __func__);
}
/*
 * Decode a 7-bit typematic code: bits 5-6 select kb_delay1 from the
 * delay table, bits 0-4 select kb_delay2 from the rate table.
 * Returns EINVAL when any higher bit is set, 0 otherwise.
 */
static int
kmi_set_typematic(keyboard_t *kbd, int code)
{
	static const int delay_tab[] = {250, 500, 750, 1000};
	static const int rate_tab[] = {34, 38, 42, 46, 50, 55, 59, 63,
	    68, 76, 84, 92, 100, 110, 118, 126,
	    136, 152, 168, 184, 200, 220, 236, 252,
	    272, 304, 336, 368, 400, 440, 472, 504};
	int delay_idx;
	int rate_idx;

	if ((code & ~0x7f) != 0)
		return (EINVAL);

	delay_idx = (code >> 5) & 3;
	rate_idx = code & 0x1f;

	kbd->kb_delay1 = delay_tab[delay_idx];
	kbd->kb_delay2 = rate_tab[rate_idx];
	return (0);
}
/* Method table through which the generic keyboard layer calls this driver. */
static keyboard_switch_t kmisw = {
	/* lifecycle hooks (stubs in this driver) */
	.probe = kmi_probe,
	.init = kmi_init,
	.term = kmi_term,
	.intr = kmi_intr,
	.test_if = kmi_test_if,
	/* activation */
	.enable = kmi_enable,
	.disable = kmi_disable,
	/* input */
	.read = kmi_read,
	.check = kmi_check,
	.read_char = kmi_read_char,
	.check_char = kmi_check_char,
	/* control and state */
	.ioctl = kmi_ioctl,
	.lock = kmi_lock,
	.clear_state = kmi_clear_state,
	.get_state = kmi_get_state,
	.set_state = kmi_set_state,
	.poll = kmi_poll,
};
KEYBOARD_DRIVER(kmi, kmisw, kmi_configure);
/*
 * KMI interrupt handler ('arg' is the softc).  Ignored in polling mode.
 * When a consumer is attached it is notified of new input; otherwise the
 * received data is read and thrown away.
 */
static void
pl050_kmi_intr(void *arg)
{
	struct kmi_softc *softc = arg;

	KMI_CTX_LOCK_ASSERT();

	if ((softc->sc_flags & KMI_FLAG_POLLING) != 0)
		return;

	if (!KBD_IS_ACTIVE(&softc->sc_kbd) || !KBD_IS_BUSY(&softc->sc_kbd)) {
		/* read and discard the input, no one is waiting for it */
		while (kmi_read_char_locked(&softc->sc_kbd, 0) != NOKEY)
			continue;
		return;
	}

	/* let the callback function process the input */
	(softc->sc_kbd.kb_callback.kc_func) (&softc->sc_kbd, KBDIO_KEYINPUT,
	    softc->sc_kbd.kb_callback.kc_arg);
}
/*
 * Match the first enabled "arm,pl050" node and refuse any further one.
 */
static int
pl050_kmi_probe(device_t dev)
{

	if (!ofw_bus_status_okay(dev))
		return (ENXIO);

	/*
	 * PL050 is plain PS2 port that pushes bytes to/from computer
	 * VersatilePB has two such ports and QEMU simulates keyboard
	 * connected to port #0 and mouse connected to port #1. This
	 * information can't be obtained from device tree so we just
	 * hardcode this knowledge here. We attach keyboard driver to
	 * port #0 and ignore port #1
	 */
	if (kmi_attached)
		return (ENXIO);

	if (!ofw_bus_is_compatible(dev, "arm,pl050"))
		return (ENXIO);

	device_set_desc(dev, "PL050 Keyboard/Mouse Interface");
	return (BUS_PROBE_DEFAULT);
}
/*
 * Attach the PL050 keyboard: map the registers, hook the interrupt,
 * switch the keyboard to scan code set 1 and register it with the
 * generic keyboard layer.
 *
 * Returns 0 on success and ENXIO on any failure; every resource acquired
 * before the failure is released again.
 */
static int
pl050_kmi_attach(device_t dev)
{
	struct kmi_softc *sc = device_get_softc(dev);
	keyboard_t *kbd;
	int mem_rid;
	int irq_rid;
	int i;

	sc->sc_dev = dev;
	kbd = &sc->sc_kbd;

	mem_rid = 0;
	sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
	    &mem_rid, RF_ACTIVE);
	if (sc->sc_mem_res == NULL) {
		device_printf(dev, "could not allocate memory resource\n");
		return (ENXIO);
	}

	/* Request the IRQ resources */
	irq_rid = 0;
	sc->sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
	    &irq_rid, RF_ACTIVE);
	if (sc->sc_irq_res == NULL) {
		device_printf(dev, "Error: could not allocate irq resources\n");
		goto fail_mem;
	}

	/* Hook the keyboard interrupt handler */
	if (bus_setup_intr(dev, sc->sc_irq_res, INTR_TYPE_CLK,
	    NULL, pl050_kmi_intr, sc,
	    &sc->sc_intr_hl) != 0) {
		device_printf(dev, "Unable to setup the clock irq handler.\n");
		goto fail_irq;
	}

	/* TODO: clock & divisor */
	pl050_kmi_write_4(sc, KMICR, KMICR_EN);

	pl050_kmi_write_4(sc, KMIDATA, SET_SCANCODE_SET);
	/* read out ACK; the value itself is not needed */
	(void)pl050_kmi_read_4(sc, KMIDATA);
	/* Set Scan Code set 1 (XT) */
	pl050_kmi_write_4(sc, KMIDATA, 1);
	/* read out ACK; the value itself is not needed */
	(void)pl050_kmi_read_4(sc, KMIDATA);

	pl050_kmi_write_4(sc, KMICR, KMICR_EN | KMICR_RXINTREN);

	kbd_init_struct(kbd, KMI_DRIVER_NAME, KB_OTHER,
	    device_get_unit(dev), 0, 0, 0);
	kbd->kb_data = (void *)sc;

	sc->sc_keymap = key_map;
	sc->sc_accmap = accent_map;
	for (i = 0; i < KMI_NFKEY; i++) {
		sc->sc_fkeymap[i] = fkey_tab[i];
	}

	kbd_set_maps(kbd, &sc->sc_keymap, &sc->sc_accmap,
	    sc->sc_fkeymap, KMI_NFKEY);

	KBD_FOUND_DEVICE(kbd);
	kmi_clear_state(kbd);
	KBD_PROBE_DONE(kbd);

	KBD_INIT_DONE(kbd);

	if (kbd_register(kbd) < 0) {
		goto fail_intr;
	}
	KBD_CONFIG_DONE(kbd);

#ifdef KBD_INSTALL_CDEV
	if (kbd_attach(kbd)) {
		goto fail_kbd;
	}
#endif

	if (bootverbose) {
		kbdd_diag(kbd, bootverbose);
	}
	kmi_attached = 1;
	return (0);

	/* Unwind in the reverse order of acquisition. */
#ifdef KBD_INSTALL_CDEV
fail_kbd:
	kbd_unregister(kbd);
#endif
fail_intr:
	/* switch the interface off so it stops raising RX interrupts */
	pl050_kmi_write_4(sc, KMICR, 0);
	bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_intr_hl);
fail_irq:
	bus_release_resource(dev, SYS_RES_IRQ, irq_rid, sc->sc_irq_res);
fail_mem:
	bus_release_resource(dev, SYS_RES_MEMORY, mem_rid, sc->sc_mem_res);
	return (ENXIO);
}
/*
 * Newbus glue: the method table wires device_probe/device_attach to the
 * routines above, the driver_t names the driver "kmi" and sizes the softc
 * as struct kmi_softc, and DRIVER_MODULE() registers it on simplebus.
 */
static device_method_t pl050_kmi_methods[] = {
DEVMETHOD(device_probe, pl050_kmi_probe),
DEVMETHOD(device_attach, pl050_kmi_attach),
{ 0, 0 }
};
static driver_t pl050_kmi_driver = {
"kmi",
pl050_kmi_methods,
sizeof(struct kmi_softc),
};
static devclass_t pl050_kmi_devclass;
DRIVER_MODULE(pl050_kmi, simplebus, pl050_kmi_driver, pl050_kmi_devclass, 0, 0);
Index: head/sys/cddl/compat/opensolaris/sys/mutex.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/mutex.h (revision 356654)
+++ head/sys/cddl/compat/opensolaris/sys/mutex.h (revision 356655)
@@ -1,77 +1,77 @@
/*-
* Copyright (c) 2007 Pawel Jakub Dawidek
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _OPENSOLARIS_SYS_MUTEX_H_
#define _OPENSOLARIS_SYS_MUTEX_H_
#ifdef _KERNEL
#include
#include
#include_next
#include
#include
/*
 * Mutex type accepted by mutex_init(); MUTEX_DEFAULT is the only value
 * defined here (mutex_init() additionally accepts 0).
 */
typedef enum {
MUTEX_DEFAULT = 6 /* kernel default mutex */
} kmutex_type_t;
/*
 * Ownership predicates built on mutex_owned().  The '-'/'+' pair below is
 * the diff hunk itself: MUTEX_NOT_HELD() now tests KERNEL_PANICKED()
 * instead of panicstr, so it is also true once the kernel has panicked.
 */
#define MUTEX_HELD(x) (mutex_owned(x))
-#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || panicstr)
+#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || KERNEL_PANICKED())
/* A kmutex_t is a struct sx; the mutex_* macros below map onto sx_* calls. */
typedef struct sx kmutex_t;
/*
 * Flags handed to sx_init_flags() by mutex_init().  SX_NOWITNESS is left
 * out only when OPENSOLARIS_WITNESS is defined.
 */
#ifndef OPENSOLARIS_WITNESS
#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS)
#else
#define MUTEX_FLAGS (SX_DUPOK | SX_NEW)
#endif
/*
 * Initialize 'lock' through sx_init_flags() with MUTEX_FLAGS.
 *
 * 'type' must be 0 or MUTEX_DEFAULT (asserted); 'desc' and 'arg' are not
 * used by the expansion.  The lock name is derived from the stringified
 * 'lock' expression: leading characters are skipped up to the first
 * lowercase ASCII letter, and the whole string is used when it contains
 * none.  The KASSERT trips if the lock object's flags already equal
 * LO_EXPECTED under LO_ALLMASK (reported as "already initialized").
 */
#define mutex_init(lock, desc, type, arg) do { \
const char *_name; \
ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \
KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \
LO_EXPECTED, ("lock %s already initialized", #lock)); \
for (_name = #lock; *_name != '\0'; _name++) { \
if (*_name >= 'a' && *_name <= 'z') \
break; \
} \
if (*_name == '\0') \
_name = #lock; \
sx_init_flags((lock), _name, MUTEX_FLAGS); \
} while (0)
/*
 * Remaining mutex operations, each a direct alias for the exclusive-lock
 * sx(9) call named on the right.
 */
#define mutex_destroy(lock) sx_destroy(lock)
#define mutex_enter(lock) sx_xlock(lock)
#define mutex_tryenter(lock) sx_try_xlock(lock)
#define mutex_exit(lock) sx_xunlock(lock)
#define mutex_owned(lock) sx_xlocked(lock)
#define mutex_owner(lock) sx_xholder(lock)
#endif /* _KERNEL */
#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 356654)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 356655)
@@ -1,7341 +1,7341 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright 2013 Martin Matuska . All rights reserved.
* Copyright 2014 Xin Li . All rights reserved.
* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome
* Copyright 2017 RackTop Systems.
* Copyright (c) 2019 Datto Inc.
*/
/*
* ZFS ioctls.
*
* This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
* pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
*
* There are two ways that we handle ioctls: the legacy way where almost
* all of the logic is in the ioctl callback, and the new way where most
* of the marshalling is handled in the common entry point, zfsdev_ioctl().
*
* Non-legacy ioctls should be registered by calling
* zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
* from userland by lzc_ioctl().
*
* The registration arguments are as follows:
*
* const char *name
* The name of the ioctl. This is used for history logging. If the
* ioctl returns successfully (the callback returns 0), and allow_log
* is true, then a history log entry will be recorded with the input &
* output nvlists. The log entry can be printed with "zpool history -i".
*
* zfs_ioc_t ioc
* The ioctl request number, which userland will pass to ioctl(2).
* The ioctl numbers can change from release to release, because
* the caller (libzfs) must be matched to the kernel.
*
* zfs_secpolicy_func_t *secpolicy
* This function will be called before the zfs_ioc_func_t, to
* determine if this operation is permitted. It should return EPERM
* on failure, and 0 on success. Checks include determining if the
* dataset is visible in this zone, and if the user has either all
* zfs privileges in the zone (SYS_MOUNT), or has been granted permission
* to do this operation on this dataset with "zfs allow".
*
* zfs_ioc_namecheck_t namecheck
* This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
* name, a dataset name, or nothing. If the name is not well-formed,
* the ioctl will fail and the callback will not be called.
* Therefore, the callback can assume that the name is well-formed
* (e.g. is null-terminated, doesn't have more than one '@' character,
* doesn't have invalid characters).
*
* zfs_ioc_poolcheck_t pool_check
* This specifies requirements on the pool state. If the pool does
* not meet them (is suspended or is readonly), the ioctl will fail
* and the callback will not be called. If any checks are specified
* (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
* Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
* POOL_CHECK_READONLY).
*
* boolean_t smush_outnvlist
* If smush_outnvlist is true, then the output is presumed to be a
* list of errors, and it will be "smushed" down to fit into the
* caller's buffer, by removing some entries and replacing them with a
* single "N_MORE_ERRORS" entry indicating how many were removed. See
* nvlist_smush() for details. If smush_outnvlist is false, and the
* outnvlist does not fit into the userland-provided buffer, then the
* ioctl will fail with ENOMEM.
*
* zfs_ioc_func_t *func
* The callback function that will perform the operation.
*
* The callback should return 0 on success, or an error number on
* failure. If the function fails, the userland ioctl will return -1,
* and errno will be set to the callback's return value. The callback
* will be called with the following arguments:
*
* const char *name
* The name of the pool or dataset to operate on, from
* zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
* expected type (pool, dataset, or none).
*
* nvlist_t *innvl
* The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
* NULL if no input nvlist was provided. Changes to this nvlist are
* ignored. If the input nvlist could not be deserialized, the
* ioctl will fail and the callback will not be called.
*
* nvlist_t *outnvl
* The output nvlist, initially empty. The callback can fill it in,
* and it will be returned to userland by serializing it into
* zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
* fails (e.g. because the caller didn't supply a large enough
* buffer), then the overall ioctl will fail. See the
* 'smush_outnvlist' argument above for additional behaviors.
*
* There are two typical uses of the output nvlist:
* - To return state, e.g. property values. In this case,
* smush_outnvlist should be false. If the buffer was not large
* enough, the caller will reallocate a larger buffer and try
* the ioctl again.
*
* - To return multiple errors from an ioctl which makes on-disk
* changes. In this case, smush_outnvlist should be true.
* Ioctls which make on-disk modifications should generally not
* use the outnvl if they succeed, because the caller can not
* distinguish between the operation failing, and
* deserialization failing.
*/
#ifdef __FreeBSD__
#include "opt_kstack_pages.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "zfs_deleg.h"
#include "zfs_comutil.h"
#include "zfs_ioctl_compat.h"
#include "lua.h"
#include "lauxlib.h"
/*
 * File-scope state and externals.
 * NOTE(review): zfsdev is presumably the /dev/zfs device node described in
 * the header comment; it is assigned outside this part of the file.
 */
static struct cdev *zfsdev;
extern void zfs_init(void);
extern void zfs_fini(void);
/* Key/identifier variables; zfs_allow_log_key is read with tsd_get() below. */
uint_t zfs_fsyncer_key;
extern uint_t rrw_tsd_key;
static uint_t zfs_allow_log_key;
extern uint_t zfs_geom_probe_vdev_key;
/*
 * Callback signatures used by the ioctl table:
 *  - legacy handler: receives only the zfs_cmd_t;
 *  - new-style handler: (name, innvl, outnvl) as described in the header
 *    comment of this file;
 *  - security policy: (zc, innvl, cr), returns 0 when permitted.
 */
typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
/*
 * What an ioctl expects to find in zfs_cmd_t:zc_name (see the 'namecheck'
 * registration argument in the header comment): nothing, a pool name or a
 * dataset name.
 * NOTE(review): ENTITY_NAME is not described in the header comment;
 * confirm its meaning against the name-check code.
 */
typedef enum {
NO_NAME,
POOL_NAME,
DATASET_NAME,
ENTITY_NAME
} zfs_ioc_namecheck_t;
/*
 * Pool-state requirements for an ioctl (the 'pool_check' registration
 * argument).  The values are distinct bits so checks can be or-ed
 * together, e.g. POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY.
 */
typedef enum {
POOL_CHECK_NONE = 1 << 0,
POOL_CHECK_SUSPENDED = 1 << 1,
POOL_CHECK_READONLY = 1 << 2,
} zfs_ioc_poolcheck_t;
/*
 * One ioctl table entry.  It carries either a legacy or a new-style
 * handler, plus the registration arguments documented in the header
 * comment: security policy, name check, history logging flag, pool check,
 * output smushing flag and the ioctl's name.
 */
typedef struct zfs_ioc_vec {
zfs_ioc_legacy_func_t *zvec_legacy_func;
zfs_ioc_func_t *zvec_func;
zfs_secpolicy_func_t *zvec_secpolicy;
zfs_ioc_namecheck_t zvec_namecheck;
boolean_t zvec_allow_log;
zfs_ioc_poolcheck_t zvec_pool_check;
boolean_t zvec_smush_outnvlist;
const char *zvec_name;
} zfs_ioc_vec_t;
/*
 * Delegated-permission names for the user/group space properties.
 * This array is indexed by zfs_userquota_prop_t, so the entry order must
 * stay in step with that enum.
 */
static const char *userquota_perms[] = {
ZFS_DELEG_PERM_USERUSED,
ZFS_DELEG_PERM_USERQUOTA,
ZFS_DELEG_PERM_GROUPUSED,
ZFS_DELEG_PERM_GROUPQUOTA,
};
/*
 * Forward declarations.  The definitions are not in this part of the
 * file; presumably they follow further down.
 */
static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
static int zfs_check_settable(const char *name, nvpair_t *property,
cred_t *cr);
static int zfs_check_clearable(char *dataset, nvlist_t *props,
nvlist_t **errors);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *);
int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
static void zfsdev_close(void *data);
static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
/*
 * Format a debug message into a 512-byte stack buffer and fire the
 * zfs-dprintf DTrace probe with (file basename, function, line, message).
 * Output longer than the buffer is truncated by vsnprintf().
 */
void
__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
{
	/*
	 * Get rid of annoying "../common/" prefix to filename: keep only
	 * what follows the last '/', or the whole name if there is none.
	 */
	const char *slash = strrchr(file, '/');
	const char *newfile = (slash == NULL) ? file : slash + 1;
	char buf[512];
	va_list ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	/*
	 * To get this data, use the zfs-dprintf probe as so:
	 * dtrace -q -n 'zfs-dprintf \
	 *	/stringof(arg0) == "dbuf.c"/ \
	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
	 * arg0 = file name
	 * arg1 = function name
	 * arg2 = line number
	 * arg3 = message
	 */
	DTRACE_PROBE4(zfs__dprintf,
	    char *, newfile, char *, func, int, line, char *, buf);
}
/*
 * Release a history buffer.  The size passed to kmem_free() is always
 * HIS_MAX_RECORD_LEN, matching the allocation in history_str_get().
 */
static void
history_str_free(char *buf)
{
kmem_free(buf, HIS_MAX_RECORD_LEN);
}
/*
 * Copy the history string referenced by zc->zc_history into a freshly
 * allocated HIS_MAX_RECORD_LEN buffer.
 *
 * Returns NULL when zc_history is 0 or when copyinstr() fails; otherwise
 * returns the NUL-terminated buffer, which the caller must release with
 * history_str_free().
 */
static char *
history_str_get(zfs_cmd_t *zc)
{
	char *buf;

	if (zc->zc_history == 0)
		return (NULL);

	buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
	if (copyinstr((void *)(uintptr_t)zc->zc_history,
	    buf, HIS_MAX_RECORD_LEN, NULL) == 0) {
		/* force termination regardless of what was copied in */
		buf[HIS_MAX_RECORD_LEN -1] = '\0';
		return (buf);
	}

	history_str_free(buf);
	return (NULL);
}
/*
 * Report whether the named dataset is the pool's bootfs, i.e. whether its
 * objset id equals spa_bootfs() of the owning pool.  Returns B_FALSE when
 * the objset cannot be held.
 */
static boolean_t
zfs_is_bootfs(const char *name)
{
	objset_t *os;
	boolean_t ret;

	if (dmu_objset_hold(name, FTAG, &os) != 0)
		return (B_FALSE);

	ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
	dmu_objset_rele(os, FTAG);
	return (ret);
}
/*
 * Return 1 if the pool's SPA version is lower than 'version', else 0.
 * A pool that cannot be opened also yields 0.
 */
static int
zfs_earlier_version(const char *name, int version)
{
	spa_t *spa;
	int older;

	if (spa_open(name, &spa, FTAG) != 0)
		return (0);

	older = (spa_version(spa) < version) ? 1 : 0;
	spa_close(spa, FTAG);
	return (older);
}
/*
 * Return B_TRUE if the dataset's ZPL version is lower than 'version'.
 *
 * B_TRUE is also the answer whenever the version cannot be determined:
 * the objset cannot be held, it is not a DMU_OST_ZFS objset, or the
 * version property cannot be read.
 */
static boolean_t
zpl_earlier_version(const char *name, int version)
{
	objset_t *os;
	uint64_t zplversion;
	boolean_t rc = B_TRUE;

	if (dmu_objset_hold(name, FTAG, &os) != 0)
		return (rc);

	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		/* XXX reading from non-owned objset */
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
			rc = zplversion < version;
	}

	dmu_objset_rele(os, FTAG);
	return (rc);
}
/*
 * Append the command's history string to the pool named in zc->zc_name.
 * Nothing happens when there is no history string or the pool cannot be
 * opened, and nothing is logged for pools older than
 * SPA_VERSION_ZPOOL_HISTORY.
 */
static void
zfs_log_history(zfs_cmd_t *zc)
{
	spa_t *spa;
	char *buf = history_str_get(zc);

	if (buf == NULL)
		return;

	if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
			(void) spa_history_log(spa, buf);
		spa_close(spa, FTAG);
	}

	history_str_free(buf);
}
/*
 * Policy for top-level read operations (list pools). Requires no privileges,
 * and can be used in the local zone, as there is no associated dataset.
 *
 * Always returns 0; none of the arguments are examined.
 */
/* ARGSUSED */
static int
zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (0);
}
/*
 * Policy for dataset read operations (list children, get statistics).
 * No privilege is required, but outside the global zone the dataset must
 * be visible to the caller's zone; otherwise ENOENT is returned.
 */
/* ARGSUSED */
static int
zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	if (!INGLOBALZONE(curthread) &&
	    !zone_dataset_visible(zc->zc_name, NULL))
		return (SET_ERROR(ENOENT));

	return (0);
}
/*
 * Zone check shared by zfs_dozonecheck() and zfs_dozonecheck_ds().
 *
 * Global zone: a zoned dataset is refused with EPERM when secpolicy_zfs()
 * returns non-zero for the caller.
 * Local zone: the dataset must be visible (else ENOENT), zoned and
 * writable by the zone (else EPERM).
 * Returns 0 when access is allowed.
 */
static int
zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
{
	int writable = 1;

	if (INGLOBALZONE(curthread)) {
		/*
		 * If the fs is zoned, only root can access it from the
		 * global zone.
		 */
		if (secpolicy_zfs(cr) && zoned)
			return (SET_ERROR(EPERM));
		return (0);
	}

	/*
	 * The dataset must be visible by this zone -- check this first
	 * so they don't see EPERM on something they shouldn't know about.
	 */
	if (!zone_dataset_visible(dataset, &writable))
		return (SET_ERROR(ENOENT));

	/*
	 * In a local zone the 'zoned' property must be set and the
	 * dataset must be writable by this zone.
	 */
	if (!zoned || !writable)
		return (SET_ERROR(EPERM));

	return (0);
}
/*
 * Zone check by dataset name: read the "jailed" property and hand it to
 * zfs_dozonecheck_impl().  Returns ENOENT if the property cannot be read.
 */
static int
zfs_dozonecheck(const char *dataset, cred_t *cr)
{
	uint64_t zoned;
	int lookup_err;

	lookup_err = dsl_prop_get_integer(dataset, "jailed", &zoned, NULL);
	if (lookup_err)
		return (SET_ERROR(ENOENT));

	return (zfs_dozonecheck_impl(dataset, zoned, cr));
}
/*
 * Zone check for an already-held dataset: read the "jailed" property from
 * 'ds' and hand it to zfs_dozonecheck_impl().  Returns ENOENT if the
 * property cannot be read.
 */
static int
zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
{
	uint64_t zoned;
	int lookup_err;

	lookup_err = dsl_prop_get_int_ds(ds, "jailed", &zoned);
	if (lookup_err)
		return (SET_ERROR(ENOENT));

	return (zfs_dozonecheck_impl(dataset, zoned, cr));
}
/*
 * Write-permission check on a held dataset.
 *
 * The zone check must pass first.  After that the caller is allowed if
 * secpolicy_zfs() returns 0; otherwise the result of the delegated
 * permission lookup for 'perm' is returned.
 */
static int
zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
    const char *perm, cred_t *cr)
{
	int error = zfs_dozonecheck_ds(name, ds, cr);

	if (error != 0)
		return (error);

	if (secpolicy_zfs(cr) == 0)
		return (0);

	return (dsl_deleg_access_impl(ds, perm, cr));
}
/*
 * Write-permission check by dataset name.
 *
 * Holds the pool and the dataset, delegates to
 * zfs_secpolicy_write_perms_ds() and releases both holds again.  Errors
 * from either hold are returned unchanged.
 */
static int
zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	/*
	 * First do a quick check for root in the global zone, which
	 * is allowed to do all write_perms. This ensures that zfs_ioc_*
	 * will get to handle nonexistent datasets.
	 */
	if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
		return (0);

	if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0)
		return (error);

	error = dsl_dataset_hold(dp, name, FTAG, &ds);
	if (error == 0) {
		error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
		dsl_dataset_rele(ds, FTAG);
	}

	dsl_pool_rele(dp, FTAG);
	return (error);
}
#ifdef SECLABEL
/*
* Policy for setting the security label property.
*
* Returns 0 for success, non-zero for access and other errors.
*/
static int
zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
{
char ds_hexsl[MAXNAMELEN];
bslabel_t ds_sl, new_sl;
boolean_t new_default = FALSE;
uint64_t zoned;
int needed_priv = -1;
int error;
/* First get the existing dataset label. */
error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1, sizeof (ds_hexsl), &ds_hexsl, NULL);
if (error != 0)
return (SET_ERROR(EPERM));
if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
new_default = TRUE;
/* The label must be translatable */
if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
return (SET_ERROR(EINVAL));
/*
* In a non-global zone, disallow attempts to set a label that
* doesn't match that of the zone; otherwise no other checks
* are needed.
*/
if (!INGLOBALZONE(curproc)) {
if (new_default || !blequal(&new_sl, CR_SL(CRED())))
return (SET_ERROR(EPERM));
return (0);
}
/*
* For global-zone datasets (i.e., those whose zoned property is
* "off", verify that the specified new label is valid for the
* global zone.
*/
if (dsl_prop_get_integer(name,
zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
return (SET_ERROR(EPERM));
if (!zoned) {
if (zfs_check_global_label(name, strval) != 0)
return (SET_ERROR(EPERM));
}
/*
* If the existing dataset label is nondefault, check if the
* dataset is mounted (label cannot be changed while mounted).
* Get the zfsvfs; if there isn't one, then the dataset isn't
* mounted (or isn't a dataset, doesn't exist, ...).
*/
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
objset_t *os;
static char *setsl_tag = "setsl_tag";
/*
* Try to own the dataset; abort if there is any error,
* (e.g., already mounted, in use, or other error).
*/
error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
setsl_tag, &os);
if (error != 0)
return (SET_ERROR(EPERM));
dmu_objset_disown(os, setsl_tag);
if (new_default) {
needed_priv = PRIV_FILE_DOWNGRADE_SL;
goto out_check;
}
if (hexstr_to_label(strval, &new_sl) != 0)
return (SET_ERROR(EPERM));
if (blstrictdom(&ds_sl, &new_sl))
needed_priv = PRIV_FILE_DOWNGRADE_SL;
else if (blstrictdom(&new_sl, &ds_sl))
needed_priv = PRIV_FILE_UPGRADE_SL;
} else {
/* dataset currently has a default label */
if (!new_default)
needed_priv = PRIV_FILE_UPGRADE_SL;
}
out_check:
if (needed_priv != -1)
return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
return (0);
}
#endif /* SECLABEL */
/*
 * Permission check for setting property 'prop' on dataset 'dsname'.
 *
 * A few properties get extra screening first ('zoned', the quota/limit
 * family and 'mlslabel'); everything that passes then falls through to
 * the ordinary delegated write-permission check for the property's name.
 */
static int
zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
    cred_t *cr)
{
	char *strval;

	/*
	 * Check permissions for special properties.
	 */
	switch (prop) {
	case ZFS_PROP_ZONED:
		/*
		 * Disallow setting of 'zoned' from within a local zone.
		 */
		if (!INGLOBALZONE(curthread))
			return (SET_ERROR(EPERM));
		break;

	case ZFS_PROP_QUOTA:
	case ZFS_PROP_FILESYSTEM_LIMIT:
	case ZFS_PROP_SNAPSHOT_LIMIT:
		if (!INGLOBALZONE(curthread)) {
			uint64_t zoned;
			char setpoint[ZFS_MAX_DATASET_NAME_LEN];

			/*
			 * Unprivileged users are allowed to modify the
			 * limit on things *under* (ie. contained by)
			 * the thing they own.
			 */
			if (dsl_prop_get_integer(dsname, "jailed", &zoned,
			    setpoint) ||
			    !zoned || strlen(dsname) <= strlen(setpoint))
				return (SET_ERROR(EPERM));
		}
		break;

	case ZFS_PROP_MLSLABEL:
#ifdef SECLABEL
		if (!is_system_labeled())
			return (SET_ERROR(EPERM));

		if (nvpair_value_string(propval, &strval) == 0) {
			int err = zfs_set_slabel_policy(dsname, strval,
			    CRED());

			if (err != 0)
				return (err);
		}
		break;
#else
		return (EOPNOTSUPP);
#endif

	default:
		break;
	}

	return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
}
/*
 * Policy for setting delegated permissions on a dataset.  Only the zone
 * check is performed here and its result is returned as-is; permission
 * to set permissions will be evaluated later in dsl_deleg_can_allow().
 */
/* ARGSUSED */
static int
zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	return (zfs_dozonecheck(zc->zc_name, cr));
}
/*
 * Policy for rollback: requires the delegated 'rollback' write permission
 * (ZFS_DELEG_PERM_ROLLBACK) on the dataset named in zc->zc_name.
 */
/* ARGSUSED */
static int
zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_ROLLBACK, cr));
}
/*
 * Policy for the legacy send ioctl.
 *
 * zc->zc_name must contain '@' (else EINVAL).  The dataset is then looked
 * up by object id (zc->zc_sendobj), zc->zc_name is overwritten with that
 * dataset's current name, and the ZFS_DELEG_PERM_SEND check runs against
 * it.
 */
/* ARGSUSED */
static int
zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	/*
	 * Generate the current snapshot name from the given objsetid, then
	 * use that name for the secpolicy/zone checks.
	 */
	if (strchr(zc->zc_name, '@') == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = dsl_pool_hold(zc->zc_name, FTAG, &dp)) != 0)
		return (error);

	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
	if (error == 0) {
		dsl_dataset_name(ds, zc->zc_name);
		error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
		    ZFS_DELEG_PERM_SEND, cr);
		dsl_dataset_rele(ds, FTAG);
	}

	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * Policy for the new-style send ioctl: requires the delegated 'send'
 * write permission (ZFS_DELEG_PERM_SEND) on zc->zc_name.
 */
/* ARGSUSED */
static int
zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_SEND, cr));
}
/*
 * Delegated 'share' check.
 *
 * zc->zc_value is looked up as a path; the vnode found must live on a
 * "zfs" mount whose resource name equals zc->zc_name, otherwise EPERM.
 * On a match the delegated ZFS_DELEG_PERM_SHARE lookup decides.
 */
/* ARGSUSED */
static int
zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	vnode_t *vp;
	int error;
	int mismatch;

	error = lookupname(zc->zc_value, UIO_SYSSPACE,
	    NO_FOLLOW, NULL, &vp);
	if (error != 0)
		return (error);

	/* Now make sure mntpnt and dataset are ZFS */
	mismatch = (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
	    zc->zc_name) != 0));

	/* the vnode is not needed past this point on either path */
	VN_RELE(vp);

	if (mismatch)
		return (SET_ERROR(EPERM));

	return (dsl_deleg_access(zc->zc_name,
	    ZFS_DELEG_PERM_SHARE, cr));
}
/*
 * Policy for sharing a dataset.  Refused with EPERM outside the global
 * zone.  Inside it, the caller passes if secpolicy_nfs() returns 0;
 * otherwise the delegated 'share' check decides.
 */
int
zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	if (!INGLOBALZONE(curthread))
		return (SET_ERROR(EPERM));

	if (secpolicy_nfs(cr) != 0)
		return (zfs_secpolicy_deleg_share(zc, innvl, cr));

	return (0);
}
/*
 * Policy for SMB ACL operations.  Refused with EPERM outside the global
 * zone.  Inside it, the caller passes if secpolicy_smb() returns 0;
 * otherwise the delegated 'share' check decides.
 */
int
zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	if (!INGLOBALZONE(curthread))
		return (SET_ERROR(EPERM));

	if (secpolicy_smb(cr) != 0)
		return (zfs_secpolicy_deleg_share(zc, innvl, cr));

	return (0);
}
/*
 * Derive the parent of 'datasetname' into 'parent'.
 *
 * The name is copied into 'parent' (a buffer of 'parentsize' bytes) and
 * cut at the last '@' if there is one, otherwise at the last '/'.
 *
 * Returns 0 on success, ENOENT if the name contains neither separator,
 * and EINVAL if 'parentsize' is not positive.  A name that does not fit
 * is truncated to parentsize - 1 characters before the search.
 */
static int
zfs_get_parent(const char *datasetname, char *parent, int parentsize)
{
	char *cp;

	if (parentsize <= 0)
		return (SET_ERROR(EINVAL));

	/*
	 * strncpy() does not terminate the destination when the source
	 * fills it, so copy one byte less and terminate explicitly;
	 * otherwise the strrchr() calls below could run off the buffer.
	 */
	(void) strncpy(parent, datasetname, parentsize - 1);
	parent[parentsize - 1] = '\0';

	/*
	 * Remove the @bla or /bla from the end of the name to get the parent.
	 */
	cp = strrchr(parent, '@');
	if (cp != NULL) {
		cp[0] = '\0';
	} else {
		cp = strrchr(parent, '/');
		if (cp == NULL)
			return (SET_ERROR(ENOENT));
		cp[0] = '\0';
	}

	return (0);
}
/*
 * Destroying 'name' requires both the 'mount' and the 'destroy' delegated
 * permissions, checked in that order; the first failure is returned.
 */
int
zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
{
	int error;

	error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr);
	if (error == 0) {
		error = zfs_secpolicy_write_perms(name,
		    ZFS_DELEG_PERM_DESTROY, cr);
	}
	return (error);
}
/*
 * Policy for destroying the dataset named in zc->zc_name; defers to
 * zfs_secpolicy_destroy_perms().
 */
/* ARGSUSED */
static int
zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
}
/*
 * Destroying snapshots with delegated permissions requires
 * descendant mount and destroy permissions.
 *
 * Every entry of the "snaps" nvlist in 'innvl' is checked.  Entries whose
 * check returns ENOENT are removed from the list; any other error stops
 * the walk and is returned.  EINVAL is returned if "snaps" is missing.
 */
/* ARGSUSED */
static int
zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvlist_t *snaps;
	nvpair_t *pair, *nextpair;
	int error = 0;

	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
		return (SET_ERROR(EINVAL));

	pair = nvlist_next_nvpair(snaps, NULL);
	while (pair != NULL) {
		/* fetch the successor first: 'pair' may be removed below */
		nextpair = nvlist_next_nvpair(snaps, pair);

		error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
		if (error == ENOENT) {
			/*
			 * Ignore any snapshots that don't exist (we consider
			 * them "already destroyed").  Remove the name from the
			 * nvl here in case the snapshot is created between
			 * now and when we try to destroy it (in which case
			 * we don't want to destroy it since we haven't
			 * checked for permission).
			 */
			fnvlist_remove_nvpair(snaps, pair);
			error = 0;
		} else if (error != 0) {
			break;
		}

		pair = nextpair;
	}

	return (error);
}
/*
 * Permission check for renaming 'from' to 'to'.
 *
 * Requires 'rename' and 'mount' on the source, and 'create' and 'mount'
 * on the parent of the destination.  The checks run in that order and the
 * first failure (including failure to derive the parent) is returned.
 */
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
	char parentname[ZFS_MAX_DATASET_NAME_LEN];
	int error;

	error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr);
	if (error == 0) {
		error = zfs_secpolicy_write_perms(from,
		    ZFS_DELEG_PERM_MOUNT, cr);
	}
	if (error == 0)
		error = zfs_get_parent(to, parentname, sizeof (parentname));
	if (error == 0) {
		error = zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_CREATE, cr);
	}
	if (error == 0) {
		error = zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_MOUNT, cr);
	}

	return (error);
}
/*
 * Policy for the rename ioctl.
 *
 * Bookmark rename (zc_name contains '#'): requires 'rename' and
 * 'bookmark' on the dataset part of the name.
 * Otherwise the check is zfs_secpolicy_rename_perms() from zc_name to
 * zc_value; when bit 0 of zc_cookie is set (recursive rename) the check
 * is made against the part of zc_name before '@', and EINVAL is returned
 * if there is no '@'.
 *
 * zc_name is modified in place while checking and restored before
 * returning.
 */
/* ARGSUSED */
static int
zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	char *at = NULL;
	char *pound;
	int error;

	if ((pound = strchr(zc->zc_name, '#')) != NULL) {
		/* temporarily cut the name at '#' to get the dataset */
		*pound = '\0';
		error = zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_RENAME, cr);
		if (error == 0) {
			error = zfs_secpolicy_write_perms(zc->zc_name,
			    ZFS_DELEG_PERM_BOOKMARK, cr);
		}
		*pound = '#';
		return (error);
	}

	if ((zc->zc_cookie & 1) != 0) {
		/*
		 * This is recursive rename, so the starting snapshot might
		 * not exist. Check file system or volume permission instead.
		 */
		at = strchr(zc->zc_name, '@');
		if (at == NULL)
			return (SET_ERROR(EINVAL));
		*at = '\0';
	}

	error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);

	if (at != NULL)
		*at = '@';

	return (error);
}
/*
 * Policy for promoting a clone.
 *
 * Requires 'promote' on the clone (checked by name), then, with the clone
 * and its origin held, 'mount' on the clone and 'promote' on the origin.
 * All holds taken here are released before returning.
 */
/* ARGSUSED */
static int
zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	char parentname[ZFS_MAX_DATASET_NAME_LEN];
	dsl_pool_t *dp;
	dsl_dataset_t *clone;
	dsl_dataset_t *origin = NULL;
	dsl_dir_t *dd;
	int error;

	error = zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_PROMOTE, cr);
	if (error != 0)
		return (error);

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
	if (error != 0)
		goto out;

	/* the origin is found through the clone's dsl_dir */
	dd = clone->ds_dir;
	error = dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
	if (error != 0) {
		dsl_dataset_rele(clone, FTAG);
		goto out;
	}

	error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
	    ZFS_DELEG_PERM_MOUNT, cr);

	dsl_dataset_name(origin, parentname);
	if (error == 0) {
		error = zfs_secpolicy_write_perms_ds(parentname, origin,
		    ZFS_DELEG_PERM_PROMOTE, cr);
	}

	dsl_dataset_rele(clone, FTAG);
	dsl_dataset_rele(origin, FTAG);
out:
	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * Policy for receive: requires the 'receive', 'mount' and 'create'
 * delegated permissions on zc->zc_name, checked in that order; the first
 * failure is returned.
 */
/* ARGSUSED */
static int
zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int error;

	error = zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_RECEIVE, cr);
	if (error != 0)
		return (error);

	error = zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_MOUNT, cr);
	if (error != 0)
		return (error);

	return (zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_CREATE, cr));
}
/*
 * Check the delegated 'snapshot' write permission
 * (ZFS_DELEG_PERM_SNAPSHOT) on the dataset 'name'.
 */
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
return (zfs_secpolicy_write_perms(name,
ZFS_DELEG_PERM_SNAPSHOT, cr));
}
/*
 * Check for permission to create each snapshot in the nvlist.
 *
 * 'innvl' must contain a "snaps" nvlist (else EINVAL) whose pair names
 * are "dataset@snap" strings.  Each dataset part is checked with
 * zfs_secpolicy_snapshot_perms(); a name without '@' yields EINVAL.
 * Returns 0 when every entry passes, which includes an empty list.
 */
/* ARGSUSED */
static int
zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvlist_t *snaps;
	/* must start at 0: an empty "snaps" list never enters the loop */
	int error = 0;
	nvpair_t *pair;

	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
		return (SET_ERROR(EINVAL));

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		char *name = nvpair_name(pair);
		char *atp = strchr(name, '@');

		if (atp == NULL) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* temporarily cut the name at '@' to get the dataset */
		*atp = '\0';
		error = zfs_secpolicy_snapshot_perms(name, cr);
		*atp = '@';
		if (error != 0)
			break;
	}

	return (error);
}
/*
 * Check for permission to create each bookmark in the nvlist.
 *
 * Every pair name in 'innvl' must be of the form "dataset#bookmark"
 * (else EINVAL); the 'bookmark' delegated permission is checked on the
 * dataset part.  The walk stops at the first failure.
 */
/* ARGSUSED */
static int
zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);

	while (pair != NULL) {
		char *name = nvpair_name(pair);
		char *hashp = strchr(name, '#');
		int error;

		if (hashp == NULL)
			return (SET_ERROR(EINVAL));

		/* temporarily cut the name at '#' to get the dataset */
		*hashp = '\0';
		error = zfs_secpolicy_write_perms(name,
		    ZFS_DELEG_PERM_BOOKMARK, cr);
		*hashp = '#';

		if (error != 0)
			return (error);

		pair = nvlist_next_nvpair(innvl, pair);
	}

	return (0);
}
/*
 * Policy for remap: requires the delegated 'remap' write permission
 * (ZFS_DELEG_PERM_REMAP) on zc->zc_name.
 */
/* ARGSUSED */
static int
zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_REMAP, cr));
}
/*
 * Policy for destroying bookmarks.
 *
 * Every pair name in 'innvl' must be of the form "dataset#bookmark"
 * (else EINVAL); the 'destroy' delegated permission is checked on the
 * dataset part.  Entries whose check returns ENOENT are removed from
 * 'innvl'; any other error stops the walk and is returned.
 */
/* ARGSUSED */
static int
zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvpair_t *pair, *nextpair;
	int error = 0;

	pair = nvlist_next_nvpair(innvl, NULL);
	while (pair != NULL) {
		char *name = nvpair_name(pair);
		char *hashp = strchr(name, '#');

		/* fetch the successor first: 'pair' may be removed below */
		nextpair = nvlist_next_nvpair(innvl, pair);

		if (hashp == NULL) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* temporarily cut the name at '#' to get the dataset */
		*hashp = '\0';
		error = zfs_secpolicy_write_perms(name,
		    ZFS_DELEG_PERM_DESTROY, cr);
		*hashp = '#';

		if (error == ENOENT) {
			/*
			 * Ignore any filesystems that don't exist (we consider
			 * their bookmarks "already destroyed").  Remove
			 * the name from the nvl here in case the filesystem
			 * is created between now and when we try to destroy
			 * the bookmark (in which case we don't want to
			 * destroy it since we haven't checked for permission).
			 */
			fnvlist_remove_nvpair(innvl, pair);
			error = 0;
		} else if (error != 0) {
			break;
		}

		pair = nextpair;
	}

	return (error);
}
/*
 * Policy for logging to pool history.  Allowed only when the calling
 * thread has a value stored under zfs_allow_log_key; EPERM otherwise.
 */
/* ARGSUSED */
static int
zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	/*
	 * Even root must have a proper TSD so that we know what pool
	 * to log to.
	 */
	if (tsd_get(zfs_allow_log_key) != NULL)
		return (0);

	return (SET_ERROR(EPERM));
}
/*
 * Policy for create/clone: if innvl carries an "origin" string, the
 * caller needs ZFS_DELEG_PERM_CLONE on it; in all cases the caller needs
 * ZFS_DELEG_PERM_CREATE and ZFS_DELEG_PERM_MOUNT on the parent of zc_name.
 */
static int
zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	char parent[ZFS_MAX_DATASET_NAME_LEN];
	char *origin;
	int rc;

	rc = zfs_get_parent(zc->zc_name, parent, sizeof (parent));
	if (rc != 0)
		return (rc);

	if (nvlist_lookup_string(innvl, "origin", &origin) == 0) {
		rc = zfs_secpolicy_write_perms(origin,
		    ZFS_DELEG_PERM_CLONE, cr);
		if (rc != 0)
			return (rc);
	}

	rc = zfs_secpolicy_write_perms(parent, ZFS_DELEG_PERM_CREATE, cr);
	if (rc != 0)
		return (rc);

	return (zfs_secpolicy_write_perms(parent, ZFS_DELEG_PERM_MOUNT, cr));
}
/*
* Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
* SYS_CONFIG privilege, which is not available in a local zone.
*/
/* ARGSUSED */
static int
zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int denied = secpolicy_sys_config(cr, B_FALSE);

	/* Any failure from the privilege check is reported as EPERM. */
	return (denied != 0 ? SET_ERROR(EPERM) : 0);
}
/*
* Policy for object to name lookups.
*/
/* ARGSUSED */
static int
zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	/* The sys_config privilege is sufficient on its own. */
	if (secpolicy_sys_config(cr, B_FALSE) == 0)
		return (0);

	/* Otherwise fall back to the delegated "diff" permission. */
	return (zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_DIFF, cr));
}
/*
* Policy for fault injection. Requires all privileges.
*/
/* ARGSUSED */
static int
zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	/* The decision is delegated entirely to secpolicy_zinject(). */
	int rc = secpolicy_zinject(cr);

	return (rc);
}
/*
 * Policy for inheriting the property named in zc_value on zc_name.
 * Known properties go through zfs_secpolicy_setprop(); user properties
 * need ZFS_DELEG_PERM_USERPROP; anything else is EINVAL.
 */
/* ARGSUSED */
static int
zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);

	if (prop != ZPROP_INVAL)
		return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr));

	if (zfs_prop_user(zc->zc_value)) {
		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_USERPROP, cr));
	}

	return (SET_ERROR(EINVAL));
}
/*
 * Policy for querying a single user/group space entry.  Read permission
 * is required first; a caller asking about its own uid or a group it is
 * a member of is allowed, otherwise the matching userquota_perms[]
 * permission is needed on zc_name.
 */
static int
zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int rc = zfs_secpolicy_read(zc, innvl, cr);

	if (rc != 0)
		return (rc);

	/* zc_objset_type indexes userquota_perms[] below; bound it. */
	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
		return (SET_ERROR(EINVAL));

	if (zc->zc_value[0] == 0) {
		int is_self;

		/*
		 * They are asking about a posix uid/gid.  If it's
		 * themself, allow it.
		 */
		if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
		    zc->zc_objset_type == ZFS_PROP_USERQUOTA)
			is_self = (zc->zc_guid == crgetuid(cr));
		else
			is_self = groupmember(zc->zc_guid, cr);

		if (is_self)
			return (0);
	}

	return (zfs_secpolicy_write_perms(zc->zc_name,
	    userquota_perms[zc->zc_objset_type], cr));
}
/*
 * Policy for listing user/group space entries: read permission plus the
 * userquota_perms[] permission selected by zc_objset_type.
 */
static int
zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int rc = zfs_secpolicy_read(zc, innvl, cr);

	if (rc != 0)
		return (rc);

	/* zc_objset_type is used as an array index; reject out-of-range. */
	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
		return (SET_ERROR(EINVAL));

	return (zfs_secpolicy_write_perms(zc->zc_name,
	    userquota_perms[zc->zc_objset_type], cr));
}
/*
 * Userspace-upgrade policy: same check as setting ZFS_PROP_VERSION on
 * zc_name.
 */
/* ARGSUSED */
static int
zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int rc;

	rc = zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr);
	return (rc);
}
/*
 * Hold policy: for each pair in the "holds" nvlist of innvl, the caller
 * needs ZFS_DELEG_PERM_HOLD on the name produced by dmu_fsname().
 * A missing "holds" nvlist yields EINVAL.
 */
/* ARGSUSED */
static int
zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvlist_t *holds;
	nvpair_t *elem = NULL;

	if (nvlist_lookup_nvlist(innvl, "holds", &holds) != 0)
		return (SET_ERROR(EINVAL));

	while ((elem = nvlist_next_nvpair(holds, elem)) != NULL) {
		char fs[ZFS_MAX_DATASET_NAME_LEN];
		int rc;

		rc = dmu_fsname(nvpair_name(elem), fs);
		if (rc == 0) {
			rc = zfs_secpolicy_write_perms(fs,
			    ZFS_DELEG_PERM_HOLD, cr);
		}
		if (rc != 0)
			return (rc);
	}
	return (0);
}
/*
 * Release policy: for each pair in innvl, the caller needs
 * ZFS_DELEG_PERM_RELEASE on the name produced by dmu_fsname().
 */
/* ARGSUSED */
static int
zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	nvpair_t *elem = NULL;

	while ((elem = nvlist_next_nvpair(innvl, elem)) != NULL) {
		char fs[ZFS_MAX_DATASET_NAME_LEN];
		int rc;

		rc = dmu_fsname(nvpair_name(elem), fs);
		if (rc == 0) {
			rc = zfs_secpolicy_write_perms(fs,
			    ZFS_DELEG_PERM_RELEASE, cr);
		}
		if (rc != 0)
			return (rc);
	}
	return (0);
}
/*
* Policy for allowing temporary snapshots to be taken or released
*/
static int
zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int rc;

	/*
	 * A temporary snapshot is the same as a snapshot,
	 * hold, destroy and release all rolled into one.
	 * Delegated diff alone is sufficient that we allow this.
	 */
	if (zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_DIFF, cr) == 0)
		return (0);

	/* Without "diff", every constituent permission must be granted. */
	rc = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
	if (rc != 0)
		return (rc);

	rc = zfs_secpolicy_hold(zc, innvl, cr);
	if (rc != 0)
		return (rc);

	rc = zfs_secpolicy_release(zc, innvl, cr);
	if (rc != 0)
		return (rc);

	return (zfs_secpolicy_destroy(zc, innvl, cr));
}
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
{
	nvlist_t *unpacked = NULL;
	char *buf;
	int rc;

	/* An empty buffer cannot hold a packed nvlist. */
	if (size == 0)
		return (SET_ERROR(EINVAL));

	/*
	 * Copy the packed nvlist in from the caller-supplied address and
	 * unpack it.  The staging buffer is released on every path.
	 */
	buf = kmem_alloc(size, KM_SLEEP);
	if (ddi_copyin((void *)(uintptr_t)nvl, buf, size, iflag) != 0)
		rc = SET_ERROR(EFAULT);
	else
		rc = nvlist_unpack(buf, size, &unpacked, 0);
	kmem_free(buf, size);

	/* *nvp is only written on success. */
	if (rc == 0)
		*nvp = unpacked;
	return (rc);
}
/*
* Reduce the size of this nvlist until it can be serialized in 'max' bytes.
* Entries will be removed from the end of the nvlist, and one int32 entry
* named "N_MORE_ERRORS" will be added indicating how many entries were
* removed.
*/
static int
nvlist_smush(nvlist_t *errors, size_t max)
{
	nvpair_t *marker;
	int dropped = 0;

	/* Nothing to do when the list already fits. */
	if (fnvlist_size(errors) <= max)
		return (0);

	if (max < 1024)
		return (SET_ERROR(ENOMEM));

	/*
	 * Append a placeholder counter so that its size is accounted for,
	 * then repeatedly drop the entry just before it until the list fits.
	 */
	fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
	marker = nvlist_prev_nvpair(errors, NULL);
	do {
		fnvlist_remove_nvpair(errors,
		    nvlist_prev_nvpair(errors, marker));
		dropped++;
	} while (fnvlist_size(errors) > max);

	/* Replace the placeholder with the real count of removed entries. */
	fnvlist_remove_nvpair(errors, marker);
	fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, dropped);
	ASSERT3U(fnvlist_size(errors), <=, max);

	return (0);
}
/*
 * Pack 'nvl' and copy it out to the buffer described by zc_nvlist_dst /
 * zc_nvlist_dst_size.  zc_nvlist_dst_size is always updated to the size
 * of the nvlist and zc_nvlist_dst_filled is set.  Returns EFAULT if the
 * copyout fails, otherwise 0 (including when the buffer is too small).
 */
static int
put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
{
	size_t need = fnvlist_size(nvl);
	int rc = 0;

	if (need <= zc->zc_nvlist_dst_size) {
		char *buf = fnvlist_pack(nvl, &need);

		if (ddi_copyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst,
		    need, zc->zc_iflags) != 0)
			rc = SET_ERROR(EFAULT);
		fnvlist_pack_free(buf, need);
	}
	/*
	 * Otherwise the buffer is too small.  Solaris returns ENOMEM here,
	 * because even if an error is returned from an ioctl(2), new
	 * zc_nvlist_dst_size will be passed to the userland.  This is not
	 * the case for FreeBSD.  We need to return 0, so the kernel will
	 * copy the zc_nvlist_dst_size back and the userland can discover
	 * that a bigger buffer is needed.
	 */

	zc->zc_nvlist_dst_size = need;
	zc->zc_nvlist_dst_filled = B_TRUE;
	return (rc);
}
/*
 * Look up the vfs attached to 'os' and return it, with a reference taken
 * via vfs_ref(), in *vfsp.  Returns EINVAL if 'os' is not a DMU_OST_ZFS
 * objset and ESRCH if no zfsvfs is registered as its user.
 */
int
getzfsvfs_impl(objset_t *os, vfs_t **vfsp)
{
	zfsvfs_t *zfsvfs;
	int rc = 0;

	if (dmu_objset_type(os) != DMU_OST_ZFS)
		return (SET_ERROR(EINVAL));

	mutex_enter(&os->os_user_ptr_lock);
	zfsvfs = dmu_objset_get_user(os);
	if (zfsvfs == NULL) {
		rc = SET_ERROR(ESRCH);
	} else {
		*vfsp = zfsvfs->z_vfs;
		vfs_ref(zfsvfs->z_vfs);
	}
	mutex_exit(&os->os_user_ptr_lock);

	return (rc);
}
/*
 * Resolve a dataset name to its zfsvfs_t.  On success the vfs has been
 * marked busy with vfs_busy() and *zfvp points at its vfs_data.  If
 * vfs_busy() fails, *zfvp is set to NULL and ESRCH is returned.
 */
int
getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
{
	objset_t *os;
	vfs_t *vfsp;
	int rc;

	rc = dmu_objset_hold(dsname, FTAG, &os);
	if (rc != 0)
		return (rc);

	rc = getzfsvfs_impl(os, &vfsp);
	dmu_objset_rele(os, FTAG);
	if (rc != 0)
		return (rc);

	/* Swap the reference from getzfsvfs_impl() for a busy hold. */
	rc = vfs_busy(vfsp, 0);
	vfs_rel(vfsp);
	if (rc == 0) {
		*zfvp = vfsp->vfs_data;
		return (0);
	}

	*zfvp = NULL;
	return (SET_ERROR(ESRCH));
}
/*
* Find a zfsvfs_t for a mounted filesystem, or create our own, in which
* case its z_vfs will be NULL, and it will be opened as the owner.
* If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
* which prevents all vnode ops from running.
*/
static int
zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
{
	int rc = 0;

	/* Fall back to a private zfsvfs when the lookup fails. */
	if (getzfsvfs(name, zfvp) != 0)
		rc = zfsvfs_create(name, zfvp);
	if (rc != 0)
		return (rc);

	rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
	    RW_READER, tag);
#ifdef illumos
	if ((*zfvp)->z_unmounted) {
		/*
		 * XXX we could probably try again, since the unmounting
		 * thread should be just about to disassociate the
		 * objset from the zfsvfs.
		 */
		rrm_exit(&(*zfvp)->z_teardown_lock, tag);
		return (SET_ERROR(EBUSY));
	}
#else
	/*
	 * vfs_busy() ensures that the filesystem is not and
	 * can not be unmounted.
	 */
	ASSERT(!(*zfvp)->z_unmounted);
#endif
	return (0);
}
/*
 * Drop the teardown lock on 'zfsvfs', then either release the vfs (when
 * z_vfs is set) or disown the objset and free the zfsvfs (when it is not).
 */
static void
zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
{
	rrm_exit(&zfsvfs->z_teardown_lock, tag);

	if (zfsvfs->z_vfs == NULL) {
		/* No vfs attached: tear the zfsvfs down. */
		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
		zfsvfs_free(zfsvfs);
		return;
	}

#ifdef illumos
	VFS_RELE(zfsvfs->z_vfs);
#else
	vfs_unbusy(zfsvfs->z_vfs);
#endif
}
/*
 * Create a pool named zc_name from the config nvlist in zc_nvlist_conf,
 * with optional properties in zc_nvlist_src.  Root filesystem properties
 * found under ZPOOL_ROOTFS_PROPS are split out and applied after the pool
 * has been created; if applying them fails the pool is destroyed again.
 */
static int
zfs_ioc_pool_create(zfs_cmd_t *zc)
{
	nvlist_t *config;
	nvlist_t *props = NULL;
	nvlist_t *rootprops = NULL;
	nvlist_t *zplprops = NULL;
	char *spa_name = zc->zc_name;
	int error;

	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (error != 0)
		return (error);

	if (zc->zc_nvlist_src_size != 0) {
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0) {
			nvlist_free(config);
			return (error);
		}
	}

	if (props != NULL) {
		nvlist_t *rootfs = NULL;
		uint64_t version = SPA_VERSION;
		char *tname;

		/* 'version' keeps its default unless the lookup stores one. */
		(void) nvlist_lookup_uint64(props,
		    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
		if (!SPA_VERSION_IS_SUPPORTED(version)) {
			error = SET_ERROR(EINVAL);
			goto pool_props_bad;
		}

		/* Move the root filesystem properties into their own list. */
		(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS,
		    &rootfs);
		if (rootfs != NULL) {
			error = nvlist_dup(rootfs, &rootprops, KM_SLEEP);
			if (error != 0) {
				nvlist_free(config);
				nvlist_free(props);
				return (error);
			}
			(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
		}

		VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		error = zfs_fill_zplprops_root(version, rootprops,
		    zplprops, NULL);
		if (error != 0)
			goto pool_props_bad;

		/* A ZPOOL_PROP_TNAME string replaces the name used below. */
		if (nvlist_lookup_string(props,
		    zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0)
			spa_name = tname;
	}

	error = spa_create(zc->zc_name, config, props, zplprops);

	/*
	 * Set the remaining root properties
	 */
	if (error == 0) {
		error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL,
		    rootprops, NULL);
		if (error != 0)
			(void) spa_destroy(spa_name);
	}

pool_props_bad:
	nvlist_free(rootprops);
	nvlist_free(zplprops);
	nvlist_free(config);
	nvlist_free(props);

	return (error);
}
/*
 * Log the command, destroy the pool named zc_name and, on success,
 * remove its zvol minors.
 */
static int
zfs_ioc_pool_destroy(zfs_cmd_t *zc)
{
	int rc;

	zfs_log_history(zc);
	rc = spa_destroy(zc->zc_name);
	if (rc != 0)
		return (rc);

	zvol_remove_minors(zc->zc_name);
	return (0);
}
/*
 * Import the pool described by the config in zc_nvlist_conf (optional
 * properties in zc_nvlist_src).  The pool guid in the config must equal
 * zc_guid.  When zc_nvlist_dst is set, the config is copied back out and
 * a failure of that copy overrides the import result.
 */
static int
zfs_ioc_pool_import(zfs_cmd_t *zc)
{
	nvlist_t *config;
	nvlist_t *props = NULL;
	uint64_t guid;
	int error;

	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (error != 0)
		return (error);

	if (zc->zc_nvlist_src_size != 0) {
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0) {
			nvlist_free(config);
			return (error);
		}
	}

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
	    guid == zc->zc_guid)
		error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
	else
		error = SET_ERROR(EINVAL);

	if (zc->zc_nvlist_dst != 0) {
		int put_err = put_nvlist(zc, config);

		if (put_err != 0)
			error = put_err;
	}

	nvlist_free(config);
	nvlist_free(props);

	return (error);
}
/*
 * Log the command and export the pool named zc_name; zc_cookie and
 * zc_guid carry the force and hardforce flags.  On success the pool's
 * zvol minors are removed.
 */
static int
zfs_ioc_pool_export(zfs_cmd_t *zc)
{
	boolean_t force = (boolean_t)zc->zc_cookie;
	boolean_t hardforce = (boolean_t)zc->zc_guid;
	int rc;

	zfs_log_history(zc);
	rc = spa_export(zc->zc_name, NULL, force, hardforce);
	if (rc != 0)
		return (rc);

	zvol_remove_minors(zc->zc_name);
	return (0);
}
/*
 * Copy out the nvlist returned by spa_all_configs(); zc_cookie is passed
 * to it by reference.  A NULL result is reported as EEXIST.
 */
static int
zfs_ioc_pool_configs(zfs_cmd_t *zc)
{
	nvlist_t *all = spa_all_configs(&zc->zc_cookie);
	int rc;

	if (all == NULL)
		return (SET_ERROR(EEXIST));

	rc = put_nvlist(zc, all);
	nvlist_free(all);
	return (rc);
}
/*
* inputs:
* zc_name name of the pool
*
* outputs:
* zc_cookie real errno
* zc_nvlist_dst config nvlist
* zc_nvlist_dst_size size of config nvlist
*/
static int
zfs_ioc_pool_stats(zfs_cmd_t *zc)
{
	nvlist_t *config;
	int stat_err;
	int rc;

	stat_err = spa_get_stats(zc->zc_name, &config, zc->zc_value,
	    sizeof (zc->zc_value));

	/* No config at all: the stats error is the ioctl result. */
	if (config == NULL)
		return (stat_err);

	rc = put_nvlist(zc, config);
	nvlist_free(config);

	/*
	 * The config may be present even if 'stat_err' is non-zero.
	 * In this case we return success, and preserve the real errno
	 * in 'zc_cookie'.
	 */
	zc->zc_cookie = stat_err;
	return (rc);
}
/*
* Try to import the given pool, returning pool stats as appropriate so that
* user land knows which devices are available and overall pool health.
*/
static int
zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
{
	nvlist_t *request;
	nvlist_t *result;
	int rc;

	rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &request);
	if (rc != 0)
		return (rc);

	/* The input config is no longer needed once the attempt is made. */
	result = spa_tryimport(request);
	nvlist_free(request);
	if (result == NULL)
		return (SET_ERROR(EINVAL));

	rc = put_nvlist(zc, result);
	nvlist_free(result);
	return (rc);
}
/*
* inputs:
* zc_name name of the pool
* zc_cookie scan func (pool_scan_func_t)
* zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
*/
static int
zfs_ioc_pool_scan(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	/*
	 * Reject unknown scrub flags.  The pool is already open at this
	 * point, so the reference must be dropped before bailing out.
	 */
	if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) {
		spa_close(spa, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* Pause takes priority; otherwise zc_cookie selects stop or start. */
	if (zc->zc_flags == POOL_SCRUB_PAUSE)
		error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
	else if (zc->zc_cookie == POOL_SCAN_NONE)
		error = spa_scan_stop(spa);
	else
		error = spa_scan(spa, zc->zc_cookie);

	spa_close(spa, FTAG);

	return (error);
}
/*
 * Open the pool named zc_name, call spa_freeze() on it and close it.
 */
static int
zfs_ioc_pool_freeze(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	spa_freeze(spa);
	spa_close(spa, FTAG);
	return (0);
}
/*
 * Upgrade the pool named zc_name to the version in zc_cookie.  The
 * requested version must be supported and not lower than the pool's
 * current version, otherwise EINVAL is returned.
 */
static int
zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	if (zc->zc_cookie >= spa_version(spa) &&
	    SPA_VERSION_IS_SUPPORTED(zc->zc_cookie))
		spa_upgrade(spa, zc->zc_cookie);
	else
		rc = SET_ERROR(EINVAL);

	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Read pool history via spa_history_get() into a temporary buffer of
 * zc_history_len bytes and copy the result out to zc_history.
 * zc_history_offset and zc_history_len are passed by reference to
 * spa_history_get().
 */
static int
zfs_ioc_pool_get_history(zfs_cmd_t *zc)
{
	uint64_t buflen = zc->zc_history_len;
	char *buf;
	spa_t *spa;
	int rc;

	if (buflen == 0)
		return (SET_ERROR(EINVAL));

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
		spa_close(spa, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	buf = kmem_alloc(buflen, KM_SLEEP);
	rc = spa_history_get(spa, &zc->zc_history_offset,
	    &zc->zc_history_len, buf);
	if (rc == 0) {
		rc = ddi_copyout(buf, (void *)(uintptr_t)zc->zc_history,
		    zc->zc_history_len, zc->zc_iflags);
	}

	spa_close(spa, FTAG);
	/* Free with the size that was allocated, not the updated length. */
	kmem_free(buf, buflen);
	return (rc);
}
/*
 * Open the pool named zc_name and return the result of spa_change_guid().
 */
static int
zfs_ioc_pool_reguid(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = spa_change_guid(spa);
	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Thin wrapper: hands zc_name, zc_obj and zc_value to
 * dsl_dsobj_to_dsname() and returns its result.
 */
static int
zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
{
	int rc;

	rc = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value);
	return (rc);
}
/*
* inputs:
* zc_name name of filesystem
* zc_obj object to find
*
* outputs:
* zc_value name of object
*/
static int
zfs_ioc_obj_to_path(zfs_cmd_t *zc)
{
	objset_t *os;
	int rc;

	/* XXX reading from objset not owned */
	rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (rc != 0)
		return (rc);

	/* Only DMU_OST_ZFS objsets are accepted. */
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		rc = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
		    sizeof (zc->zc_value));
	} else {
		rc = SET_ERROR(EINVAL);
	}

	dmu_objset_rele(os, FTAG);
	return (rc);
}
/*
* inputs:
* zc_name name of filesystem
* zc_obj object to find
*
* outputs:
* zc_stat stats on object
* zc_value path to object
*/
static int
zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
{
	objset_t *os;
	int rc;

	/* XXX reading from objset not owned */
	rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (rc != 0)
		return (rc);

	/* Only DMU_OST_ZFS objsets are accepted. */
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		rc = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat,
		    zc->zc_value, sizeof (zc->zc_value));
	} else {
		rc = SET_ERROR(EINVAL);
	}

	dmu_objset_rele(os, FTAG);
	return (rc);
}
/*
 * Add the vdevs described by the config nvlist in zc_nvlist_conf to the
 * pool named zc_name.
 *
 * Returns 0 on success, or the error from spa_open(), get_nvlist() or
 * spa_vdev_add().  On illumos, EDOM is returned when attempting to add
 * anything other than l2cache/spare devices to a root pool.
 */
static int
zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	nvlist_t *config = NULL;
	nvlist_t **l2cache, **spares;
	uint_t nl2cache = 0, nspares = 0;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	/*
	 * Bail out before touching 'config' if it could not be read in:
	 * get_nvlist() only stores a list on success.
	 */
	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (error != 0) {
		spa_close(spa, FTAG);
		return (error);
	}

	/* Lookup failures are ignored; the counts then stay at zero. */
	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache);

	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares);

#ifdef illumos
	/*
	 * A root pool with concatenated devices is not supported.
	 * Thus, can not add a device to a root pool.
	 *
	 * Intent log device can not be added to a rootpool because
	 * during mountroot, zil is replayed, a separated log device
	 * can not be accessed during the mountroot time.
	 *
	 * l2cache and spare devices are ok to be added to a rootpool.
	 */
	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
		nvlist_free(config);
		spa_close(spa, FTAG);
		return (SET_ERROR(EDOM));
	}
#endif /* illumos */

	error = spa_vdev_add(spa, config);
	nvlist_free(config);
	spa_close(spa, FTAG);
	return (error);
}
/*
* inputs:
* zc_name name of the pool
* zc_guid guid of vdev to remove
* zc_cookie cancel removal
*/
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	/* A zero cookie starts a removal; non-zero cancels one. */
	if (zc->zc_cookie == 0)
		rc = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
	else
		rc = spa_vdev_remove_cancel(spa);

	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Change the state of vdev zc_guid in pool zc_name to the state given in
 * zc_cookie (online, offline, faulted or degraded); zc_obj carries the
 * per-state argument.  On return zc_cookie holds 'newstate', which only
 * the online path may update from VDEV_STATE_UNKNOWN.
 */
static int
zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
{
	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	switch (zc->zc_cookie) {
	case VDEV_STATE_ONLINE:
		rc = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
		break;

	case VDEV_STATE_OFFLINE:
		rc = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
		break;

	case VDEV_STATE_FAULTED:
	case VDEV_STATE_DEGRADED:
		/* Any aux value other than the two accepted ones is coerced. */
		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL)
			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

		if (zc->zc_cookie == VDEV_STATE_FAULTED)
			rc = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
		else
			rc = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
		break;

	default:
		rc = SET_ERROR(EINVAL);
	}

	zc->zc_cookie = newstate;
	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Attach the vdev described by the config in zc_nvlist_conf to vdev
 * zc_guid of pool zc_name.  zc_cookie is forwarded as the 'replacing'
 * argument of spa_vdev_attach().
 */
static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
	int replacing = zc->zc_cookie;
	nvlist_t *config;
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (rc == 0) {
		rc = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
		nvlist_free(config);
	}

	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Detach vdev zc_guid from the pool named zc_name.
 */
static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Split a mirror off pool zc_name into a new pool named by zc_string,
 * using the config in zc_nvlist_conf and optional properties in
 * zc_nvlist_src.  The ZPOOL_EXPORT_AFTER_SPLIT bit of zc_cookie is
 * forwarded to spa_vdev_split_mirror().
 */
static int
zfs_ioc_vdev_split(zfs_cmd_t *zc)
{
	boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
	nvlist_t *config;
	nvlist_t *props = NULL;
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (rc != 0) {
		spa_close(spa, FTAG);
		return (rc);
	}

	if (zc->zc_nvlist_src_size != 0) {
		rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (rc != 0) {
			spa_close(spa, FTAG);
			nvlist_free(config);
			return (rc);
		}
	}

	rc = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);

	spa_close(spa, FTAG);
	nvlist_free(config);
	nvlist_free(props);
	return (rc);
}
/*
 * Set the path of vdev zc_guid in pool zc_name to the string in zc_value.
 */
static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Set the FRU of vdev zc_guid in pool zc_name to the string in zc_value.
 */
static int
zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
{
	spa_t *spa;
	int rc;

	rc = spa_open(zc->zc_name, &spa, FTAG);
	if (rc != 0)
		return (rc);

	rc = spa_vdev_setfru(spa, zc->zc_guid, zc->zc_value);
	spa_close(spa, FTAG);
	return (rc);
}
/*
 * Fill in zc->zc_objset_stats for 'os' and, when the caller supplied a
 * destination buffer (zc_nvlist_dst != 0), copy the property nvlist out
 * through put_nvlist().
 *
 * Returns 0 on success, otherwise the error from dsl_prop_get_all(),
 * EIO from zvol_get_stats(), or the error from put_nvlist().  The
 * property nvlist is freed on every path.
 */
static int
zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
{
	int error = 0;
	nvlist_t *nv;

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	if (zc->zc_nvlist_dst != 0 &&
	    (error = dsl_prop_get_all(os, &nv)) == 0) {
		dmu_objset_stats(os, nv);
		/*
		 * NB: zvol_get_stats() will read the objset contents,
		 * which we aren't supposed to do with a
		 * DS_MODE_USER hold, because it could be
		 * inconsistent.  So this is a bit of a workaround...
		 * XXX reading with out owning
		 */
		if (!zc->zc_objset_stats.dds_inconsistent &&
		    dmu_objset_type(os) == DMU_OST_ZVOL) {
			error = zvol_get_stats(os, nv);
			if (error == EIO) {
				/* Don't leak the property list on I/O error. */
				nvlist_free(nv);
				return (error);
			}
			VERIFY0(error);
		}
		error = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for property nvlist
*
* outputs:
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
*/
static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
	objset_t *os;
	int rc;

	rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (rc == 0) {
		rc = zfs_ioc_objset_stats_impl(zc, os);
		dmu_objset_rele(os, FTAG);
	}

	/* ENOMEM is deliberately reported to the caller as success. */
	return (rc == ENOMEM ? 0 : rc);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for property nvlist
*
* outputs:
* zc_nvlist_dst received property nvlist
* zc_nvlist_dst_size size of received property nvlist
*
* Gets received properties (distinct from local properties on or after
* SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
* local property values.
*/
static int
zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
{
	nvlist_t *nv;
	int rc;

	/*
	 * Without this check, we would return local property values if the
	 * caller has not already received properties on or after
	 * SPA_VERSION_RECVD_PROPS.
	 */
	if (!dsl_prop_get_hasrecvd(zc->zc_name))
		return (SET_ERROR(ENOTSUP));

	/* Nothing to copy out when no destination buffer was given. */
	if (zc->zc_nvlist_dst == 0)
		return (0);

	rc = dsl_prop_get_received(zc->zc_name, &nv);
	if (rc != 0)
		return (rc);

	rc = put_nvlist(zc, nv);
	nvlist_free(nv);
	return (rc);
}
/*
 * Fetch the value of ZPL property 'prop' from 'os' and add it to 'props'
 * as a uint64 keyed by the property's name.  Returns the error from
 * zfs_get_zplprop(), or 0.
 */
static int
nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
{
	uint64_t value;
	int rc;

	/*
	 * zfs_get_zplprop() will either find a value or give us
	 * the default value (if there is one).
	 */
	rc = zfs_get_zplprop(os, prop, &value);
	if (rc != 0)
		return (rc);

	VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
	return (0);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for zpl property nvlist
*
* outputs:
* zc_nvlist_dst zpl property nvlist
* zc_nvlist_dst_size size of zpl property nvlist
*/
static int
zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
{
	static const zfs_prop_t wanted[] = {
		ZFS_PROP_VERSION,
		ZFS_PROP_NORMALIZE,
		ZFS_PROP_UTF8ONLY,
		ZFS_PROP_CASE
	};
	objset_t *os;
	int err;

	/* XXX reading without owning */
	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (err != 0)
		return (err);

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	/*
	 * NB: nvl_add_zplprop() will read the objset contents,
	 * which we aren't supposed to do with a DS_MODE_USER
	 * hold, because it could be inconsistent.
	 */
	if (zc->zc_nvlist_dst == 0 ||
	    zc->zc_objset_stats.dds_inconsistent ||
	    dmu_objset_type(os) != DMU_OST_ZFS) {
		err = SET_ERROR(ENOENT);
	} else {
		nvlist_t *nv;
		size_t i;

		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		/* Collect the properties in order; stop at the first error. */
		for (i = 0; err == 0 &&
		    i < sizeof (wanted) / sizeof (wanted[0]); i++)
			err = nvl_add_zplprop(os, nv, wanted[i]);

		if (err == 0)
			err = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	dmu_objset_rele(os, FTAG);
	return (err);
}
/*
 * Report whether the dataset 'name' should be hidden from listings.
 */
boolean_t
dataset_name_hidden(const char *name)
{
	/*
	 * Skip over datasets that are not visible in this zone,
	 * internal datasets (which have a $ in their name), and
	 * temporary datasets (which have a % in their name).
	 */
	if (strchr(name, '$') != NULL || strchr(name, '%') != NULL)
		return (B_TRUE);

	if (INGLOBALZONE(curthread) || zone_dataset_visible(name, NULL))
		return (B_FALSE);

	return (B_TRUE);
}
/*
* inputs:
* zc_name name of filesystem
* zc_cookie zap cursor
* zc_nvlist_src iteration range nvlist
* zc_nvlist_src_size size of iteration range nvlist
*
* outputs:
* zc_name name of next filesystem
* zc_cookie zap cursor
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
*/
static int
zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
{
	/* Length of the parent name, so a retry can restore zc_name. */
	size_t orig_len = strlen(zc->zc_name);

	for (;;) {
		objset_t *os;
		char *tail;
		int error;

		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (error != 0)
			return (error == ENOENT ? SET_ERROR(ESRCH) : error);

		/* Make sure the parent name ends in '/' before appending. */
		tail = strrchr(zc->zc_name, '/');
		if (tail == NULL || tail[1] != '\0')
			(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
		tail = zc->zc_name + strlen(zc->zc_name);

		/* Advance the cursor until a non-hidden child is found. */
		do {
			error = dmu_dir_list_next(os,
			    sizeof (zc->zc_name) - (tail - zc->zc_name), tail,
			    NULL, &zc->zc_cookie);
			if (error == ENOENT)
				error = SET_ERROR(ESRCH);
		} while (error == 0 && dataset_name_hidden(zc->zc_name));
		dmu_objset_rele(os, FTAG);

		/*
		 * If it's an internal dataset (ie. with a '$' in its name),
		 * don't try to get stats for it, otherwise we'll return
		 * ENOENT.
		 */
		if (error != 0 || strchr(zc->zc_name, '$') != NULL)
			return (error);

		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
		if (error != ENOENT)
			return (error);

		/* We lost a race with destroy, get the next one. */
		zc->zc_name[orig_len] = '\0';
	}
}
/*
 * inputs:
 * zc_name name of filesystem
 * zc_cookie zap cursor
 * zc_nvlist_src optional iteration range nvlist (min/max creation txg)
 * zc_nvlist_src_size size of iteration range nvlist (0 if none)
 * zc_nvlist_dst_size size of buffer for property nvlist
 * zc_simple when set, only name is requested
 *
 * outputs:
 * zc_name name of next snapshot
 * zc_objset_stats stats
 * zc_nvlist_dst property nvlist
 * zc_nvlist_dst_size size of property nvlist
 */
static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
	int error;
	objset_t *os, *ossnap;
	dsl_dataset_t *ds;
	uint64_t min_txg = 0, max_txg = 0;

	/*
	 * An optional input nvlist may bound the creation txg of the
	 * snapshots returned.  Lookup results are ignored, so a bound
	 * keeps its 0 initializer unless the lookup stores a value; a
	 * bound of 0 is treated as "no limit" by the filter below.
	 */
	if (zc->zc_nvlist_src_size != 0) {
		nvlist_t *props = NULL;
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0)
			return (error);
		(void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
		    &min_txg);
		(void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
		    &max_txg);
		nvlist_free(props);
	}

	/* A missing parent dataset is reported as ESRCH, not ENOENT. */
	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (error != 0) {
		return (error == ENOENT ? ESRCH : error);
	}

	/*
	 * A dataset name of maximum length cannot have any snapshots,
	 * so exit immediately.
	 */
	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
	    ZFS_MAX_DATASET_NAME_LEN) {
		dmu_objset_rele(os, FTAG);
		return (SET_ERROR(ESRCH));
	}

	/*
	 * Each pass appends the next snapshot name after the '@'.  The loop
	 * only repeats (via 'continue') when a snapshot is filtered out by
	 * the txg range; every other path ends in 'break'.
	 */
	while (error == 0) {
		/* Let a pending signal interrupt a long iteration. */
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			error = SET_ERROR(EINTR);
			break;
		}

		/* The name is written directly after the existing "fs@". */
		error = dmu_snapshot_list_next(os,
		    sizeof (zc->zc_name) - strlen(zc->zc_name),
		    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
		    &zc->zc_cookie, NULL);
		if (error == ENOENT) {
			error = SET_ERROR(ESRCH);
			break;
		} else if (error != 0) {
			break;
		}

		error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
		    FTAG, &ds);
		if (error != 0)
			break;

		if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
		    (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
			dsl_dataset_rele(ds, FTAG);
			/* undo snapshot name append */
			*(strchr(zc->zc_name, '@') + 1) = '\0';
			/* skip snapshot */
			continue;
		}

		/* In simple mode only the name is wanted; skip the stats. */
		if (zc->zc_simple) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}

		if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		dsl_dataset_rele(ds, FTAG);
		break;
	}

	dmu_objset_rele(os, FTAG);
	/* if we failed, undo the @ that we tacked on to zc_name */
	if (error != 0)
		*strchr(zc->zc_name, '@') = '\0';
	return (error);
}
/*
 * Apply a userquota-style property to dataset 'dsname'.  The pair's
 * value (possibly wrapped in an nvlist under ZPROP_VALUE) must be a
 * three-element uint64 array holding the quota type, the rid and the
 * quota; the domain is taken from the property name.  Returns EINVAL
 * for a malformed pair, otherwise the result of zfsvfs_hold() or
 * zfs_set_userquota().
 */
static int
zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
	const char *propname = nvpair_name(pair);
	const char *domain;
	uint64_t *triple;
	unsigned int ntriple;
	char *dash;
	zfs_userquota_prop_t type;
	uint64_t rid;
	uint64_t quota;
	zfsvfs_t *zfsvfs;
	int rc;

	/* Unwrap an attribute nvlist down to its ZPROP_VALUE pair. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		nvlist_t *attrs;

		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * A correctly constructed propname is encoded as
	 * userquota@<rid>-<domain>.
	 */
	dash = strchr(propname, '-');
	if (dash == NULL)
		return (SET_ERROR(EINVAL));
	if (nvpair_value_uint64_array(pair, &triple, &ntriple) != 0 ||
	    ntriple != 3)
		return (SET_ERROR(EINVAL));

	domain = dash + 1;
	type = triple[0];
	rid = triple[1];
	quota = triple[2];

	rc = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
	if (rc != 0)
		return (rc);

	rc = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
	zfsvfs_rele(zfsvfs, FTAG);
	return (rc);
}
/*
* If the named property is one that has a special function to set its value,
* return 0 on success and a positive error code on failure; otherwise if it is
* not one of the special properties handled by this function, return -1.
*
* XXX: It would be better for callers of the property interface if we handled
* these special cases in dsl_prop.c (in the dsl layer).
*/
static int
zfs_prop_set_special(const char *dsname, zprop_source_t source,
    nvpair_t *pair)
{
	const char *propname = nvpair_name(pair);
	zfs_prop_t prop = zfs_name_to_prop(propname);
	uint64_t intval;
	/* -1 means "not handled here"; the caller sets it generically. */
	int err = -1;

	/*
	 * Names that are not native properties: only userquota-style
	 * names are special, everything else is left to the caller.
	 */
	if (prop == ZPROP_INVAL) {
		if (zfs_prop_userquota(propname))
			return (zfs_prop_set_userquota(dsname, pair));
		return (-1);
	}

	/* Unwrap an attribute nvlist down to its ZPROP_VALUE pair. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		nvlist_t *attrs;
		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) == 0);
	}

	/* No string-typed property is handled specially. */
	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
		return (-1);

	/* From here on the value is required to be a uint64. */
	VERIFY(0 == nvpair_value_uint64(pair, &intval));

	switch (prop) {
	case ZFS_PROP_QUOTA:
		err = dsl_dir_set_quota(dsname, source, intval);
		break;
	case ZFS_PROP_REFQUOTA:
		err = dsl_dataset_set_refquota(dsname, source, intval);
		break;
	case ZFS_PROP_FILESYSTEM_LIMIT:
	case ZFS_PROP_SNAPSHOT_LIMIT:
		if (intval == UINT64_MAX) {
			/* clearing the limit, just do it */
			err = 0;
		} else {
			err = dsl_dir_activate_fs_ss_limit(dsname);
		}
		/*
		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
		 * default path to set the value in the nvlist.
		 */
		if (err == 0)
			err = -1;
		break;
	case ZFS_PROP_RESERVATION:
		err = dsl_dir_set_reservation(dsname, source, intval);
		break;
	case ZFS_PROP_REFRESERVATION:
		err = dsl_dataset_set_refreservation(dsname, source, intval);
		break;
	case ZFS_PROP_VOLSIZE:
		err = zvol_set_volsize(dsname, intval);
		break;
	case ZFS_PROP_VERSION:
	{
		zfsvfs_t *zfsvfs;

		/* Take the hold as writer (last argument B_TRUE). */
		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
			break;

		err = zfs_set_version(zfsvfs, intval);
		zfsvfs_rele(zfsvfs, FTAG);

		/*
		 * Once the version is at least ZPL_VERSION_USERSPACE, run
		 * the userspace upgrade through a scratch zfs_cmd_t that
		 * carries only the dataset name.  Its result is ignored.
		 */
		if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
			zfs_cmd_t *zc;

			zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
			(void) strcpy(zc->zc_name, dsname);
			(void) zfs_ioc_userspace_upgrade(zc);
			kmem_free(zc, sizeof (zfs_cmd_t));
		}
		break;
	}
	default:
		err = -1;
	}

	return (err);
}
/*
* This function is best effort. If it fails to set any of the given properties,
* it continues to set as many as it can and returns the last error
* encountered. If the caller provides a non-NULL errlist, it will be filled in
* with the list of names of all the properties that failed along with the
* corresponding error numbers.
*
* If every property is set successfully, zero is returned and errlist is not
* modified.
*/
int
zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
nvlist_t *errlist)
{
nvpair_t *pair;
nvpair_t *propval;
int rv = 0;
uint64_t intval;
char *strval;
/* Properties that take the generic path; set together after the loop. */
nvlist_t *genericnvl = fnvlist_alloc();
/* Special properties whose first attempt failed; retried in a second pass. */
nvlist_t *retrynvl = fnvlist_alloc();
/*
 * The loop below runs at most twice: once over the caller's nvl and, if
 * anything was queued, once more over retrynvl.
 */
retry:
pair = NULL;
while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
int err = 0;
/* decode the property value */
propval = pair;
/*
 * A pair may carry an nvlist of attributes instead of a bare value;
 * in that case the value is the ZPROP_VALUE member of that nvlist.
 */
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
attrs = fnvpair_value_nvlist(pair);
if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&propval) != 0)
err = SET_ERROR(EINVAL);
}
/* Validate value type */
if (err == 0 && prop == ZPROP_INVAL) {
/*
 * Not a native property name: only user properties (string) and
 * userquota-style properties (uint64 array) are accepted.
 */
if (zfs_prop_user(propname)) {
if (nvpair_type(propval) != DATA_TYPE_STRING)
err = SET_ERROR(EINVAL);
} else if (zfs_prop_userquota(propname)) {
if (nvpair_type(propval) !=
DATA_TYPE_UINT64_ARRAY)
err = SET_ERROR(EINVAL);
} else {
err = SET_ERROR(EINVAL);
}
} else if (err == 0) {
/* Native property: the nvpair type must agree with the property type. */
if (nvpair_type(propval) == DATA_TYPE_STRING) {
if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
err = SET_ERROR(EINVAL);
} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
const char *unused;
intval = fnvpair_value_uint64(propval);
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
break;
case PROP_TYPE_STRING:
err = SET_ERROR(EINVAL);
break;
case PROP_TYPE_INDEX:
/* An index value is valid only if it maps to a name. */
if (zfs_prop_index_to_string(prop,
intval, &unused) != 0)
err = SET_ERROR(EINVAL);
break;
default:
cmn_err(CE_PANIC,
"unknown property type");
}
} else {
err = SET_ERROR(EINVAL);
}
}
/* Validate permissions */
if (err == 0)
err = zfs_check_settable(dsname, pair, CRED());
if (err == 0) {
/*
 * zfs_prop_set_special() returns -1 when the property should be
 * set through the generic path instead.
 */
err = zfs_prop_set_special(dsname, source, pair);
if (err == -1) {
/*
* For better performance we build up a list of
* properties to set in a single transaction.
*/
err = nvlist_add_nvpair(genericnvl, pair);
} else if (err != 0 && nvl != retrynvl) {
/*
* This may be a spurious error caused by
* receiving quota and reservation out of order.
* Try again in a second pass.
*/
/*
 * On the second pass (nvl == retrynvl) this branch is skipped, so
 * the failure is reported below instead of being queued again.
 */
err = nvlist_add_nvpair(retrynvl, pair);
}
}
if (err != 0) {
if (errlist != NULL)
fnvlist_add_int32(errlist, propname, err);
/* rv keeps only the most recent error. */
rv = err;
}
}
/* Switch to the retry list once, if the first pass queued anything. */
if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
nvl = retrynvl;
goto retry;
}
if (!nvlist_empty(genericnvl) &&
dsl_props_set(dsname, source, genericnvl) != 0) {
/*
* If this fails, we still want to set as many properties as we
* can, so try setting them individually.
*/
pair = NULL;
while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
int err = 0;
propval = pair;
/*
 * Same attribute-nvlist unwrapping as in the validation loop; the
 * ZPROP_VALUE lookup already succeeded there for every queued pair.
 */
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
attrs = fnvpair_value_nvlist(pair);
propval = fnvlist_lookup_nvpair(attrs,
ZPROP_VALUE);
}
if (nvpair_type(propval) == DATA_TYPE_STRING) {
strval = fnvpair_value_string(propval);
err = dsl_prop_set_string(dsname, propname,
source, strval);
} else {
intval = fnvpair_value_uint64(propval);
err = dsl_prop_set_int(dsname, propname, source,
intval);
}
if (err != 0) {
if (errlist != NULL) {
fnvlist_add_int32(errlist, propname,
err);
}
rv = err;
}
}
}
/* Both scratch lists are owned by this function. */
nvlist_free(genericnvl);
nvlist_free(retrynvl);
return (rv);
}
/*
 * Check that all the properties are valid user properties.
 *
 * Every pair in nvl must have a user-property name and a string value.
 * Returns 0 when all pairs pass, EINVAL for a non-user name or a non-string
 * value, ENAMETOOLONG when the name is ZAP_MAXNAMELEN or longer, and E2BIG
 * when the value is ZAP_MAXVALUELEN or longer.  The walk stops at the first
 * offending pair.
 */
static int
zfs_check_userprops(nvlist_t *nvl)
{
	nvpair_t *pair = NULL;

	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
		const char *propname = nvpair_name(pair);

		if (!zfs_prop_user(propname) ||
		    nvpair_type(pair) != DATA_TYPE_STRING)
			return (SET_ERROR(EINVAL));

		if (strlen(propname) >= ZAP_MAXNAMELEN)
			return (SET_ERROR(ENAMETOOLONG));

		/* Wrapped in SET_ERROR like the other error returns here. */
		if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
			return (SET_ERROR(E2BIG));
	}
	return (0);
}
/*
 * Allocate *newprops and add to it every pair of props whose name is not
 * present in skipped.  Allocation or insertion failure trips VERIFY.
 */
static void
props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
{
	nvpair_t *elem;

	VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	for (elem = nvlist_next_nvpair(props, NULL); elem != NULL;
	    elem = nvlist_next_nvpair(props, elem)) {
		if (!nvlist_exists(skipped, nvpair_name(elem)))
			VERIFY(nvlist_add_nvpair(*newprops, elem) == 0);
	}
}
/*
 * Pass every property in props that is not named in skipped to
 * zfs_set_prop_nvlist() with a source of ZPROP_SRC_NONE, plus
 * ZPROP_SRC_RECEIVED when dsl_prop_get_hasrecvd() reports true.
 * Returns 0 when there is nothing to do, otherwise the result of
 * zfs_set_prop_nvlist().
 */
static int
clear_received_props(const char *dsname, nvlist_t *props,
    nvlist_t *skipped)
{
	nvlist_t *toclear = NULL;
	int rc = 0;

	props_skip(props, skipped, &toclear);

	if (nvlist_empty(toclear)) {
		nvlist_free(toclear);
		return (rc);
	}

	/*
	 * Acts on local properties until the dataset has received
	 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
	 */
	zprop_source_t flags = (ZPROP_SRC_NONE |
	    (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));

	rc = zfs_set_prop_nvlist(dsname, flags, toclear, NULL);
	nvlist_free(toclear);
	return (rc);
}
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to set
* zc_nvlist_src{_size} nvlist of properties to apply
* zc_cookie received properties flag
*
* outputs:
* zc_nvlist_dst{_size} error for each unapplied received property
*/
static int
zfs_ioc_set_prop(zfs_cmd_t *zc)
{
	boolean_t received = zc->zc_cookie;
	zprop_source_t source;
	nvlist_t *props;
	nvlist_t *errors;
	int error;

	source = received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL;

	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props);
	if (error != 0)
		return (error);

	if (received) {
		nvlist_t *origprops;

		/*
		 * Best effort: clear previously received properties that the
		 * incoming list does not mention.  The result is ignored.
		 */
		if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
			(void) clear_received_props(zc->zc_name,
			    origprops, props);
			nvlist_free(origprops);
		}

		error = dsl_prop_set_hasrecvd(zc->zc_name);
	}

	errors = fnvlist_alloc();

	if (error == 0) {
		error = zfs_set_prop_nvlist(zc->zc_name, source, props,
		    errors);
	}

	/* The per-property error list is handed back even on failure. */
	if (zc->zc_nvlist_dst != 0 && errors != NULL)
		(void) put_nvlist(zc, errors);

	nvlist_free(errors);
	nvlist_free(props);
	return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to inherit
* zc_cookie revert to received value if TRUE
*
* outputs: none
*/
static int
zfs_ioc_inherit_prop(zfs_cmd_t *zc)
{
	const char *propname = zc->zc_value;
	zfs_prop_t prop = zfs_name_to_prop(propname);
	boolean_t received = zc->zc_cookie;
	zprop_source_t source;
	zprop_type_t type;
	nvlist_t *dummy;
	int err;

	if (!received) {
		/*
		 * Only check this in the non-received case. We want to allow
		 * 'inherit -S' to revert non-inheritable properties like quota
		 * and reservation to the received or default values even though
		 * they are not considered inheritable.
		 */
		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
			return (SET_ERROR(EINVAL));

		/* explicitly inherit */
		/* property name has been validated by zfs_secpolicy_inherit_prop() */
		return (dsl_prop_inherit(zc->zc_name, zc->zc_value,
		    ZPROP_SRC_INHERITED));
	}

	/* revert to received value, if any */
	source = ZPROP_SRC_NONE;

	/*
	 * zfs_prop_set_special() expects properties in the form of an
	 * nvpair with type info, so work out the type first.
	 */
	if (prop == ZPROP_INVAL) {
		if (!zfs_prop_user(propname))
			return (SET_ERROR(EINVAL));
		type = PROP_TYPE_STRING;
	} else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) {
		return (SET_ERROR(EINVAL));
	} else {
		type = zfs_prop_get_type(prop);
	}

	/* Build a one-entry list holding a placeholder value of that type. */
	VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (type == PROP_TYPE_STRING) {
		VERIFY(0 == nvlist_add_string(dummy, propname, ""));
	} else if (type == PROP_TYPE_NUMBER || type == PROP_TYPE_INDEX) {
		VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
	} else {
		nvlist_free(dummy);
		return (SET_ERROR(EINVAL));
	}

	err = zfs_prop_set_special(zc->zc_name, source,
	    nvlist_next_nvpair(dummy, NULL));
	nvlist_free(dummy);

	/* Anything other than -1 means the special path dealt with it. */
	if (err != -1)
		return (err);

	/* property name has been validated by zfs_secpolicy_inherit_prop() */
	return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
}
/*
 * Apply the pool properties in zc_nvlist_src to the pool named zc_name.
 * A request that contains only the cachefile property is served through
 * spa_lookup() so that it also works when the pool cannot be opened.
 */
static int
zfs_ioc_pool_set_props(zfs_cmd_t *zc)
{
	nvlist_t *props;
	nvpair_t *first;
	spa_t *spa;
	int error;

	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props);
	if (error)
		return (error);

	/*
	 * If the only property is the configfile, then just do a spa_lookup()
	 * to handle the faulted case.
	 */
	first = nvlist_next_nvpair(props, NULL);
	if (first != NULL &&
	    strcmp(nvpair_name(first),
	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
	    nvlist_next_nvpair(props, first) == NULL) {
		mutex_enter(&spa_namespace_lock);
		spa = spa_lookup(zc->zc_name);
		if (spa != NULL) {
			spa_configfile_set(spa, props, B_FALSE);
			spa_write_cachefile(spa, B_FALSE, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);

		if (spa != NULL) {
			nvlist_free(props);
			return (0);
		}
		/* Lookup failed: fall through to the regular path. */
	}

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0) {
		nvlist_free(props);
		return (error);
	}

	error = spa_prop_set(spa, props);

	nvlist_free(props);
	spa_close(spa, FTAG);
	return (error);
}
/*
 * Fetch the properties of the pool named zc_name and copy them out with
 * put_nvlist().  Falls back to spa_lookup() when the pool cannot be opened.
 */
static int
zfs_ioc_pool_get_props(zfs_cmd_t *zc)
{
	nvlist_t *props = NULL;
	spa_t *spa;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error == 0) {
		error = spa_prop_get(spa, &props);
		spa_close(spa, FTAG);
	} else {
		/*
		 * If the pool is faulted, there may be properties we can still
		 * get (such as altroot and cachefile), so attempt to get them
		 * anyway.
		 */
		mutex_enter(&spa_namespace_lock);
		spa = spa_lookup(zc->zc_name);
		if (spa != NULL)
			error = spa_prop_get(spa, &props);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * NOTE(review): any earlier failure (and a missing destination
	 * buffer) is reported as EFAULT, which hides the original error
	 * code.  Kept as-is; confirm whether callers rely on it.
	 */
	if (error != 0 || zc->zc_nvlist_dst == 0)
		error = SET_ERROR(EFAULT);
	else
		error = put_nvlist(zc, props);

	nvlist_free(props);
	return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_src{_size} nvlist of delegated permissions
* zc_perm_action allow/unallow flag
*
* outputs: none
*/
static int
zfs_ioc_set_fsacl(zfs_cmd_t *zc)
{
	nvlist_t *acl = NULL;
	int error;

	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &acl);
	if (error != 0)
		return (error);

	/*
	 * Verify nvlist is constructed correctly; any verification failure
	 * is reported as EINVAL.
	 */
	if (zfs_deleg_verify_nvlist(acl) != 0) {
		nvlist_free(acl);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If we don't have PRIV_SYS_MOUNT, then validate
	 * that user is allowed to hand out each permission in
	 * the nvlist(s)
	 */
	error = secpolicy_zfs(CRED());
	if (error != 0) {
		error = (zc->zc_perm_action == B_FALSE) ?
		    dsl_deleg_can_allow(zc->zc_name, acl, CRED()) :
		    dsl_deleg_can_unallow(zc->zc_name, acl, CRED());
	}

	if (error == 0) {
		error = dsl_deleg_set(zc->zc_name, acl,
		    zc->zc_perm_action);
	}

	nvlist_free(acl);
	return (error);
}
/*
* inputs:
* zc_name name of filesystem
*
* outputs:
* zc_nvlist_dst{_size} nvlist of delegated permissions
*/
static int
zfs_ioc_get_fsacl(zfs_cmd_t *zc)
{
	nvlist_t *perms;
	int error;

	error = dsl_deleg_get(zc->zc_name, &perms);
	if (error != 0)
		return (error);

	/* Copy the list out to the caller, then drop our copy. */
	error = put_nvlist(zc, perms);
	nvlist_free(perms);
	return (error);
}
/* ARGSUSED */
/*
 * Creation callback for DMU_OST_ZFS objsets: arg is the zfs_creat_t set up
 * by the caller; its zplprops are forwarded to zfs_create_fs().
 */
static void
zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *creat = arg;
	nvlist_t *zplprops = creat->zct_zplprops;

	zfs_create_fs(os, cr, zplprops, tx);
}
/*
 * Sentinel for a uint64 property value that has not been chosen yet.  The
 * zplprops code below initializes its locals to it and compares against it
 * to decide whether a value still has to be fetched.
 */
#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
/*
* inputs:
* os parent objset pointer (NULL if root fs)
* fuids_ok fuids allowed in this version of the spa?
* sa_ok SAs allowed in this version of the spa?
* createprops list of properties requested by creator
*
* outputs:
* zplprops values for the zplprops we attach to the master node object
* is_ci true if requested file system will be purely case-insensitive
*
* Determine the settings for utf8only, normalization and
* casesensitivity. Specific values may have been requested by the
* creator and/or we can inherit values from the parent dataset. If
* the file system is of too early a vintage, a creator can not
* request settings for these properties, even if the requested
* setting is the default value. We don't actually want to create dsl
* properties for these, so remove them from the source nvlist after
* processing.
*/
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
    boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	uint64_t norm = ZFS_PROP_UNDEFINED;
	uint64_t u8 = ZFS_PROP_UNDEFINED;
	uint64_t sense = ZFS_PROP_UNDEFINED;

	ASSERT(zplprops != NULL);

	/* A parent objset, when given, must be a ZFS filesystem. */
	if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
		return (SET_ERROR(EINVAL));

	/*
	 * Pull out creator prop choices, if any.  The three name-handling
	 * properties are removed from createprops once read; the version
	 * is left in place.
	 */
	if (createprops != NULL) {
		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);

		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_NORMALIZE));

		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));

		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_CASE), &sense);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_CASE));
	}

	/*
	 * If the zpl version requested is whacky, or the file system or
	 * pool version is too "young" to support normalization and the
	 * creator tried to set a value for one of the props, error out.
	 */
	if (zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION)
		return (SET_ERROR(ENOTSUP));
	if (zplver >= ZPL_VERSION_FUID && !fuids_ok)
		return (SET_ERROR(ENOTSUP));
	if (zplver >= ZPL_VERSION_SA && !sa_ok)
		return (SET_ERROR(ENOTSUP));
	if (zplver < ZPL_VERSION_NORMALIZATION &&
	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
	    sense != ZFS_PROP_UNDEFINED))
		return (SET_ERROR(ENOTSUP));

	/*
	 * Put the version in the zplprops
	 */
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);

	/* Values the creator did not supply are fetched via os. */
	if (norm == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);

	/*
	 * If we're normalizing, names must always be valid UTF-8 strings.
	 */
	if (norm != 0)
		u8 = 1;
	if (u8 == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);

	if (sense == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);

	if (is_ci != NULL)
		*is_ci = (sense == ZFS_CASE_INSENSITIVE);

	return (0);
}
/*
 * Fill zplprops for a new dataset, taking unspecified values from its
 * parent.  The parent name is the dataset name up to its last '/'; the ZPL
 * version is derived from the pool's SPA version.
 */
static int
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	char parentname[ZFS_MAX_DATASET_NAME_LEN];
	objset_t *parent_os = NULL;
	boolean_t fuids_ok, sa_ok;
	uint64_t zplver;
	uint64_t spa_vers;
	char *slash;
	spa_t *spa;
	int error;

	/* Cut the name at its last '/' to get the parent's name. */
	(void) strlcpy(parentname, dataset, sizeof (parentname));
	slash = strrchr(parentname, '/');
	ASSERT(slash != NULL);
	*slash = '\0';

	error = spa_open(dataset, &spa, FTAG);
	if (error != 0)
		return (error);
	spa_vers = spa_version(spa);
	spa_close(spa, FTAG);

	zplver = zfs_zpl_version_map(spa_vers);
	fuids_ok = (zplver >= ZPL_VERSION_FUID);
	sa_ok = (zplver >= ZPL_VERSION_SA);

	/*
	 * Open parent object set so we can inherit zplprop values.
	 */
	error = dmu_objset_hold(parentname, FTAG, &parent_os);
	if (error != 0)
		return (error);

	error = zfs_fill_zplprops_impl(parent_os, zplver, fuids_ok, sa_ok,
	    createprops, zplprops, is_ci);

	dmu_objset_rele(parent_os, FTAG);
	return (error);
}
/*
 * Variant of zfs_fill_zplprops() for a root filesystem: there is no parent
 * objset, so NULL is passed down and the ZPL version comes from spa_vers.
 */
static int
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	uint64_t zplver = zfs_zpl_version_map(spa_vers);
	boolean_t fuids_ok = (zplver >= ZPL_VERSION_FUID);
	boolean_t sa_ok = (zplver >= ZPL_VERSION_SA);

	return (zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
	    createprops, zplprops, is_ci));
}
/*
* innvl: {
* "type" -> dmu_objset_type_t (int32)
* (optional) "props" -> { prop -> value }
* }
*
* outnvl: propname -> error code (int32)
*/
static int
zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
	zfs_creat_t zct = { 0 };
	nvlist_t *nvprops = NULL;
	boolean_t is_insensitive = B_FALSE;
	dmu_objset_type_t type;
	int32_t type32;
	int error = 0;

	if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
		return (SET_ERROR(EINVAL));
	type = type32;

	/* "props" is optional; nvprops stays NULL when it is absent. */
	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);

	/* Only filesystems and volumes have a creation callback. */
	if (type == DMU_OST_ZFS)
		cbfunc = zfs_create_cb;
	else if (type == DMU_OST_ZVOL)
		cbfunc = zvol_create_cb;
	else
		cbfunc = NULL;

	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
		return (SET_ERROR(EINVAL));

	zct.zct_props = nvprops;

	if (cbfunc == NULL)
		return (SET_ERROR(EINVAL));

	if (type == DMU_OST_ZVOL) {
		uint64_t volsize, volblocksize;

		if (nvprops == NULL)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nvprops,
		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
			return (SET_ERROR(EINVAL));

		/* A missing block size is fine; any other failure is not. */
		error = nvlist_lookup_uint64(nvprops,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize);
		if (error != 0 && error != ENOENT)
			return (SET_ERROR(EINVAL));

		if (error != 0) {
			volblocksize = zfs_prop_default_numeric(
			    ZFS_PROP_VOLBLOCKSIZE);
		}

		error = zvol_check_volblocksize(volblocksize);
		if (error != 0)
			return (error);

		error = zvol_check_volsize(volsize, volblocksize);
		if (error != 0)
			return (error);
	} else if (type == DMU_OST_ZFS) {
		/*
		 * We have to have normalization and
		 * case-folding flags correct when we do the
		 * file system creation, so go figure them out
		 * now.
		 */
		VERIFY(nvlist_alloc(&zct.zct_zplprops,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);

		error = zfs_fill_zplprops(fsname, nvprops,
		    zct.zct_zplprops, &is_insensitive);
		if (error != 0) {
			nvlist_free(zct.zct_zplprops);
			return (error);
		}
	}

	error = dmu_objset_create(fsname, type,
	    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
	nvlist_free(zct.zct_zplprops);

	/*
	 * It would be nice to do this atomically.  If the properties cannot
	 * be applied, the freshly created dataset is destroyed again.
	 */
	if (error == 0) {
		error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
		    nvprops, outnvl);
		if (error != 0)
			(void) dsl_destroy_head(fsname);
	}

#ifdef __FreeBSD__
	if (error == 0 && type == DMU_OST_ZVOL)
		zvol_create_minors(fsname);
#endif
	return (error);
}
/*
* innvl: {
* "origin" -> name of origin snapshot
* (optional) "props" -> { prop -> value }
* }
*
* outnvl: propname -> error code (int32)
*/
static int
zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvlist_t *nvprops = NULL;
	char *origin_name;
	int error = 0;

	if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
		return (SET_ERROR(EINVAL));

	/* "props" is optional. */
	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);

	if (strchr(fsname, '@') != NULL || strchr(fsname, '%') != NULL)
		return (SET_ERROR(EINVAL));

	if (dataset_namecheck(origin_name, NULL, NULL) != 0)
		return (SET_ERROR(EINVAL));

	error = dmu_objset_clone(fsname, origin_name);
	if (error != 0)
		return (error);

	/*
	 * It would be nice to do this atomically.  A clone whose properties
	 * cannot be applied is destroyed again.
	 */
	error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops,
	    outnvl);
	if (error != 0) {
		(void) dsl_destroy_head(fsname);
		return (error);
	}

#ifdef __FreeBSD__
	zvol_create_minors(fsname);
#endif
	return (error);
}
/* ARGSUSED */
/*
 * Remap indirect blocks of the named dataset.  Names containing '@' or '%'
 * are rejected with EINVAL; innvl and outnvl are not used.
 */
static int
zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	boolean_t badname;

	badname = (strchr(fsname, '@') != NULL ||
	    strchr(fsname, '%') != NULL);
	if (badname)
		return (SET_ERROR(EINVAL));

	return (dmu_objset_remap_indirects(fsname));
}
/*
* innvl: {
* "snaps" -> { snapshot1, snapshot2 }
* (optional) "props" -> { prop -> value (string) }
* }
*
* outnvl: snapshot -> error code (int32)
*/
static int
zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvlist_t *props = NULL;
	nvlist_t *snaps;
	nvpair_t *cur;
	int poollen;
	int error;

	/* "props" is optional. */
	(void) nvlist_lookup_nvlist(innvl, "props", &props);

	if (!nvlist_empty(props) &&
	    zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
		return (SET_ERROR(ENOTSUP));

	error = zfs_check_userprops(props);
	if (error != 0)
		return (error);

	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
		return (SET_ERROR(EINVAL));

	poollen = strlen(poolname);

	cur = NULL;
	while ((cur = nvlist_next_nvpair(snaps, cur)) != NULL) {
		const char *name = nvpair_name(cur);
		char *at = strchr(name, '@');
		nvpair_t *other;

		/*
		 * The snap name must contain an @, and the part after it must
		 * contain only valid characters.
		 */
		if (at == NULL)
			return (SET_ERROR(EINVAL));
		if (zfs_component_namecheck(at + 1, NULL, NULL) != 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The snap must be in the specified pool.
		 */
		if (strncmp(name, poolname, poollen) != 0)
			return (SET_ERROR(EXDEV));
		if (name[poollen] != '/' && name[poollen] != '@')
			return (SET_ERROR(EXDEV));

		/*
		 * Check for permission to set the properties on the fs.
		 * The name is cut at the '@' for the duration of the check
		 * and restored right after.
		 */
		if (!nvlist_empty(props)) {
			*at = '\0';
			error = zfs_secpolicy_write_perms(name,
			    ZFS_DELEG_PERM_USERPROP, CRED());
			*at = '@';
			if (error != 0)
				return (error);
		}

		/*
		 * This must be the only snap of this fs: no later entry may
		 * share the prefix up to and including the '@'.
		 */
		other = cur;
		while ((other = nvlist_next_nvpair(snaps, other)) != NULL) {
			if (strncmp(name, nvpair_name(other),
			    at - name + 1) == 0)
				return (SET_ERROR(EXDEV));
		}
	}

	error = dsl_dataset_snapshot(snaps, props, outnvl);
	return (error);
}
/*
* innvl: "message" -> string
*/
/* ARGSUSED */
static int
zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
{
	char *message;
	spa_t *spa;
	int error;
	char *poolname;

	/*
	 * The poolname in the ioctl is not set, we get it from the TSD,
	 * which was set at the end of the last successful ioctl that allows
	 * logging. The secpolicy func already checked that it is set.
	 * Only one log ioctl is allowed after each successful ioctl, so
	 * we clear the TSD here.
	 */
	poolname = tsd_get(zfs_allow_log_key);

	/*
	 * Defensive: do not hand a NULL name to spa_open()/strfree() should
	 * the TSD turn out to be empty despite the secpolicy check.
	 */
	if (poolname == NULL)
		return (SET_ERROR(EINVAL));

	(void) tsd_set(zfs_allow_log_key, NULL);

	/* The name is only needed to open the pool; free it right away. */
	error = spa_open(poolname, &spa, FTAG);
	strfree(poolname);
	if (error != 0)
		return (error);

	if (nvlist_lookup_string(innvl, "message", &message) != 0) {
		spa_close(spa, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
		spa_close(spa, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	error = spa_history_log(spa, message);
	spa_close(spa, FTAG);
	return (error);
}
#ifdef __FreeBSD__
/*
 * Write a "command" string to the pad2 area of a vdev label.
 *
 * innvl must carry ZPOOL_CONFIG_POOL_GUID and ZPOOL_CONFIG_GUID (uint64)
 * and "command" (string); a missing or mistyped entry yields EINVAL.
 * Returns ENOENT when no pool matches the guids, ENODEV when the vdev is
 * not found in the opened pool, and otherwise the result of
 * vdev_label_write_pad2().  The first argument and outnvl are not used.
 */
static int
zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
{
	char name[MAXNAMELEN];
	spa_t *spa;
	vdev_t *vd;
	char *command;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	int error;

	if (nvlist_lookup_uint64(innvl,
	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		return (SET_ERROR(EINVAL));
	if (nvlist_lookup_uint64(innvl,
	    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
		return (SET_ERROR(EINVAL));
	if (nvlist_lookup_string(innvl,
	    "command", &command) != 0)
		return (SET_ERROR(EINVAL));

	/*
	 * Find the pool under the namespace lock and remember its name so
	 * that it can be opened properly after the lock is dropped.  The
	 * copy is bounded by the size of the local buffer.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_by_guid(pool_guid, vdev_guid);
	if (spa != NULL)
		(void) strlcpy(name, spa_name(spa), sizeof (name));
	mutex_exit(&spa_namespace_lock);
	if (spa == NULL)
		return (SET_ERROR(ENOENT));

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);

	spa_vdev_state_enter(spa, SCL_ALL);
	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
	if (vd == NULL) {
		/*
		 * NOTE(review): ENXIO is passed to spa_vdev_state_exit()
		 * while ENODEV is returned to the caller; kept as found.
		 */
		(void) spa_vdev_state_exit(spa, NULL, ENXIO);
		spa_close(spa, FTAG);
		return (SET_ERROR(ENODEV));
	}

	error = vdev_label_write_pad2(vd, command, strlen(command));
	(void) spa_vdev_state_exit(spa, NULL, 0);

	/* Wait for a sync before dropping the pool reference. */
	txg_wait_synced(spa->spa_dsl_pool, 0);
	spa_close(spa, FTAG);
	return (error);
}
#endif
/*
* The dp_config_rwlock must not be held when calling this, because the
* unmount may need to write out data.
*
* This function is best-effort. Callers must deal gracefully if it
* remains mounted (or is remounted after this call).
*
* Nothing is returned. The call does nothing if the argument is not a
* snapshot name or no mounted filesystem is found for it; otherwise a forced
* unmount is attempted and its result is ignored.
*/
void
zfs_unmount_snap(const char *snapname)
{
vfs_t *vfsp = NULL;
zfsvfs_t *zfsvfs = NULL;
/* Only names containing '@' (snapshots) are handled. */
if (strchr(snapname, '@') == NULL)
return;
/*
 * A failing getzfsvfs() means there is no zfsvfs to work with for this
 * name (presumably the snapshot is not mounted), so there is nothing to do.
 */
int err = getzfsvfs(snapname, &zfsvfs);
if (err != 0) {
ASSERT3P(zfsvfs, ==, NULL);
return;
}
vfsp = zfsvfs->z_vfs;
/* See the function header: the pool config lock must not be held here. */
ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
#ifdef illumos
err = vn_vfswlock(vfsp->vfs_vnodecovered);
VFS_RELE(vfsp);
if (err != 0)
return;
#endif
/*
* Always force the unmount for snapshots.
*/
#ifdef illumos
(void) dounmount(vfsp, MS_FORCE, kcred);
#else
/*
 * NOTE(review): getzfsvfs() presumably returns with the vfs busied; the
 * busy hold is traded for a plain reference before unmounting -- confirm.
 */
vfs_ref(vfsp);
vfs_unbusy(vfsp);
(void) dounmount(vfsp, MS_FORCE, curthread);
#endif
}
/* ARGSUSED */
/*
 * Callback wrapper around zfs_unmount_snap(); arg is unused and the
 * callback always reports success.
 */
static int
zfs_unmount_snap_cb(const char *snapname, void *arg)
{
	const int rc = 0;

	zfs_unmount_snap(snapname);
	return (rc);
}
/*
* When a clone is destroyed, its origin may also need to be destroyed,
* in which case it must be unmounted. This routine will do that unmount
* if necessary.
*/
void
zfs_destroy_unmount_origin(const char *fsname)
{
int error;
objset_t *os;
dsl_dataset_t *ds;
error = dmu_objset_hold(fsname, FTAG, &os);
if (error != 0)
return;
ds = dmu_objset_ds(os);
if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
char originname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds->ds_prev, originname);
dmu_objset_rele(os, FTAG);
zfs_unmount_snap(originname);
} else {
dmu_objset_rele(os, FTAG);
}
}
/*
* innvl: {
* "snaps" -> { snapshot1, snapshot2 }
* (optional boolean) "defer"
* }
*
* outnvl: snapshot -> error code (int32)
*
*/
/* ARGSUSED */
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvlist_t *snaps;
	nvpair_t *elem = NULL;
	boolean_t defer;
	int poollen;

	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
		return (SET_ERROR(EINVAL));

	/* Mere presence of "defer" requests a deferred destroy. */
	defer = nvlist_exists(innvl, "defer");
	poollen = strlen(poolname);

	while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
		const char *name = nvpair_name(elem);
		boolean_t in_pool;

		/*
		 * The snap must be in the specified pool to prevent the
		 * invalid removal of zvol minors below.
		 */
		in_pool = (strncmp(name, poolname, poollen) == 0 &&
		    (name[poollen] == '/' || name[poollen] == '@'));
		if (!in_pool)
			return (SET_ERROR(EXDEV));

		zfs_unmount_snap(name);
#if defined(__FreeBSD__)
		zvol_remove_minors(name);
#endif
	}

	return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
}
/*
* Create bookmarks. Bookmark names are of the form <fs>#<bmark>.
* All bookmarks must be in the same pool.
*
* innvl: {
* bookmark1 -> snapshot1, bookmark2 -> snapshot2
* }
*
* outnvl: bookmark -> error code (int32)
*
*/
/* ARGSUSED */
static int
zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvpair_t *cur = NULL;

	while ((cur = nvlist_next_nvpair(innvl, cur)) != NULL) {
		const char *bmark = nvpair_name(cur);
		nvpair_t *later = cur;
		char *snap_name;

		/*
		 * Verify the snapshot argument: the value must be a string.
		 */
		if (nvpair_value_string(cur, &snap_name) != 0)
			return (SET_ERROR(EINVAL));

		/* Verify that the keys (bookmarks) are unique */
		while ((later = nvlist_next_nvpair(innvl, later)) != NULL) {
			if (strcmp(bmark, nvpair_name(later)) == 0)
				return (SET_ERROR(EINVAL));
		}
	}

	return (dsl_bookmark_create(innvl, outnvl));
}
/*
* innvl: {
* property 1, property 2, ...
* }
*
* outnvl: {
* bookmark name 1 -> { property 1, property 2, ... },
* bookmark name 2 -> { property 1, property 2, ... }
* }
*
*/
static int
zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	/* Thin wrapper: everything is delegated to the DSL layer. */
	int error = dsl_get_bookmarks(fsname, innvl, outnvl);

	return (error);
}
/*
* innvl: {
* bookmark name 1, bookmark name 2
* }
*
* outnvl: bookmark -> error code (int32)
*
*/
static int
zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
    nvlist_t *outnvl)
{
	nvpair_t *elem = NULL;
	int poollen = strlen(poolname);

	while ((elem = nvlist_next_nvpair(innvl, elem)) != NULL) {
		const char *name = nvpair_name(elem);
		const char *hash = strchr(name, '#');
		boolean_t in_pool;

		/*
		 * The bookmark name must contain an #, and the part after it
		 * must contain only valid characters.
		 */
		if (hash == NULL)
			return (SET_ERROR(EINVAL));
		if (zfs_component_namecheck(hash + 1, NULL, NULL) != 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The bookmark must be in the specified pool.
		 */
		in_pool = (strncmp(name, poolname, poollen) == 0 &&
		    (name[poollen] == '/' || name[poollen] == '#'));
		if (!in_pool)
			return (SET_ERROR(EXDEV));
	}

	return (dsl_bookmark_destroy(innvl, outnvl));
}
/*
 * Run a channel program on the named pool.
 *
 * innvl must contain ZCP_ARG_PROGRAM (string) and ZCP_ARG_ARGLIST.
 * ZCP_ARG_SYNC, ZCP_ARG_INSTRLIMIT and ZCP_ARG_MEMLIMIT are optional and
 * default to B_TRUE, ZCP_DEFAULT_INSTRLIMIT and ZCP_DEFAULT_MEMLIMIT.
 * Returns EINVAL when a required entry is missing or a limit is zero or
 * above its zfs_lua_max_* tunable; otherwise the result of zcp_eval().
 */
static int
zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
    nvlist_t *outnvl)
{
	char *program;
	uint64_t instrlimit, memlimit;
	boolean_t sync_flag;
	nvpair_t *nvarg = NULL;

	if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) {
		return (SET_ERROR(EINVAL));
	}
	if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
		sync_flag = B_TRUE;
	}
	if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
		instrlimit = ZCP_DEFAULT_INSTRLIMIT;
	}
	if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
		memlimit = ZCP_DEFAULT_MEMLIMIT;
	}
	if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) {
		return (SET_ERROR(EINVAL));
	}

	/* Error returns go through SET_ERROR like the rest of this file. */
	if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
		return (SET_ERROR(EINVAL));
	if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
		return (SET_ERROR(EINVAL));

	return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
	    nvarg, outnvl));
}
/*
* innvl: unused
* outnvl: empty
*/
/* ARGSUSED */
static int
zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	/* Neither nvlist is consulted; the pool name is all that is needed. */
	int error = spa_checkpoint(poolname);

	return (error);
}
/*
* innvl: unused
* outnvl: empty
*/
/* ARGSUSED */
static int
zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
    nvlist_t *outnvl)
{
	/* Neither nvlist is consulted; the pool name is all that is needed. */
	int error = spa_checkpoint_discard(poolname);

	return (error);
}
/*
* inputs:
* zc_name name of dataset to destroy
* zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
	dmu_objset_type_t ost;
	objset_t *os;
	int err;

	/* Hold the objset just long enough to learn its type. */
	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (err != 0)
		return (err);
	ost = dmu_objset_type(os);
	dmu_objset_rele(os, FTAG);

	if (ost == DMU_OST_ZFS)
		zfs_unmount_snap(zc->zc_name);

	/* A name with '@' is a snapshot; anything else is a head dataset. */
	if (strchr(zc->zc_name, '@') != NULL) {
		err = dsl_destroy_snapshot(zc->zc_name,
		    zc->zc_defer_destroy);
	} else {
		err = dsl_destroy_head(zc->zc_name);
	}

	/* Device nodes are removed only after a successful destroy. */
	if (ost == DMU_OST_ZVOL && err == 0) {
#ifdef __FreeBSD__
		zvol_remove_minors(zc->zc_name);
#else
		(void) zvol_remove_minor(zc->zc_name);
#endif
	}

	return (err);
}
/*
* innvl: {
* vdevs: {
* guid 1, guid 2, ...
* },
* func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
* }
*
* outnvl: {
* [func: EINVAL (if provided command type didn't make sense)],
* [vdevs: {
* guid1: errno, (see function body for possible errnos)
* ...
* }]
* }
*
*/
static int
zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	spa_t *spa;
	nvlist_t *vdev_guids;
	nvlist_t *vdev_errlist;
	uint64_t cmd_type;
	int total_errors = 0;
	int error;

	error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
	    &cmd_type) != 0) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
	    cmd_type == POOL_INITIALIZE_DO ||
	    cmd_type == POOL_INITIALIZE_SUSPEND)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
	    &vdev_guids) != 0) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * The vdev list comes from the caller, so check that every entry
	 * really is a uint64 before touching any vdev.  This lets the
	 * fnvpair_value_uint64() call below (which asserts on a type
	 * mismatch) be used safely, and rejects a malformed request before
	 * it has been partially applied.
	 */
	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
		uint64_t guid;

		if (nvpair_value_uint64(pair, &guid) != 0) {
			error = SET_ERROR(EINVAL);
			goto out;
		}
	}

	/* Apply the command per vdev, collecting failures by guid. */
	vdev_errlist = fnvlist_alloc();
	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
		uint64_t vdev_guid = fnvpair_value_uint64(pair);

		error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
		if (error != 0) {
			char guid_as_str[MAXNAMELEN];

			(void) snprintf(guid_as_str, sizeof (guid_as_str),
			    "%llu", (unsigned long long)vdev_guid);
			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
			total_errors++;
		}
	}

	if (fnvlist_size(vdev_errlist) > 0) {
		fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
		    vdev_errlist);
	}
	fnvlist_free(vdev_errlist);

	/* Per-vdev errnos are in outnvl; the ioctl itself reports EINVAL. */
	error = (total_errors > 0 ? EINVAL : 0);
out:
	spa_close(spa, FTAG);
	return (error);
}
/*
* fsname is name of dataset to rollback (to most recent snapshot)
*
* innvl may contain name of expected target snapshot
*
* outnvl: {
*     "target" -> name of most recent snapshot
* }
*/
/* ARGSUSED */
static int
zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	char *target = NULL;
	zfsvfs_t *zfsvfs;
	dsl_dataset_t *ds;
	int error;

	/* "target" is optional; target stays NULL when it is absent. */
	(void) nvlist_lookup_string(innvl, "target", &target);
	if (target != NULL) {
		const char *at = strchr(target, '@');

		/*
		 * The snap name must contain an @, and the part after it must
		 * contain only valid characters.
		 */
		if (at == NULL)
			return (SET_ERROR(EINVAL));
		if (zfs_component_namecheck(at + 1, NULL, NULL) != 0)
			return (SET_ERROR(EINVAL));
	}

	/* No zfsvfs for this name: roll back without suspending anything. */
	if (getzfsvfs(fsname, &zfsvfs) != 0)
		return (dsl_dataset_rollback(fsname, target, NULL, outnvl));

	ds = dmu_objset_ds(zfsvfs->z_os);

	error = zfs_suspend_fs(zfsvfs);
	if (error == 0) {
		int resume_err;

		error = dsl_dataset_rollback(fsname, target, zfsvfs,
		    outnvl);
		/* Always resume; a rollback error wins over a resume error. */
		resume_err = zfs_resume_fs(zfsvfs, ds);
		if (error == 0)
			error = resume_err;
	}
#ifdef illumos
	VFS_RELE(zfsvfs->z_vfs);
#else
	vfs_unbusy(zfsvfs->z_vfs);
#endif
	return (error);
}
/*
 * Dataset-walk callback: arg is a snapshot name; unmount "<fsname>@<arg>".
 * Always returns 0.
 */
static int
recursive_unmount(const char *fsname, void *arg)
{
	char fullname[ZFS_MAX_DATASET_NAME_LEN];
	const char *snap = arg;

	(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snap);
	zfs_unmount_snap(fullname);

	return (0);
}
/*
 * Rename a dataset, snapshot or bookmark.
 *
 * inputs:
 * zc_name	old name of dataset or bookmark
 * zc_value	new name of dataset or bookmark
 * zc_cookie	bit 0: recursive flag (only valid for snapshots)
 *		bit 1: (FreeBSD only) leave mounted snapshots mounted
 *
 * outputs:	none
 *
 * Bookmarks and snapshots may only be renamed within their own dataset;
 * attempting to move one elsewhere fails with EXDEV.
 */
static int
zfs_ioc_rename(zfs_cmd_t *zc)
{
	objset_t *os;
	dmu_objset_type_t ost;
	boolean_t recursive = zc->zc_cookie & 1;
	boolean_t allow_mounted = B_TRUE;
	char *old_sep, *new_sep;
	int err;

#ifdef __FreeBSD__
	allow_mounted = (zc->zc_cookie & 2) != 0;
#endif

	/* Force termination of both user-supplied names. */
	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';

	old_sep = strchr(zc->zc_name, '#');
	if (old_sep != NULL) {
		/* Bookmark rename: the new name must be a bookmark too. */
		new_sep = strchr(zc->zc_value, '#');
		if (new_sep == NULL)
			return (SET_ERROR(EINVAL));

		/* Recursive flag is not supported yet. */
		if (recursive)
			return (SET_ERROR(ENOTSUP));

		/*
		 * Temporarily cut both names at the '#' so the dataset
		 * parts can be compared; the separators are put back
		 * before returning.
		 */
		*old_sep = '\0';
		*new_sep = '\0';
		if (strcmp(zc->zc_name, zc->zc_value) != 0) {
			err = SET_ERROR(EXDEV);
		} else {
			err = dsl_bookmark_rename(zc->zc_name,
			    old_sep + 1, new_sep + 1);
		}
		*old_sep = '#';
		*new_sep = '#';
		return (err);
	}

	/* "zfs rename" from and to ...%recv datasets should both fail */
	if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
	    dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
	    strchr(zc->zc_name, '%') != NULL ||
	    strchr(zc->zc_value, '%') != NULL)
		return (SET_ERROR(EINVAL));

	/* Look up the objset type, then drop the hold again. */
	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (err != 0)
		return (err);
	ost = dmu_objset_type(os);
	dmu_objset_rele(os, FTAG);

	old_sep = strchr(zc->zc_name, '@');
	if (old_sep == NULL) {
		/* Plain dataset rename. */
#ifdef illumos
		if (ost == DMU_OST_ZVOL)
			(void) zvol_remove_minor(zc->zc_name);
#endif
		return (dsl_dir_rename(zc->zc_name, zc->zc_value));
	}

	/* Snapshot rename: the new name must be a snapshot too. */
	new_sep = strchr(zc->zc_value, '@');
	if (new_sep == NULL)
		return (SET_ERROR(EINVAL));

	*old_sep = '\0';
	*new_sep = '\0';
	if (strcmp(zc->zc_name, zc->zc_value) != 0) {
		/* Snapshots must stay in the same fs. */
		err = SET_ERROR(EXDEV);
	} else {
		/* err is still 0 here from the successful hold above. */
		if (ost == DMU_OST_ZFS && !allow_mounted) {
			err = dmu_objset_find(zc->zc_name,
			    recursive_unmount, old_sep + 1,
			    recursive ? DS_FIND_CHILDREN : 0);
		}
		if (err == 0) {
			err = dsl_dataset_rename_snapshot(zc->zc_name,
			    old_sep + 1, new_sep + 1, recursive);
		}
	}
	*old_sep = '@';
	*new_sep = '@';
	return (err);
}
/*
 * Decide whether the property carried by 'pair' may be set on 'dsname' by
 * the holder of credentials 'cr'.
 *
 * User properties and {user,group}quota@... properties only need the
 * matching delegated write permission.  Native properties are rejected on
 * snapshots, checked against the pool version / enabled features where the
 * requested value depends on them, and finally handed to
 * zfs_secpolicy_setprop().
 *
 * Returns 0 if the property may be set, otherwise an errno:
 *	EINVAL	unknown or read-only property, any native property on a
 *		snapshot, or a checksum/dedup value that is not a uint64
 *	ENOTSUP	the pool version or feature set does not support the value
 *	ERANGE	the value is out of range, or unusable on a bootfs/root pool
 * or whatever error spa_open() or the permission checks return.
 */
static int
zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
{
	const char *propname = nvpair_name(pair);
	boolean_t issnap = (strchr(dsname, '@') != NULL);
	zfs_prop_t prop = zfs_name_to_prop(propname);
	uint64_t intval;
	int err;

	if (prop == ZPROP_INVAL) {
		if (zfs_prop_user(propname)) {
			return (zfs_secpolicy_write_perms(dsname,
			    ZFS_DELEG_PERM_USERPROP, cr));
		}

		if (!issnap && zfs_prop_userquota(propname)) {
			const char *perm = NULL;
			const char *uq_prefix =
			    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
			const char *gq_prefix =
			    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];

			if (strncmp(propname, uq_prefix,
			    strlen(uq_prefix)) == 0) {
				perm = ZFS_DELEG_PERM_USERQUOTA;
			} else if (strncmp(propname, gq_prefix,
			    strlen(gq_prefix)) == 0) {
				perm = ZFS_DELEG_PERM_GROUPQUOTA;
			} else {
				/* USERUSED and GROUPUSED are read-only */
				return (SET_ERROR(EINVAL));
			}

			return (zfs_secpolicy_write_perms(dsname, perm, cr));
		}

		return (SET_ERROR(EINVAL));
	}

	if (issnap)
		return (SET_ERROR(EINVAL));

	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		/*
		 * dsl_prop_get_all_impl() returns properties in this
		 * format.
		 */
		nvlist_t *attrs;
		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) == 0);
	}

	/*
	 * Check that this value is valid for this pool version
	 */
	switch (prop) {
	case ZFS_PROP_COMPRESSION:
		/*
		 * If the user specified gzip compression, make sure
		 * the SPA supports it. We ignore any errors here since
		 * we'll catch them later.
		 */
		if (nvpair_value_uint64(pair, &intval) == 0) {
			if (intval >= ZIO_COMPRESS_GZIP_1 &&
			    intval <= ZIO_COMPRESS_GZIP_9 &&
			    zfs_earlier_version(dsname,
			    SPA_VERSION_GZIP_COMPRESSION)) {
				return (SET_ERROR(ENOTSUP));
			}

			if (intval == ZIO_COMPRESS_ZLE &&
			    zfs_earlier_version(dsname,
			    SPA_VERSION_ZLE_COMPRESSION))
				return (SET_ERROR(ENOTSUP));

			if (intval == ZIO_COMPRESS_LZ4) {
				spa_t *spa;

				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
					return (err);

				if (!spa_feature_is_enabled(spa,
				    SPA_FEATURE_LZ4_COMPRESS)) {
					spa_close(spa, FTAG);
					return (SET_ERROR(ENOTSUP));
				}
				spa_close(spa, FTAG);
			}

			/*
			 * If this is a bootable dataset then
			 * verify that the compression algorithm
			 * is supported for booting. We must return
			 * something other than ENOTSUP since it
			 * implies a downrev pool version.
			 */
			if (zfs_is_bootfs(dsname) &&
			    !BOOTFS_COMPRESS_VALID(intval)) {
				return (SET_ERROR(ERANGE));
			}
		}
		break;

	case ZFS_PROP_COPIES:
		if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
			return (SET_ERROR(ENOTSUP));
		break;

	case ZFS_PROP_RECORDSIZE:
		/* Record sizes above 128k need the feature to be enabled */
		if (nvpair_value_uint64(pair, &intval) == 0 &&
		    intval > SPA_OLD_MAXBLOCKSIZE) {
			spa_t *spa;

			/*
			 * We don't allow setting the property above 1MB,
			 * unless the tunable has been changed.
			 */
			if (intval > zfs_max_recordsize ||
			    intval > SPA_MAXBLOCKSIZE)
				return (SET_ERROR(ERANGE));

			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
				return (err);

			if (!spa_feature_is_enabled(spa,
			    SPA_FEATURE_LARGE_BLOCKS)) {
				spa_close(spa, FTAG);
				return (SET_ERROR(ENOTSUP));
			}
			spa_close(spa, FTAG);
		}
		break;

	case ZFS_PROP_DNODESIZE:
		/* Dnode sizes above 512 need the feature to be enabled */
		if (nvpair_value_uint64(pair, &intval) == 0 &&
		    intval != ZFS_DNSIZE_LEGACY) {
			spa_t *spa;

			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
				return (err);

			if (!spa_feature_is_enabled(spa,
			    SPA_FEATURE_LARGE_DNODE)) {
				spa_close(spa, FTAG);
				return (SET_ERROR(ENOTSUP));
			}
			spa_close(spa, FTAG);
		}
		break;

	case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
		/*
		 * This property could require the allocation classes
		 * feature to be active for setting, however we allow
		 * it so that tests of settable properties succeed.
		 * The CLI will issue a warning in this case.
		 */
		break;

	case ZFS_PROP_SHARESMB:
		if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
			return (SET_ERROR(ENOTSUP));
		break;

	case ZFS_PROP_ACLINHERIT:
		if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
		    nvpair_value_uint64(pair, &intval) == 0) {
			if (intval == ZFS_ACL_PASSTHROUGH_X &&
			    zfs_earlier_version(dsname,
			    SPA_VERSION_PASSTHROUGH_X))
				return (SET_ERROR(ENOTSUP));
		}
		break;

	case ZFS_PROP_CHECKSUM:
	case ZFS_PROP_DEDUP:
	{
		spa_feature_t feature;
		spa_t *spa;

		/* dedup feature version checks */
		if (prop == ZFS_PROP_DEDUP &&
		    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
			return (SET_ERROR(ENOTSUP));

		if (nvpair_value_uint64(pair, &intval) != 0)
			return (SET_ERROR(EINVAL));

		/* check prop value is enabled in features */
		feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
		if (feature == SPA_FEATURE_NONE)
			break;

		if ((err = spa_open(dsname, &spa, FTAG)) != 0)
			return (err);
		/*
		 * Salted checksums are not supported on root pools.
		 */
		if (spa_bootfs(spa) != 0 &&
		    intval < ZIO_CHECKSUM_FUNCTIONS &&
		    (zio_checksum_table[intval].ci_flags &
		    ZCHECKSUM_FLAG_SALTED)) {
			spa_close(spa, FTAG);
			return (SET_ERROR(ERANGE));
		}
		if (!spa_feature_is_enabled(spa, feature)) {
			spa_close(spa, FTAG);
			return (SET_ERROR(ENOTSUP));
		}
		spa_close(spa, FTAG);
		break;
	}

	default:
		/* No version/feature requirement for other properties. */
		break;
	}

	/*
	 * Use the credentials we were handed, as the delegated-permission
	 * checks above already do, rather than re-reading CRED().
	 */
	return (zfs_secpolicy_setprop(dsname, prop, pair, cr));
}
/*
 * Sync task check callback for zfs_prop_activate_feature().
 *
 * Guards against a race that would increment a feature flag more than
 * once: returns EBUSY if the feature pointed to by 'arg' is already active
 * on the pool, 0 otherwise.
 */
static int
zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
{
	spa_feature_t *featurep = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (spa_feature_is_active(spa, *featurep))
		return (SET_ERROR(EBUSY));

	return (0);
}
/*
 * Sync task callback for zfs_prop_activate_feature(): bumps the count of
 * the feature pointed to by 'arg' on the pool that 'tx' belongs to.
 */
static void
zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
	spa_feature_t *featurep = arg;

	spa_feature_incr(dmu_tx_pool(tx)->dp_spa, *featurep, tx);
}
/*
 * Activate 'feature' on pool 'spa' in response to a property being set.
 *
 * This dispatches a sync task that marks the feature active.  Finding the
 * feature already active is not an error.  Returns 0 on success or the
 * sync task's errno.
 */
static int
zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
{
	int err = dsl_sync_task(spa_name(spa),
	    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
	    &feature, 2, ZFS_SPACE_CHECK_RESERVED);

	/* EBUSY from the check callback means "already active". */
	return (err == EBUSY ? 0 : err);
}
/*
 * Removes properties from the given props list that fail permission checks
 * needed to clear them and to restore them in case of a receive error. For each
 * property, make sure we have both set and inherit permissions.
 *
 * Returns the first error encountered if any permission checks fail. If the
 * caller provides a non-NULL errlist, it also gives the complete list of names
 * of all the properties that failed a permission check along with the
 * corresponding error numbers. The caller is responsible for freeing the
 * returned errlist.
 *
 * If every property checks out successfully (including the trivial case of a
 * NULL props list), zero is returned and the list pointed at by errlist is
 * NULL.
 */
static int
zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
{
	zfs_cmd_t *zc;
	nvpair_t *pair, *next_pair;
	nvlist_t *errors;
	int err, rv = 0;

	if (props == NULL) {
		/*
		 * Honor the documented contract on this path too, so the
		 * caller never sees an uninitialized list pointer.
		 */
		if (errlist != NULL)
			*errlist = NULL;
		return (0);
	}

	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	/* Scratch zfs_cmd_t for zfs_secpolicy_inherit_prop(). */
	zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
	(void) strcpy(zc->zc_name, dataset);

	pair = nvlist_next_nvpair(props, NULL);
	while (pair != NULL) {
		/* Fetch the successor first: 'pair' may be removed below. */
		next_pair = nvlist_next_nvpair(props, pair);

		(void) strcpy(zc->zc_value, nvpair_name(pair));
		if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
		    (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
			VERIFY(nvlist_remove_nvpair(props, pair) == 0);
			VERIFY(nvlist_add_int32(errors,
			    zc->zc_value, err) == 0);
		}
		pair = next_pair;
	}
	kmem_free(zc, sizeof (zfs_cmd_t));

	if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
		/* Nothing failed: report "no error list". */
		nvlist_free(errors);
		errors = NULL;
	} else {
		/* The return value is the first recorded error. */
		VERIFY(nvpair_value_int32(pair, &rv) == 0);
	}

	if (errlist == NULL)
		nvlist_free(errors);
	else
		*errlist = errors;

	return (rv);
}
/*
 * Compare the values of two property nvpairs.
 *
 * Either pair may be in the nested dsl_prop_get_all_impl() format, in which
 * case its ZPROP_VALUE member is what gets compared.  Pairs of different
 * types are never equal; string pairs are compared with strcmp(), anything
 * else is read as a uint64.
 */
static boolean_t
propval_equals(nvpair_t *p1, nvpair_t *p2)
{
	nvlist_t *attrs;

	/* Unwrap the dsl_prop_get_all_impl() format, if present. */
	if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
		VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0);
	}
	if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
		VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0);
	}

	if (nvpair_type(p1) != nvpair_type(p2))
		return (B_FALSE);

	if (nvpair_type(p1) != DATA_TYPE_STRING) {
		uint64_t lhs, rhs;

		VERIFY(nvpair_value_uint64(p1, &lhs) == 0);
		VERIFY(nvpair_value_uint64(p2, &rhs) == 0);
		return (lhs == rhs);
	} else {
		char *lhs, *rhs;

		VERIFY(nvpair_value_string(p1, &lhs) == 0);
		VERIFY(nvpair_value_string(p2, &rhs) == 0);
		return (strcmp(lhs, rhs) == 0);
	}
}
/*
 * Drop from 'props' every property whose value already matches the one in
 * 'origprops', and drop it from 'origprops' as well: a property that will
 * not change needs neither to be received nor to be cleared or restored.
 *
 * A NULL 'origprops' leaves 'props' untouched.
 */
static void
props_reduce(nvlist_t *props, nvlist_t *origprops)
{
	nvpair_t *cur, *following;

	if (origprops == NULL)
		return; /* all props need to be received */

	for (cur = nvlist_next_nvpair(props, NULL); cur != NULL;
	    cur = following) {
		const char *propname = nvpair_name(cur);
		nvpair_t *match;

		/* Grab the successor before 'cur' can be removed. */
		following = nvlist_next_nvpair(props, cur);

		if (nvlist_lookup_nvpair(origprops, propname, &match) == 0 &&
		    propval_equals(cur, match)) {
			/* don't clear the existing received value */
			(void) nvlist_remove_nvpair(origprops, match);
			/* don't bother receiving the property */
			(void) nvlist_remove_nvpair(props, cur);
		}
		/* otherwise: the received value still has to be set */
	}
}
/*
 * Pull out of 'props' the properties that must not be set BEFORE a dataset
 * has been received, and return them in a newly allocated nvlist (or NULL
 * if there are none).
 *
 * refquota is the example: in a replication stream an older snapshot may
 * exceed the refquota.  Setting refquota before the receive would set the
 * dsl's ACTUAL quota and make the receive of that older snapshot fail with
 * EDQUOT, even though we do want to receive it.
 *
 * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
 *
 * libzfs will need to be judicious handling errors encountered by props
 * extracted by this function.
 */
static nvlist_t *
extract_delay_props(nvlist_t *props)
{
	static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
	nvlist_t *delayprops;
	nvpair_t *nvp, *prev;
	int i;

	VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
	    nvp = nvlist_next_nvpair(props, nvp)) {
		boolean_t delay = B_FALSE;

		/*
		 * strcmp() is safe because zfs_prop_to_name() always returns
		 * a bounded string.
		 */
		for (i = 0; delayable[i] != 0; i++) {
			if (strcmp(zfs_prop_to_name(delayable[i]),
			    nvpair_name(nvp)) == 0) {
				delay = B_TRUE;
				break;
			}
		}
		if (!delay)
			continue;

		/*
		 * Move the pair over to delayprops.  Step back to its
		 * predecessor first so the loop's "next" call resumes
		 * from a pair that is still on the list.
		 */
		prev = nvlist_prev_nvpair(props, nvp);
		VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
		VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
		nvp = prev;
	}

	if (nvlist_empty(delayprops)) {
		nvlist_free(delayprops);
		delayprops = NULL;
	}
	return (delayprops);
}
#ifdef DEBUG
/*
 * Debug-only fault injection for zfs_ioc_recv(): when set, the ioctl resets
 * the flag to B_FALSE and forces its result to error 1, which exercises the
 * "restore the original props" path.
 */
static boolean_t zfs_ioc_recv_inject_err;
#endif
/*
 * Receive a send stream from a file descriptor into a snapshot, applying
 * the properties sent along with it.
 *
 * inputs:
 * zc_name name of containing filesystem
 * zc_nvlist_src{_size} nvlist of properties to apply
 * zc_value name of snapshot to create
 * zc_string name of clone origin (if DRR_FLAG_CLONE)
 * zc_cookie file descriptor to recv from
 * zc_begin_record the BEGIN record of the stream (not byteswapped)
 * zc_guid force flag
 * zc_cleanup_fd cleanup-on-exit file descriptor
 * zc_action_handle handle for this guid/ds mapping (or zero on first call)
 * zc_resumable if data is incomplete assume sender will resume
 *
 * outputs:
 * zc_cookie number of bytes read
 * zc_nvlist_dst{_size} error for each unapplied received property
 * zc_obj zprop_errflags_t
 * zc_action_handle handle for this guid/ds mapping
 *
 * Returns the receive error if there was one, otherwise the error (if any)
 * from marking the dataset as having received properties or from reporting
 * the per-property errors back to the caller.
 */
static int
zfs_ioc_recv(zfs_cmd_t *zc)
{
file_t *fp;
dmu_recv_cookie_t drc;
boolean_t force = (boolean_t)zc->zc_guid;
int fd;
int error = 0;
int props_error = 0;
nvlist_t *errors;
offset_t off;
nvlist_t *props = NULL; /* sent properties */
nvlist_t *origprops = NULL; /* existing properties */
nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
char *origin = NULL;
char *tosnap;
char tofs[ZFS_MAX_DATASET_NAME_LEN];
boolean_t first_recvd_props = B_FALSE;
/* The target must be a valid snapshot name and must not contain '%'. */
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '@') == NULL ||
strchr(zc->zc_value, '%'))
return (SET_ERROR(EINVAL));
/* Split "fs@snap" into tofs ("fs") and tosnap ("snap"). */
(void) strcpy(tofs, zc->zc_value);
tosnap = strchr(tofs, '@');
*tosnap++ = '\0';
/* Fetch the sent properties, if the caller supplied any. */
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props)) != 0)
return (error);
fd = zc->zc_cookie;
#ifdef illumos
fp = getf(fd);
#else
/*
 * NOTE(review): the return value is ignored and fp has no initializer;
 * the NULL test below presumably relies on fget_read() storing NULL on
 * failure -- confirm.
 */
fget_read(curthread, fd, &cap_pread_rights, &fp);
#endif
if (fp == NULL) {
nvlist_free(props);
return (SET_ERROR(EBADF));
}
/* Collects a per-property error for every property we fail to apply. */
errors = fnvlist_alloc();
if (zc->zc_string[0])
origin = zc->zc_string;
error = dmu_recv_begin(tofs, tosnap,
&zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
if (error != 0)
goto out;
/*
 * Set properties before we receive the stream so that they are applied
 * to the new data. Note that we must call dmu_recv_stream() if
 * dmu_recv_begin() succeeds.
 */
if (props != NULL && !drc.drc_newfs) {
if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
SPA_VERSION_RECVD_PROPS &&
!dsl_prop_get_hasrecvd(tofs))
first_recvd_props = B_TRUE;
/*
 * If new received properties are supplied, they are to
 * completely replace the existing received properties, so stash
 * away the existing ones.
 */
if (dsl_prop_get_received(tofs, &origprops) == 0) {
nvlist_t *errlist = NULL;
/*
 * Don't bother writing a property if its value won't
 * change (and avoid the unnecessary security checks).
 *
 * The first receive after SPA_VERSION_RECVD_PROPS is a
 * special case where we blow away all local properties
 * regardless.
 */
if (!first_recvd_props)
props_reduce(props, origprops);
/* Properties we may not clear are dropped from origprops. */
if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
(void) nvlist_merge(errors, errlist, 0);
nvlist_free(errlist);
if (clear_received_props(tofs, origprops,
first_recvd_props ? NULL : props) != 0)
zc->zc_obj |= ZPROP_ERR_NOCLEAR;
} else {
zc->zc_obj |= ZPROP_ERR_NOCLEAR;
}
}
if (props != NULL) {
props_error = dsl_prop_set_hasrecvd(tofs);
if (props_error == 0) {
/* Hold back the props that may only be set post-receive. */
delayprops = extract_delay_props(props);
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
props, errors);
}
}
/* Receive the stream, starting at the file's current offset. */
off = fp->f_offset;
error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
&zc->zc_action_handle);
if (error == 0) {
zfsvfs_t *zfsvfs = NULL;
if (getzfsvfs(tofs, &zfsvfs) == 0) {
/* online recv */
dsl_dataset_t *ds;
int end_err;
ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
/*
 * If the suspend fails, then the recv_end will
 * likely also fail, and clean up after itself.
 */
end_err = dmu_recv_end(&drc, zfsvfs);
/* Only resume a filesystem that was actually suspended. */
if (error == 0)
error = zfs_resume_fs(zfsvfs, ds);
/* A suspend/resume error takes precedence over end_err. */
error = error ? error : end_err;
#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
#else
vfs_unbusy(zfsvfs->z_vfs);
#endif
} else {
error = dmu_recv_end(&drc, NULL);
}
/* Set delayed properties now, after we're done receiving. */
if (delayprops != NULL && error == 0) {
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
delayprops, errors);
}
}
if (delayprops != NULL) {
/*
 * Merge delayed props back in with initial props, in case
 * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
 * we have to make sure clear_received_props() includes
 * the delayed properties).
 *
 * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
 * using ASSERT() will be just like a VERIFY.
 */
ASSERT(nvlist_merge(props, delayprops, 0) == 0);
nvlist_free(delayprops);
}
/*
 * Now that all props, initial and delayed, are set, report the prop
 * errors to the caller.
 */
if (zc->zc_nvlist_dst_size != 0 &&
(nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
put_nvlist(zc, errors) != 0)) {
/*
 * Caller made zc->zc_nvlist_dst less than the minimum expected
 * size or supplied an invalid address.
 */
props_error = SET_ERROR(EINVAL);
}
/* Report the bytes consumed and advance the file offset to match. */
zc->zc_cookie = off - fp->f_offset;
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
#ifdef DEBUG
/* Fault injection: pretend the receive failed, one time only. */
if (zfs_ioc_recv_inject_err) {
zfs_ioc_recv_inject_err = B_FALSE;
error = 1;
}
#endif
#ifdef __FreeBSD__
if (error == 0)
zvol_create_minors(tofs);
#endif
/*
 * On error, restore the original props.
 */
if (error != 0 && props != NULL && !drc.drc_newfs) {
if (clear_received_props(tofs, props, NULL) != 0) {
/*
 * We failed to clear the received properties.
 * Since we may have left a $recvd value on the
 * system, we can't clear the $hasrecvd flag.
 */
zc->zc_obj |= ZPROP_ERR_NORESTORE;
} else if (first_recvd_props) {
dsl_prop_unset_hasrecvd(tofs);
}
if (origprops == NULL && !drc.drc_newfs) {
/* We failed to stash the original properties. */
zc->zc_obj |= ZPROP_ERR_NORESTORE;
}
/*
 * dsl_props_set() will not convert RECEIVED to LOCAL on or
 * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
 * explicitly if we're restoring local properties cleared in the
 * first new-style receive.
 */
if (origprops != NULL &&
zfs_set_prop_nvlist(tofs, (first_recvd_props ?
ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
origprops, NULL) != 0) {
/*
 * We stashed the original properties but failed to
 * restore them.
 */
zc->zc_obj |= ZPROP_ERR_NORESTORE;
}
}
out:
/* Common exit: free the nvlists and drop the file reference. */
nvlist_free(props);
nvlist_free(origprops);
nvlist_free(errors);
releasef(fd);
/* A receive error outranks a property-reporting error. */
if (error == 0)
error = props_error;
return (error);
}
/*
 * Generate a send stream for a snapshot, or only estimate its size.
 *
 * inputs:
 * zc_name	name of snapshot to send
 * zc_cookie	file descriptor to send stream to
 * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
 * zc_sendobj	objsetid of snapshot to send
 * zc_fromobj	objsetid of incremental fromsnap (may be zero)
 * zc_guid	if set, estimate size of stream only.  zc_cookie is ignored.
 *		output size in zc_objset_type.
 * zc_flags	lzc_send_flags
 *
 * outputs:
 * zc_objset_type	estimated size, if zc_guid is set
 *
 * NOTE: This is no longer the preferred interface, any new functionality
 *	  should be added to zfs_ioc_send_new() instead.
 */
static int
zfs_ioc_send(zfs_cmd_t *zc)
{
	boolean_t estimate = (zc->zc_guid != 0);
	boolean_t embedok = (zc->zc_flags & 0x1);
	boolean_t large_block_ok = (zc->zc_flags & 0x2);
	boolean_t compressok = (zc->zc_flags & 0x4);
	dsl_pool_t *dp;
	dsl_dataset_t *tosnap;
	dsl_dataset_t *fromsnap = NULL;
	file_t *fp;
	offset_t off;
	int error;

	if (zc->zc_obj != 0) {
		/*
		 * fromorigin: if the snapshot lives in a clone, send
		 * incrementally from the clone's origin.
		 */
		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		if (dsl_dir_is_clone(tosnap->ds_dir)) {
			zc->zc_fromobj =
			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
		}
		dsl_dataset_rele(tosnap, FTAG);
		dsl_pool_rele(dp, FTAG);
	}

	if (!estimate) {
		/* Real send: write the stream to the caller's descriptor. */
#ifdef illumos
		fp = getf(zc->zc_cookie);
#else
		fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
#endif
		if (fp == NULL)
			return (SET_ERROR(EBADF));

		off = fp->f_offset;
#ifdef illumos
		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
		    zc->zc_fromobj, embedok, large_block_ok, compressok,
		    zc->zc_cookie, fp->f_vnode, &off);
#else
		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
		    zc->zc_fromobj, embedok, large_block_ok, compressok,
		    zc->zc_cookie, fp, &off);
#endif

		/* Leave the file offset just past what was written. */
		if (off >= 0 && off <= MAXOFFSET_T)
			fp->f_offset = off;
		releasef(zc->zc_cookie);
		return (error);
	}

	/* Estimate only: hold the snapshot(s) and ask for the size. */
	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	if (zc->zc_fromobj != 0) {
		error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
		    FTAG, &fromsnap);
	}

	if (error == 0) {
		error = dmu_send_estimate(tosnap, fromsnap, compressok,
		    &zc->zc_objset_type);
		if (fromsnap != NULL)
			dsl_dataset_rele(fromsnap, FTAG);
	}
	dsl_dataset_rele(tosnap, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * Report how far an in-progress send has got.
 *
 * inputs:
 * zc_name	name of snapshot on which to report progress
 * zc_cookie	file descriptor of send stream
 *
 * outputs:
 * zc_cookie	number of bytes written in send stream thus far
 *
 * Returns ENOENT if no send stream on that snapshot matches both the file
 * descriptor and the calling process.
 */
static int
zfs_ioc_send_progress(zfs_cmd_t *zc)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dmu_sendarg_t *stream;
	int error;

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	mutex_enter(&ds->ds_sendstream_lock);

	/*
	 * Walk the send streams currently active on this dataset until we
	 * find the one that uses the given file descriptor _and_ was
	 * started by the current process.
	 */
	stream = list_head(&ds->ds_sendstreams);
	while (stream != NULL &&
	    !(stream->dsa_outfd == zc->zc_cookie &&
	    stream->dsa_proc == curproc)) {
		stream = list_next(&ds->ds_sendstreams, stream);
	}

	if (stream == NULL)
		error = SET_ERROR(ENOENT);
	else
		zc->zc_cookie = *(stream->dsa_off);

	mutex_exit(&ds->ds_sendstream_lock);
	dsl_dataset_rele(ds, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * Register the fault described by zc_inject_record via zio_inject_fault().
 * zc_guid carries the injection flags on input; on success it is
 * overwritten with the id assigned to the new injection handler.
 */
static int
zfs_ioc_inject_fault(zfs_cmd_t *zc)
{
	int id;
	int error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
	    &zc->zc_inject_record);

	if (error != 0)
		return (error);

	zc->zc_guid = (uint64_t)id;
	return (0);
}
/*
 * Remove the injected fault whose id is passed in zc_guid.
 */
static int
zfs_ioc_clear_fault(zfs_cmd_t *zc)
{
	int id = (int)zc->zc_guid;

	return (zio_clear_fault(id));
}
/*
 * Step through the list of injected faults.  zc_guid is the iteration
 * cursor: it is passed to zio_inject_list_next() and written back with
 * whatever that call leaves in it, whether or not the call succeeded.
 * zc_name and zc_inject_record are filled in by zio_inject_list_next().
 */
static int
zfs_ioc_inject_list_next(zfs_cmd_t *zc)
{
	int cursor = (int)zc->zc_guid;
	int error;

	error = zio_inject_list_next(&cursor, zc->zc_name,
	    sizeof (zc->zc_name), &zc->zc_inject_record);
	zc->zc_guid = cursor;

	return (error);
}
/*
 * Copy the pool's error log out to the buffer at zc_nvlist_dst.
 *
 * zc_nvlist_dst_size gives the buffer size on input.  On success it is
 * replaced with the count handed back by spa_get_errlog(); on failure it is
 * set to spa_get_errlog_size().
 */
static int
zfs_ioc_error_log(zfs_cmd_t *zc)
{
	spa_t *spa;
	size_t count = (size_t)zc->zc_nvlist_dst_size;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
	    &count);
	if (error != 0)
		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
	else
		zc->zc_nvlist_dst_size = count;

	spa_close(spa, FTAG);
	return (error);
}
/*
 * "zpool clear": clear the error state of a pool or of one of its vdevs
 * and resume any suspended I/O.
 *
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the vdev to clear, or 0 for the whole pool
 * zc_cookie		ZPOOL_NO_REWIND to open the pool without a rewind
 * zc_nvlist_src{_size}	rewind policy (required unless ZPOOL_NO_REWIND)
 *
 * outputs:
 * zc_nvlist_dst{_size}	config handed back by spa_open_rewind(), if any
 *
 * Returns 0 on success or an errno.  The pool reference taken here is
 * dropped on every path that took one.
 */
static int
zfs_ioc_clear(zfs_cmd_t *zc)
{
	spa_t *spa;
	vdev_t *vd;
	int error;

	/*
	 * On zpool clear we also fix up missing slogs
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup(zc->zc_name);
	if (spa == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EIO));
	}
	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
		/* we need to let spa_open/spa_load clear the chains */
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	}
	spa->spa_last_open_failed = 0;
	mutex_exit(&spa_namespace_lock);

	if (zc->zc_cookie & ZPOOL_NO_REWIND) {
		error = spa_open(zc->zc_name, &spa, FTAG);
	} else {
		nvlist_t *policy;
		nvlist_t *config = NULL;

		if (zc->zc_nvlist_src == 0)
			return (SET_ERROR(EINVAL));

		if ((error = get_nvlist(zc->zc_nvlist_src,
		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
			int open_err;

			open_err = spa_open_rewind(zc->zc_name, &spa, FTAG,
			    policy, &config);
			error = open_err;

			if (config != NULL) {
				int err;

				if ((err = put_nvlist(zc, config)) != 0)
					error = err;
				nvlist_free(config);
			}
			nvlist_free(policy);

			/*
			 * The pool was opened but the config could not be
			 * handed back: we are about to fail, so drop the
			 * reference instead of leaking it.
			 */
			if (open_err == 0 && error != 0)
				spa_close(spa, FTAG);
		}
	}

	if (error != 0)
		return (error);

	/*
	 * If multihost is enabled, resuming I/O is unsafe as another
	 * host may have imported the pool.
	 */
	if (spa_multihost(spa) && spa_suspended(spa)) {
		/* Release the reference taken by the open above. */
		spa_close(spa, FTAG);
		return (SET_ERROR(EINVAL));
	}

	spa_vdev_state_enter(spa, SCL_NONE);

	if (zc->zc_guid == 0) {
		/* No vdev named: clear the whole pool. */
		vd = NULL;
	} else {
		vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
		if (vd == NULL) {
			(void) spa_vdev_state_exit(spa, NULL, ENODEV);
			spa_close(spa, FTAG);
			return (SET_ERROR(ENODEV));
		}
	}

	vdev_clear(spa, vd);

	(void) spa_vdev_state_exit(spa, NULL, 0);

	/*
	 * Resume any suspended I/Os.
	 */
	if (zio_resume(spa) != 0)
		error = SET_ERROR(EIO);

	spa_close(spa, FTAG);

	return (error);
}
/*
 * Reopen all the vdevs of the pool named by zc_name.
 * Returns the spa_open() error if the pool cannot be opened, 0 otherwise.
 */
static int
zfs_ioc_pool_reopen(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error = spa_open(zc->zc_name, &spa, FTAG);

	if (error != 0)
		return (error);

	spa_vdev_state_enter(spa, SCL_NONE);

	/*
	 * While a resilver is already running, spa_scrub_reopen is held at
	 * B_TRUE for the duration of the reopen so that the scan is not
	 * restarted as a side effect.  Otherwise vdev_open() gets to decide
	 * whether a resilver is required.
	 */
	spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;

	(void) spa_vdev_state_exit(spa, NULL, 0);
	spa_close(spa, FTAG);

	return (0);
}
/*
 * Promote a clone so that it no longer depends on its origin.
 *
 * inputs:
 * zc_name	name of filesystem
 *
 * outputs:
 * zc_string	name of conflicting snapshot, if there is one
 *
 * Returns EINVAL if the name is invalid or the dataset is not a clone.
 */
static int
zfs_ioc_promote(zfs_cmd_t *zc)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds, *ods;
	char origin[ZFS_MAX_DATASET_NAME_LEN];
	char *at;
	int error;

	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
	if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
	    strchr(zc->zc_name, '%') != NULL)
		return (SET_ERROR(EINVAL));

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/* Look up the origin snapshot's name; only clones have one. */
	if (!dsl_dir_is_clone(ds->ds_dir)) {
		error = SET_ERROR(EINVAL);
	} else {
		error = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
	}
	if (error == 0) {
		dsl_dataset_name(ods, origin);
		dsl_dataset_rele(ods, FTAG);
	}
	dsl_dataset_rele(ds, FTAG);
	dsl_pool_rele(dp, FTAG);

	if (error != 0)
		return (error);

	/*
	 * We don't need to unmount *all* the origin fs's snapshots, but
	 * it's easier.
	 */
	at = strchr(origin, '@');
	if (at != NULL)
		*at = '\0';
	(void) dmu_objset_find(origin,
	    zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);

	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
}
/*
 * Retrieve a single {user|group}{used|quota}@... property.
 *
 * inputs:
 * zc_name		name of filesystem
 * zc_objset_type	zfs_userquota_prop_t
 * zc_value		domain name (eg. "S-1-234-567-89")
 * zc_guid		RID/UID/GID
 *
 * outputs:
 * zc_cookie		property value
 *
 * Returns EINVAL if zc_objset_type is not a valid zfs_userquota_prop_t.
 */
static int
zfs_ioc_userspace_one(zfs_cmd_t *zc)
{
	zfsvfs_t *zfsvfs;
	int error;

	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
		return (SET_ERROR(EINVAL));

	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
	if (error == 0) {
		error = zfs_userspace_one(zfsvfs, zc->zc_objset_type,
		    zc->zc_value, zc->zc_guid, &zc->zc_cookie);
		zfsvfs_rele(zfsvfs, FTAG);
	}

	return (error);
}
/*
 * Retrieve a batch of {user|group}{used|quota} entries.
 *
 * inputs:
 * zc_name		name of filesystem
 * zc_cookie		zap cursor
 * zc_objset_type	zfs_userquota_prop_t
 * zc_nvlist_dst[_size]	buffer to fill (not really an nvlist)
 *
 * outputs:
 * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
 * zc_cookie		zap cursor
 *
 * Returns ENOMEM if the requested buffer size is zero or does not fit in a
 * positive int.
 */
static int
zfs_ioc_userspace_many(zfs_cmd_t *zc)
{
	zfsvfs_t *zfsvfs;
	int bufsize = (int)zc->zc_nvlist_dst_size;
	void *buf;
	int error;

	/*
	 * The buffer is allocated from the narrowed 'bufsize', but
	 * zfs_userspace_many() below is handed a pointer to the original,
	 * un-narrowed zc_nvlist_dst_size.  Refuse any size that does not
	 * survive the narrowing unchanged; otherwise a value with high bits
	 * set could pair a small allocation with a much larger advertised
	 * buffer size.
	 */
	if (bufsize <= 0 || (uint64_t)bufsize != zc->zc_nvlist_dst_size)
		return (SET_ERROR(ENOMEM));

	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
	if (error != 0)
		return (error);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/* On return zc_nvlist_dst_size is the amount to copy out. */
	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
	    buf, &zc->zc_nvlist_dst_size);

	if (error == 0) {
		error = ddi_copyout(buf,
		    (void *)(uintptr_t)zc->zc_nvlist_dst,
		    zc->zc_nvlist_dst_size, zc->zc_iflags);
	}
	kmem_free(buf, bufsize);
	zfsvfs_rele(zfsvfs, FTAG);

	return (error);
}
/*
 * Upgrade a filesystem's user/group space accounting.
 *
 * inputs:
 * zc_name		name of filesystem
 *
 * outputs:
 * none
 */
static int
zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
{
	objset_t *os;
	zfsvfs_t *zfsvfs;
	int error = 0;

	if (getzfsvfs(zc->zc_name, &zfsvfs) != 0) {
		/* XXX kind of reading contents without owning */
		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (error != 0)
			return (error);

		error = dmu_objset_userspace_upgrade(os);
		dmu_objset_rele(os, FTAG);
		return (error);
	}

	if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
		/*
		 * If userused is not enabled, it may be because the
		 * objset needs to be closed & reopened (to grow the
		 * objset_phys_t).  Suspend/resume the fs will do that.
		 */
		dsl_dataset_t *ds, *newds;

		ds = dmu_objset_ds(zfsvfs->z_os);
		error = zfs_suspend_fs(zfsvfs);
		if (error == 0) {
			dmu_objset_refresh_ownership(ds, &newds, zfsvfs);
			error = zfs_resume_fs(zfsvfs, newds);
		}
	}

	/* Only upgrade if the suspend/resume cycle (if any) succeeded. */
	if (error == 0)
		error = dmu_objset_userspace_upgrade(zfsvfs->z_os);

#ifdef illumos
	VFS_RELE(zfsvfs->z_vfs);
#else
	vfs_unbusy(zfsvfs->z_vfs);
#endif
	return (error);
}
#ifdef illumos
/*
 * We don't want a hard dependency on special symbols in the sharefs, nfs
 * and smbsrv modules.  They are looked up on demand, the first time a file
 * system is shared.  None of sharefs, nfs or smbsrv is an unloadable
 * module.
 */
/* Resolved from "nfs_export" in nfs_mod. */
int (*znfsexport_fs)(void *arg);
/* Resolved from "sharefs_impl" in sharefs_mod; updates the sharetab. */
int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
/* Resolved from "smb_server_share" in smbsrv_mod. */
int (*zsmbexport_fs)(void *arg, boolean_t add_share);
/* Set to 1 once the NFS / SMB symbols above have been resolved. */
int zfs_nfsshare_inited;
int zfs_smbshare_inited;
/* Handles from ddi_modopen() for "fs/nfs", "fs/sharefs", "drv/smbsrv". */
ddi_modhandle_t nfs_mod;
ddi_modhandle_t sharefs_mod;
ddi_modhandle_t smbsrv_mod;
#endif /* illumos */
/* Held while the share module handles and symbols above are looked up. */
kmutex_t zfs_share_lock;
#ifdef illumos
/*
 * Load the sharefs module and resolve its "sharefs_impl" entry point, if
 * that has not been done already.  Both NFS and SMB shares require sharetab
 * support.  The caller must hold zfs_share_lock.
 *
 * Returns 0 on success, ENOSYS if the module or the symbol is unavailable.
 */
static int
zfs_init_sharefs()
{
	int error;

	ASSERT(MUTEX_HELD(&zfs_share_lock));

	if (sharefs_mod == NULL) {
		sharefs_mod = ddi_modopen("fs/sharefs",
		    KRTLD_MODE_FIRST, &error);
		if (sharefs_mod == NULL)
			return (SET_ERROR(ENOSYS));
	}

	if (zshare_fs == NULL) {
		zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
		    ddi_modsym(sharefs_mod, "sharefs_impl", &error);
		if (zshare_fs == NULL)
			return (SET_ERROR(ENOSYS));
	}

	return (0);
}
#endif /* illumos */
/*
 * Share or unshare a file system over NFS or SMB.
 *
 * On illumos the required kernel modules are opened and their entry points
 * resolved the first time each share type is seen (under zfs_share_lock);
 * the export is then handed to the protocol module and finally added to or
 * removed from the sharetab.  Any failure to load a module or resolve a
 * symbol is reported as ENOSYS; an unknown share type yields EINVAL.
 *
 * On every other platform this ioctl is not implemented and returns ENOSYS.
 */
static int
zfs_ioc_share(zfs_cmd_t *zc)
{
#ifdef illumos
	int error;
	int opcode;

	/* Step 1: make sure the hooks for this share type are resolved. */
	switch (zc->zc_share.z_sharetype) {
	case ZFS_SHARE_NFS:
	case ZFS_UNSHARE_NFS:
		if (zfs_nfsshare_inited != 0)
			break;

		mutex_enter(&zfs_share_lock);
		if (nfs_mod == NULL) {
			nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST,
			    &error);
			if (nfs_mod == NULL) {
				mutex_exit(&zfs_share_lock);
				return (SET_ERROR(ENOSYS));
			}
		}
		if (znfsexport_fs == NULL) {
			znfsexport_fs = (int (*)(void *))
			    ddi_modsym(nfs_mod, "nfs_export", &error);
			if (znfsexport_fs == NULL) {
				mutex_exit(&zfs_share_lock);
				return (SET_ERROR(ENOSYS));
			}
		}
		error = zfs_init_sharefs();
		if (error != 0) {
			mutex_exit(&zfs_share_lock);
			return (SET_ERROR(ENOSYS));
		}
		zfs_nfsshare_inited = 1;
		mutex_exit(&zfs_share_lock);
		break;

	case ZFS_SHARE_SMB:
	case ZFS_UNSHARE_SMB:
		if (zfs_smbshare_inited != 0)
			break;

		mutex_enter(&zfs_share_lock);
		if (smbsrv_mod == NULL) {
			smbsrv_mod = ddi_modopen("drv/smbsrv",
			    KRTLD_MODE_FIRST, &error);
			if (smbsrv_mod == NULL) {
				mutex_exit(&zfs_share_lock);
				return (SET_ERROR(ENOSYS));
			}
		}
		if (zsmbexport_fs == NULL) {
			zsmbexport_fs = (int (*)(void *, boolean_t))
			    ddi_modsym(smbsrv_mod, "smb_server_share", &error);
			if (zsmbexport_fs == NULL) {
				mutex_exit(&zfs_share_lock);
				return (SET_ERROR(ENOSYS));
			}
		}
		error = zfs_init_sharefs();
		if (error != 0) {
			mutex_exit(&zfs_share_lock);
			return (SET_ERROR(ENOSYS));
		}
		zfs_smbshare_inited = 1;
		mutex_exit(&zfs_share_lock);
		break;

	default:
		return (SET_ERROR(EINVAL));
	}

	/* Step 2: pass the export data to the protocol module. */
	switch (zc->zc_share.z_sharetype) {
	case ZFS_SHARE_NFS:
	case ZFS_UNSHARE_NFS:
		error = znfsexport_fs(
		    (void *)(uintptr_t)zc->zc_share.z_exportdata);
		if (error != 0)
			return (error);
		break;

	case ZFS_SHARE_SMB:
	case ZFS_UNSHARE_SMB:
		error = zsmbexport_fs(
		    (void *)(uintptr_t)zc->zc_share.z_exportdata,
		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
		    B_TRUE : B_FALSE);
		if (error != 0)
			return (error);
		break;
	}

	/* Step 3: add the share to, or remove it from, the sharetab. */
	if (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
	    zc->zc_share.z_sharetype == ZFS_SHARE_SMB)
		opcode = SHAREFS_ADD;
	else
		opcode = SHAREFS_REMOVE;

	return (zshare_fs(opcode,
	    (void *)(uintptr_t)zc->zc_share.z_sharedata,
	    zc->zc_share.z_sharemax));
#else	/* !illumos */
	return (ENOSYS);
#endif	/* illumos */
}
/*
 * Single-entry ACL handed to VOP_CREATE() by zfs_ioc_smb_acl() when it
 * creates a share resource file.
 * NOTE(review): presumably this grants all permissions to everyone
 * (ACE_ALL_PERMS / ACE_EVERYONE) -- confirm against the ace_t field order.
 */
ace_t full_access[] = {
{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
};
/*
 * inputs:
 * zc_name		name of containing filesystem
 * zc_obj		object # beyond which we want next in-use object #
 *
 * outputs:
 * zc_obj		next in-use object #
 *
 * The objset is only held (not owned) for the duration of the lookup.
 * Returns 0 or the errno from dmu_objset_hold() / dmu_object_next().
 */
static int
zfs_ioc_next_obj(zfs_cmd_t *zc)
{
	objset_t *os = NULL;
	uint64_t prev_snap_txg;
	int rc;

	rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (rc != 0)
		return (rc);

	prev_snap_txg = dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg;
	rc = dmu_object_next(os, &zc->zc_obj, B_FALSE, prev_snap_txg);

	dmu_objset_rele(os, FTAG);
	return (rc);
}
/*
 * inputs:
 * zc_name		name of filesystem
 * zc_value		prefix name for snapshot
 * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
 *
 * outputs:
 * zc_value		short name of new snapshot
 *
 * The snapshot name is "<prefix>-<lbolt as 16 hex digits>" and the hold tag
 * is "%<prefix>".  Both strings are allocated here and freed before
 * returning.  Returns 0 on success or an errno value.
 */
static int
zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
{
	char *snap_name;
	char *hold_name;
	int error;
	minor_t minor;

	error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
	if (error != 0)
		return (error);

	snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
	    (u_longlong_t)ddi_get_lbolt64());
	hold_name = kmem_asprintf("%%%s", zc->zc_value);

	error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
	    hold_name);
	if (error == 0) {
		/*
		 * snap_name is always longer than the prefix it was built
		 * from, so bound the copy by the size of the destination
		 * rather than trusting the generated name to fit.
		 */
		(void) strlcpy(zc->zc_value, snap_name,
		    sizeof (zc->zc_value));
	}

	strfree(snap_name);
	strfree(hold_name);
	zfs_onexit_fd_rele(zc->zc_cleanup_fd);
	return (error);
}
/*
 * inputs:
 * zc_name		name of "to" snapshot
 * zc_value		name of "from" snapshot
 * zc_cookie		file descriptor to write diff data on
 *
 * outputs:
 * dmu_diff_record_t's to the file descriptor
 *
 * The descriptor's file offset is advanced to wherever dmu_diff() stopped,
 * provided the resulting offset lies within [0, MAXOFFSET_T].
 * Returns EBADF when the descriptor cannot be resolved.
 */
static int
zfs_ioc_diff(zfs_cmd_t *zc)
{
	file_t *fp;
	offset_t pos;
	int rc;

#ifdef illumos
	fp = getf(zc->zc_cookie);
#else
	fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
#endif
	if (fp == NULL)
		return (SET_ERROR(EBADF));

	pos = fp->f_offset;

#ifdef illumos
	rc = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &pos);
#else
	rc = dmu_diff(zc->zc_name, zc->zc_value, fp, &pos);
#endif

	/* Only store an offset that is representable. */
	if (!(pos < 0 || pos > MAXOFFSET_T))
		fp->f_offset = pos;

	releasef(zc->zc_cookie);
	return (rc);
}
#ifdef illumos
/*
 * Remove all ACL files in shares dir
 *
 * Walks the directory's ZAP and removes each entry by name, stopping at the
 * first removal that fails.
 * NOTE(review): the value returned is whatever ended the loop, i.e. either
 * the VOP_REMOVE() failure or the non-zero result of zap_cursor_retrieve()
 * once no entry is left -- confirm callers expect that.
 */
static int
zfs_smb_acl_purge(znode_t *dzp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zap_cursor_t cursor;
	zap_attribute_t entry;
	int error;

	zap_cursor_init(&cursor, zfsvfs->z_os, dzp->z_id);
	while ((error = zap_cursor_retrieve(&cursor, &entry)) == 0) {
		error = VOP_REMOVE(ZTOV(dzp), entry.za_name, kcred, NULL, 0);
		if (error != 0)
			break;
		zap_cursor_advance(&cursor);
	}
	zap_cursor_fini(&cursor);

	return (error);
}
#endif /* illumos */
/*
 * Manage the SMB share ACL resource files kept in a dataset's shares
 * directory.
 *
 * inputs:
 * zc_name		name of the dataset
 * zc_value		path of the dataset's mountpoint
 * zc_cookie		operation: ZFS_SMB_ACL_ADD, _REMOVE, _RENAME or _PURGE
 * zc_string		resource name (ADD and REMOVE)
 * zc_nvlist_src	{ ZFS_SMB_ACL_SRC, ZFS_SMB_ACL_TARGET } (RENAME only)
 *
 * The shares directory is created on demand.  Returns 0 on success, EINVAL
 * when the path is not a ZFS mount of zc_name, when the operation is unknown
 * or when a RENAME request lacks its source or target name, otherwise the
 * errno of the failing step.
 *
 * Only implemented on illumos; other platforms return EOPNOTSUPP.
 */
static int
zfs_ioc_smb_acl(zfs_cmd_t *zc)
{
#ifdef illumos
	vnode_t *vp;
	znode_t *dzp;
	vnode_t *resourcevp = NULL;
	znode_t *sharedir;
	zfsvfs_t *zfsvfs;
	nvlist_t *nvlist;
	char *src, *target;
	vattr_t vattr;
	vsecattr_t vsec;
	int error = 0;

	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
	    NO_FOLLOW, NULL, &vp)) != 0)
		return (error);

	/* Now make sure mntpnt and dataset are ZFS */
	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
	    zc->zc_name) != 0)) {
		VN_RELE(vp);
		return (SET_ERROR(EINVAL));
	}

	dzp = VTOZ(vp);
	zfsvfs = dzp->z_zfsvfs;
	ZFS_ENTER(zfsvfs);

	/*
	 * Create the share dir if it is missing.
	 */
	mutex_enter(&zfsvfs->z_lock);
	if (zfsvfs->z_shares_dir == 0) {
		dmu_tx_t *tx;

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
		    ZFS_SHARES_DIR);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			error = zfs_create_share_dir(zfsvfs, tx);
			dmu_tx_commit(tx);
		}
		if (error != 0) {
			mutex_exit(&zfsvfs->z_lock);
			VN_RELE(vp);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	mutex_exit(&zfsvfs->z_lock);

	ASSERT(zfsvfs->z_shares_dir);
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
		VN_RELE(vp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	switch (zc->zc_cookie) {
	case ZFS_SMB_ACL_ADD:
		vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
		vattr.va_type = VREG;
		vattr.va_mode = S_IFREG|0777;
		vattr.va_uid = 0;
		vattr.va_gid = 0;

		vsec.vsa_mask = VSA_ACE;
		vsec.vsa_aclentp = &full_access;
		vsec.vsa_aclentsz = sizeof (full_access);
		vsec.vsa_aclcnt = 1;

		error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
		    &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
		if (resourcevp)
			VN_RELE(resourcevp);
		break;

	case ZFS_SMB_ACL_REMOVE:
		error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
		    NULL, 0);
		break;

	case ZFS_SMB_ACL_RENAME:
		error = get_nvlist(zc->zc_nvlist_src,
		    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist);
		if (error != 0)
			break;

		if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) != 0 ||
		    nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
		    &target) != 0) {
			/*
			 * A request without both names is malformed.  Report
			 * it instead of returning the 0 left in 'error' by
			 * the successful get_nvlist() above.
			 */
			nvlist_free(nvlist);
			error = SET_ERROR(EINVAL);
			break;
		}
		error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
		    kcred, NULL, 0);
		nvlist_free(nvlist);
		break;

	case ZFS_SMB_ACL_PURGE:
		error = zfs_smb_acl_purge(sharedir);
		break;

	default:
		error = SET_ERROR(EINVAL);
		break;
	}

	/* Common exit: drop both vnode references and leave the fs. */
	VN_RELE(vp);
	VN_RELE(ZTOV(sharedir));

	ZFS_EXIT(zfsvfs);

	return (error);
#else	/* !illumos */
	return (EOPNOTSUPP);
#endif	/* illumos */
}
/*
 * innvl: {
 *     "holds" -> { snapname -> holdname (string), ... }
 *     (optional) "cleanup_fd" -> fd (int32)
 * }
 *
 * outnvl: {
 *     snapname -> error value (int32)
 *     ...
 * }
 *
 * Every hold tag must be a non-empty string.  When a cleanup fd is given,
 * the holds are associated with the minor number obtained from it.
 */
/* ARGSUSED */
static int
zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
{
	nvlist_t *holds;
	nvpair_t *elem;
	minor_t minor = 0;
	int cleanup_fd = -1;
	int error;

	if (nvlist_lookup_nvlist(args, "holds", &holds) != 0)
		return (SET_ERROR(EINVAL));

	/* make sure the user didn't pass us any invalid (empty) tags */
	elem = nvlist_next_nvpair(holds, NULL);
	while (elem != NULL) {
		char *htag;

		error = nvpair_value_string(elem, &htag);
		if (error != 0)
			return (SET_ERROR(error));
		if (htag[0] == '\0')
			return (SET_ERROR(EINVAL));

		elem = nvlist_next_nvpair(holds, elem);
	}

	if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (error != 0)
			return (error);
	}

	error = dsl_dataset_user_hold(holds, minor, errlist);

	/* A non-zero minor means the cleanup fd was held above. */
	if (minor != 0)
		zfs_onexit_fd_rele(cleanup_fd);

	return (error);
}
/*
 * innvl is not used.
 *
 * outnvl: {
 *    holdname -> time added (uint64 seconds since epoch)
 *    ...
 * }
 *
 * Thin wrapper around dsl_dataset_get_holds() for the named snapshot.
 */
/* ARGSUSED */
static int
zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
{
	int error;

	ASSERT3P(args, ==, NULL);

	error = dsl_dataset_get_holds(snapname, outnvl);
	return (error);
}
/*
 * innvl: {
 *     snapname -> { holdname, ... }
 *     ...
 * }
 *
 * outnvl: {
 *     snapname -> error value (int32)
 *     ...
 * }
 *
 * Thin wrapper around dsl_dataset_user_release(); the pool name is unused.
 */
/* ARGSUSED */
static int
zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
{
	int error;

	error = dsl_dataset_user_release(holds, errlist);
	return (error);
}
/*
 * inputs:
 * zc_name		name of new filesystem or snapshot
 * zc_value		full name of old snapshot
 *
 * outputs:
 * zc_cookie		space in bytes
 * zc_objset_type	compressed space in bytes
 * zc_perm_action	uncompressed space in bytes
 *
 * Holds the pool and both datasets for the duration of the computation and
 * releases them in reverse order on every path.
 */
static int
zfs_ioc_space_written(zfs_cmd_t *zc)
{
	dsl_pool_t *dp;
	dsl_dataset_t *newds, *oldds;
	int error;

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &newds);
	if (error != 0)
		goto out_pool;

	error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &oldds);
	if (error != 0)
		goto out_new;

	error = dsl_dataset_space_written(oldds, newds, &zc->zc_cookie,
	    &zc->zc_objset_type, &zc->zc_perm_action);

	dsl_dataset_rele(oldds, FTAG);
out_new:
	dsl_dataset_rele(newds, FTAG);
out_pool:
	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * innvl: {
 *     "firstsnap" -> snapshot name
 * }
 *
 * outnvl: {
 *     "used" -> space in bytes
 *     "compressed" -> compressed space in bytes
 *     "uncompressed" -> uncompressed space in bytes
 * }
 *
 * Both 'lastsnap' and "firstsnap" must name snapshots; otherwise EINVAL is
 * returned.  The three output values are always added to outnvl once both
 * snapshots have been held, and are zero unless
 * dsl_dataset_space_wouldfree() filled them in.
 */
static int
zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
{
	int error;
	dsl_pool_t *dp;
	dsl_dataset_t *new, *old;
	char *firstsnap;
	/*
	 * Start from zero: the values are copied into outnvl even when the
	 * computation below fails before storing anything, and must never
	 * expose uninitialized stack contents.
	 */
	uint64_t used = 0, comp = 0, uncomp = 0;

	if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
		return (SET_ERROR(EINVAL));

	error = dsl_pool_hold(lastsnap, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
	if (error == 0 && !new->ds_is_snapshot) {
		dsl_dataset_rele(new, FTAG);
		error = SET_ERROR(EINVAL);
	}
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
	if (error == 0 && !old->ds_is_snapshot) {
		dsl_dataset_rele(old, FTAG);
		error = SET_ERROR(EINVAL);
	}
	if (error != 0) {
		dsl_dataset_rele(new, FTAG);
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
	dsl_dataset_rele(old, FTAG);
	dsl_dataset_rele(new, FTAG);
	dsl_pool_rele(dp, FTAG);

	fnvlist_add_uint64(outnvl, "used", used);
	fnvlist_add_uint64(outnvl, "compressed", comp);
	fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
	return (error);
}
/*
 * inputs:
 * zc_name		name of the dataset
 * zc_jailid		jail identifier
 *
 * Passes the dataset and jail id, together with the calling thread's
 * credentials, to zone_dataset_attach() and returns its result.
 */
static int
zfs_ioc_jail(zfs_cmd_t *zc)
{
	int jailid = (int)zc->zc_jailid;

	return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, jailid));
}
/*
 * inputs:
 * zc_name		name of the dataset
 * zc_jailid		jail identifier
 *
 * Passes the dataset and jail id, together with the calling thread's
 * credentials, to zone_dataset_detach() and returns its result.
 */
static int
zfs_ioc_unjail(zfs_cmd_t *zc)
{
	int jailid = (int)zc->zc_jailid;

	return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, jailid));
}
/*
 * innvl: {
 *     "fd" -> file descriptor to write stream to (int32)
 *     (optional) "fromsnap" -> full snap name to send an incremental from
 *     (optional) "largeblockok" -> (value ignored)
 *         indicates that blocks > 128KB are permitted
 *     (optional) "embedok" -> (value ignored)
 *         presence indicates DRR_WRITE_EMBEDDED records are permitted
 *     (optional) "compressok" -> (value ignored)
 *         presence indicates compressed DRR_WRITE records are permitted
 *     (optional) "resume_object" and "resume_offset" -> (uint64)
 *         if present, resume send stream from specified object and offset.
 * }
 *
 * outnvl is unused
 *
 * Returns EINVAL when "fd" is missing, EBADF when the descriptor cannot be
 * resolved, otherwise the result of dmu_send().  The descriptor's file
 * offset is updated to reflect the data written.
 */
/* ARGSUSED */
static int
zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
	file_t *fp;
	int error;
	offset_t off;
	char *fromname = NULL;
	int fd;
	boolean_t largeblockok;
	boolean_t embedok;
	boolean_t compressok;
	uint64_t resumeobj = 0;
	uint64_t resumeoff = 0;

	error = nvlist_lookup_int32(innvl, "fd", &fd);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	/* Optional arguments keep their defaults when absent. */
	(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);

	largeblockok = nvlist_exists(innvl, "largeblockok");
	embedok = nvlist_exists(innvl, "embedok");
	compressok = nvlist_exists(innvl, "compressok");

	(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
	(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);

	/*
	 * 'fp' is declared once above for both platforms; assign it here
	 * rather than declaring it a second time in the illumos branch.
	 */
#ifdef illumos
	fp = getf(fd);
#else
	fget_write(curthread, fd, &cap_write_rights, &fp);
#endif
	if (fp == NULL)
		return (SET_ERROR(EBADF));

	off = fp->f_offset;
#ifdef illumos
	error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
	    fd, resumeobj, resumeoff, fp->f_vnode, &off);
#else
	error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
	    fd, resumeobj, resumeoff, fp, &off);
#endif

#ifdef illumos
	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
		fp->f_offset = off;
#else
	fp->f_offset = off;
#endif

	releasef(fd);
	return (error);
}
/*
 * Determine approximately how large a zfs send stream will be -- the number
 * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
 *
 * innvl: {
 *     (optional) "from" -> full snap or bookmark name to send an incremental
 *                          from
 *     (optional) "largeblockok" -> (value ignored)
 *         indicates that blocks > 128KB are permitted
 *     (optional) "embedok" -> (value ignored)
 *         presence indicates DRR_WRITE_EMBEDDED records are permitted
 *     (optional) "compressok" -> (value ignored)
 *         presence indicates compressed DRR_WRITE records are permitted
 * }
 *
 * outnvl: {
 *     "space" -> bytes of space (uint64)
 * }
 *
 * "space" is not added when "from" is malformed or cannot be looked up.  On
 * every other path it is added, and is zero unless the estimate routine
 * stored a value.
 */
static int
zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
	dsl_pool_t *dp;
	dsl_dataset_t *tosnap;
	int error;
	char *fromname;
	boolean_t compressok;
	/*
	 * Start from zero: the estimate is reported even when the estimate
	 * routine fails, and must never expose uninitialized stack contents.
	 */
	uint64_t space = 0;

	error = dsl_pool_hold(snapname, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	compressok = nvlist_exists(innvl, "compressok");

	error = nvlist_lookup_string(innvl, "from", &fromname);
	if (error == 0) {
		if (strchr(fromname, '@') != NULL) {
			/*
			 * If from is a snapshot, hold it and use the more
			 * efficient dmu_send_estimate to estimate send space
			 * size using deadlists.
			 */
			dsl_dataset_t *fromsnap;
			error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
			if (error != 0)
				goto out;
			error = dmu_send_estimate(tosnap, fromsnap, compressok,
			    &space);
			dsl_dataset_rele(fromsnap, FTAG);
		} else if (strchr(fromname, '#') != NULL) {
			/*
			 * If from is a bookmark, fetch the creation TXG of the
			 * snapshot it was created from and use that to find
			 * blocks that were born after it.
			 */
			zfs_bookmark_phys_t frombm;

			error = dsl_bookmark_lookup(dp, fromname, tosnap,
			    &frombm);
			if (error != 0)
				goto out;
			error = dmu_send_estimate_from_txg(tosnap,
			    frombm.zbm_creation_txg, compressok, &space);
		} else {
			/*
			 * from is not properly formatted as a snapshot or
			 * bookmark
			 */
			error = SET_ERROR(EINVAL);
			goto out;
		}
	} else {
		/*
		 * If estimating the size of a full send, use dmu_send_estimate.
		 */
		error = dmu_send_estimate(tosnap, NULL, compressok, &space);
	}

	fnvlist_add_uint64(outnvl, "space", space);

out:
	dsl_dataset_rele(tosnap, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (error);
}
/*
 * Sync the currently open TXG to disk for the specified pool.
 * This is somewhat similar to 'zfs_sync()'.
 * For cases that do not result in error this ioctl will wait for
 * the currently open TXG to commit before returning back to the caller.
 *
 * innvl: {
 *  "force" -> when true, force uberblock update even if there is no dirty data.
 *             In addition this will cause the vdev configuration to be written
 *             out including updating the zpool cache file. (boolean_t)
 * }
 *
 * onvl is unused
 *
 * Returns the spa_open() error when the pool cannot be opened, EINVAL when
 * "force" is missing or is not a boolean value, and 0 otherwise.
 */
/* ARGSUSED */
static int
zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
{
	int err;
	boolean_t force = B_FALSE;
	spa_t *spa;

	if ((err = spa_open(pool, &spa, FTAG)) != 0)
		return (err);

	/*
	 * innvl comes from the caller of the ioctl, so a missing or mistyped
	 * "force" must be reported as an error.  The fnvlist_*() lookups
	 * assert on failure and are therefore not suitable here.
	 */
	if (nvlist_lookup_boolean_value(innvl, "force", &force) != 0) {
		spa_close(spa, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (force) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
		vdev_config_dirty(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_close(spa, FTAG);

	return (err);
}
/*
 * Table of registered ioctl handlers, indexed by (ioc - ZFS_IOC_FIRST).
 * Entries are filled in by zfs_ioctl_register() and
 * zfs_ioctl_register_legacy().
 */
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
/*
 * Register a handler that uses the legacy (zfs_cmd_t based) signature.
 *
 * The slot for 'ioc' must not have been registered before, with either
 * kind of handler.  Note that 'log_history' is stored as the slot's
 * zvec_allow_log flag.
 */
static void
zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
    boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
{
	zfs_ioc_vec_t *slot;

	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
	ASSERT3U(ioc, <, ZFS_IOC_LAST);

	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
	ASSERT3P(slot->zvec_func, ==, NULL);

	slot->zvec_legacy_func = func;
	slot->zvec_secpolicy = secpolicy;
	slot->zvec_namecheck = namecheck;
	slot->zvec_allow_log = log_history;
	slot->zvec_pool_check = pool_check;
}
/*
 * Register a handler that uses the nvlist-based signature.
 *
 * See the block comment at the beginning of this file for details on
 * each argument to this function.  The slot for 'ioc' must not have been
 * registered before, and an ioctl that allows logging must perform a name
 * check.
 */
static void
zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
    zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
    boolean_t allow_log)
{
	zfs_ioc_vec_t *slot;

	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
	ASSERT3U(ioc, <, ZFS_IOC_LAST);

	slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
	ASSERT3P(slot->zvec_legacy_func, ==, NULL);
	ASSERT3P(slot->zvec_func, ==, NULL);

	/* if we are logging, the name must be valid */
	ASSERT(!allow_log || namecheck != NO_NAME);

	slot->zvec_name = name;
	slot->zvec_func = func;
	slot->zvec_secpolicy = secpolicy;
	slot->zvec_namecheck = namecheck;
	slot->zvec_pool_check = pool_check;
	slot->zvec_smush_outnvlist = smush_outnvlist;
	slot->zvec_allow_log = allow_log;
}
/*
 * Register a legacy ioctl whose name argument is checked as a pool name.
 */
static void
zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
    zfs_ioc_poolcheck_t pool_check)
{
	const zfs_ioc_namecheck_t namecheck = POOL_NAME;

	zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
	    log_history, pool_check);
}
/*
 * Register a legacy ioctl whose name argument is checked as a dataset name
 * and which is never written to the history log.
 */
static void
zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
{
	const boolean_t log_history = B_FALSE;

	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
	    log_history, pool_check);
}
/*
 * Register a legacy, logged, pool-name ioctl guarded by
 * zfs_secpolicy_config that is refused on suspended or read-only pools.
 */
static void
zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
{
	const zfs_ioc_poolcheck_t pool_check =
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY;

	zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
	    POOL_NAME, B_TRUE, pool_check);
}
/*
 * Register a legacy ioctl that performs no name check, no pool status
 * check, and is never written to the history log.
 */
static void
zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy)
{
	const boolean_t log_history = B_FALSE;

	zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME,
	    log_history, POOL_CHECK_NONE);
}
/*
 * Register a legacy, unlogged, dataset-name ioctl with a caller-supplied
 * security policy; it is refused only when the pool is suspended.
 */
static void
zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
    zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
{
	const boolean_t log_history = B_FALSE;

	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
	    log_history, POOL_CHECK_SUSPENDED);
}
/*
 * Same as zfs_ioctl_register_dataset_read_secpolicy(), using
 * zfs_secpolicy_read as the security policy.
 */
static void
zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
{
	zfs_secpolicy_func_t *secpolicy = zfs_secpolicy_read;

	zfs_ioctl_register_dataset_read_secpolicy(ioc, func, secpolicy);
}
/*
 * Register a legacy, logged, dataset-name ioctl that is refused on
 * suspended or read-only pools.
 */
static void
zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy)
{
	const zfs_ioc_poolcheck_t pool_check =
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY;

	zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME,
	    B_TRUE, pool_check);
}
/*
 * Populate zfs_ioc_vec with every ioctl handled by this driver.
 *
 * Each ioctl may be registered only once; the register functions assert
 * that the slot is still empty.
 *
 * Argument order for zfs_ioctl_register() is:
 * name, ioc, func, secpolicy, namecheck, pool_check, smush_outnvlist,
 * allow_log.
 */
static void
zfs_ioctl_init(void)
{
/* IOCTLS that use the nvlist-based function signature */
zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("create", ZFS_IOC_CREATE,
zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("clone", ZFS_IOC_CLONE,
zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("remap", ZFS_IOC_REMAP,
zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("hold", ZFS_IOC_HOLD,
zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("release", ZFS_IOC_RELEASE,
zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
zfs_ioc_channel_program, zfs_secpolicy_config,
POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
B_TRUE);
zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("zpool_discard_checkpoint",
ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
/* Logged pool operations refused on suspended or read-only pools. */
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
zfs_ioc_pool_scan);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
zfs_ioc_pool_upgrade);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
zfs_ioc_vdev_add);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
zfs_ioc_vdev_remove);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
zfs_ioc_vdev_set_state);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
zfs_ioc_vdev_attach);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
zfs_ioc_vdev_detach);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
zfs_ioc_vdev_setpath);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
zfs_ioc_vdev_setfru);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
zfs_ioc_pool_set_props);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
zfs_ioc_vdev_split);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
zfs_ioc_pool_reguid);
/* Operations registered with no name check and no pool status check. */
zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
zfs_ioc_pool_configs, zfs_secpolicy_none);
zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
zfs_ioc_pool_tryimport, zfs_secpolicy_config);
zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
zfs_ioc_inject_fault, zfs_secpolicy_inject);
zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
zfs_ioc_clear_fault, zfs_secpolicy_inject);
zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
zfs_ioc_inject_list_next, zfs_secpolicy_inject);
/*
 * Pool destroy and export are not logged to the history by
 * zfsdev_ioctl; instead zfs_ioc_pool_export does the logging
 * of those commands.
 */
zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
zfs_ioc_dsobj_to_dsname,
zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
zfs_ioc_pool_get_history,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
/* Unlogged dataset operations using zfs_secpolicy_read. */
zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
zfs_ioc_space_written);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
zfs_ioc_objset_recvd_props);
zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
zfs_ioc_next_obj);
zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
zfs_ioc_get_fsacl);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
zfs_ioc_objset_stats);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
zfs_ioc_objset_zplprops);
zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
zfs_ioc_dataset_list_next);
zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
zfs_ioc_snapshot_list_next);
zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
zfs_ioc_send_progress);
/* Unlogged dataset operations with their own security policy. */
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
zfs_ioc_diff, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
zfs_ioc_obj_to_path, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
zfs_ioc_send, zfs_secpolicy_send);
/* Logged dataset operations refused on suspended or read-only pools. */
zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
zfs_secpolicy_none);
zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
zfs_secpolicy_destroy);
zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
zfs_secpolicy_recv);
zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
zfs_secpolicy_promote);
zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
zfs_secpolicy_set_fsacl);
/*
 * Not using zfs_ioctl_register_dataset_modify as DATASET_NAME check
 * won't allow a bookmark name.
 */
zfs_ioctl_register_legacy(ZFS_IOC_RENAME, zfs_ioc_rename,
zfs_secpolicy_rename, ENTITY_NAME, B_TRUE,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
/* Dataset operations that are never written to the history log. */
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
zfs_secpolicy_share, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
#ifdef __FreeBSD__
/* FreeBSD-only ioctls. */
zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
zfs_secpolicy_config, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
zfs_secpolicy_config, POOL_CHECK_NONE);
zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
POOL_CHECK_NONE, B_FALSE, B_FALSE);
#endif
}
/*
 * Check whether the pool that 'name' belongs to is in a state that permits
 * the requested operation.
 *
 * Returns 0 when no check is requested or all requested checks pass, EAGAIN
 * when POOL_CHECK_SUSPENDED is requested and the pool is suspended, EROFS
 * when POOL_CHECK_READONLY is requested and the pool is not writeable, or
 * the error from spa_open().
 */
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type,
    zfs_ioc_poolcheck_t check)
{
	spa_t *spa;
	int error;

	ASSERT(type == POOL_NAME || type == DATASET_NAME ||
	    type == ENTITY_NAME);

	if (check & POOL_CHECK_NONE)
		return (0);

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);

	/* The suspended check takes precedence over the read-only check. */
	if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) {
		error = SET_ERROR(EAGAIN);
	} else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) {
		error = SET_ERROR(EROFS);
	}

	spa_close(spa, FTAG);
	return (error);
}
/*
 * Find a free minor number.
 *
 * The search starts just after the most recently allocated minor and wraps
 * from ZFSDEV_MAX_MINOR back to 1.  Returns the minor that was found, or 0
 * when the search comes back around to its starting point.  The caller must
 * hold spa_namespace_lock.
 */
minor_t
zfsdev_minor_alloc(void)
{
	static minor_t last_minor;
	minor_t candidate;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	candidate = last_minor + 1;
	while (candidate != last_minor) {
		if (candidate > ZFSDEV_MAX_MINOR)
			candidate = 1;
		if (ddi_get_soft_state(zfsdev_state, candidate) == NULL) {
			last_minor = candidate;
			return (candidate);
		}
		candidate++;
	}

	return (0);
}
static int
zfs_ctldev_init(struct cdev *devp)
{
minor_t minor;
zfs_soft_state_t *zs;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
minor = zfsdev_minor_alloc();
if (minor == 0)
return (SET_ERROR(ENXIO));
if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
return (SET_ERROR(EAGAIN));
devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
zs = ddi_get_soft_state(zfsdev_state, minor);
zs->zss_type = ZSST_CTLDEV;
zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
return (0);
}
/*
 * Tear down the per-open state of the control device: destroy the onexit
 * list 'zo' and free the soft state of 'minor'.  The caller must hold
 * spa_namespace_lock.
 */
static void
zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	zfs_onexit_destroy(zo);
	ddi_soft_state_free(zfsdev_state, minor);
}
/*
 * Return the data pointer stored in the soft state of 'minor', or NULL when
 * the minor has no soft state or its type is not 'which'.
 */
void *
zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
{
	zfs_soft_state_t *state = ddi_get_soft_state(zfsdev_state, minor);

	if (state != NULL && state->zss_type == which)
		return (state->zss_data);

	return (NULL);
}
/*
 * Open handler for the ZFS control device.
 *
 * Per-open state is only set up when the device is opened with FEXCL; any
 * other open succeeds without allocating anything.
 */
static int
zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
{
	int error;

#ifdef illumos
	if (getminor(*devp) != 0)
		return (zvol_open(devp, flag, otyp, cr));
#endif

	/* This is the control device. Allocate a new minor if requested. */
	if ((flag & FEXCL) == 0)
		return (0);

	mutex_enter(&spa_namespace_lock);
	error = zfs_ctldev_init(devp);
	mutex_exit(&spa_namespace_lock);

	return (error);
}
/*
 * cdevpriv destructor for the control device.
 *
 * 'data' carries the minor number stored by zfs_ctldev_init().  Nothing is
 * done for minor 0 or for a minor that has no control-device state.
 */
static void
zfsdev_close(void *data)
{
	minor_t minor = (minor_t)(uintptr_t)data;
	zfs_onexit_t *zo;

	if (minor == 0)
		return;

	mutex_enter(&spa_namespace_lock);
	zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
	if (zo != NULL)
		zfs_ctldev_destroy(zo, minor);
	mutex_exit(&spa_namespace_lock);
}
/*
 * ioctl entry point for the control device.
 *
 * Outline of the non-illumos path:
 *  1. Classify the request by IOCPARM_LEN(zcmd): a zfs_iocparm_t-sized
 *     argument is a "new" style request, any other recognized size is an
 *     older zfs_cmd layout identified by cflag.
 *  2. Build a current-format zfs_cmd_t in `zc', copying in and converting
 *     older layouts through zfs_cmd_compat_get()/zfs_ioctl_compat_pre().
 *  3. Unpack the input nvlist, validate zc_name according to the vector's
 *     zvec_namecheck, and run the vector's security policy.
 *  4. Dispatch to zvec_func (nvlist based) or zvec_legacy_func.
 *  5. At `out', copy the result back in the layout the caller used and
 *     either hand saved_poolname to the zfs_allow_log_key TSD slot or free
 *     it.
 *
 * Returns 0 or an errno value.
 */
static int
zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
struct thread *td)
{
zfs_cmd_t *zc;
uint_t vecnum;
int error, rc, len;
#ifdef illumos
minor_t minor = getminor(dev);
#else
zfs_iocparm_t *zc_iocparm;
int cflag, cmd, oldvecnum;
boolean_t newioc, compat;
void *compat_zc = NULL;
cred_t *cr = td->td_ucred;
#endif
const zfs_ioc_vec_t *vec;
char *saved_poolname = NULL;
nvlist_t *innvl = NULL;
cflag = ZFS_CMD_COMPAT_NONE;
compat = B_FALSE;
newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */
/* Argument size and command number are both taken from zcmd. */
len = IOCPARM_LEN(zcmd);
vecnum = cmd = zcmd & 0xff;
/*
* Check if we are talking to supported older binaries
* and translate zfs_cmd if necessary
*/
if (len != sizeof(zfs_iocparm_t)) {
newioc = B_FALSE;
compat = B_TRUE;
vecnum = cmd;
/* The argument size selects which legacy layout is in use. */
switch (len) {
case sizeof(zfs_cmd_zcmd_t):
cflag = ZFS_CMD_COMPAT_LZC;
break;
case sizeof(zfs_cmd_deadman_t):
cflag = ZFS_CMD_COMPAT_DEADMAN;
break;
case sizeof(zfs_cmd_v28_t):
cflag = ZFS_CMD_COMPAT_V28;
break;
case sizeof(zfs_cmd_v15_t):
/* Bounds-check cmd before indexing the v15 translation table. */
if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
sizeof(zfs_ioctl_v15_to_v28[0]))
return (EINVAL);
cflag = ZFS_CMD_COMPAT_V15;
vecnum = zfs_ioctl_v15_to_v28[cmd];
/*
* Return without further handling
* if the command is blacklisted.
*/
if (vecnum == ZFS_IOC_COMPAT_PASS)
return (0);
else if (vecnum == ZFS_IOC_COMPAT_FAIL)
return (ENOTSUP);
break;
default:
return (EINVAL);
}
}
#ifdef illumos
vecnum = cmd - ZFS_IOC_FIRST;
ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
#endif
/* Reject vector numbers outside zfs_ioc_vec before indexing it. */
if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
return (SET_ERROR(EINVAL));
vec = &zfs_ioc_vec[vecnum];
/* From here on every exit goes through `out', which frees zc. */
zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
#ifdef illumos
error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
#else /* !illumos */
bzero(zc, sizeof(zfs_cmd_t));
if (newioc) {
/*
* New style: arg is a zfs_iocparm_t naming the ioctl version, the
* size of the caller's command structure and its address.  Each
* version is only accepted with its matching structure size.
*/
zc_iocparm = (void *)arg;
switch (zc_iocparm->zfs_ioctl_version) {
case ZFS_IOCVER_CURRENT:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
error = SET_ERROR(EINVAL);
goto out;
}
break;
case ZFS_IOCVER_INLANES:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_INLANES;
break;
case ZFS_IOCVER_RESUME:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_RESUME;
break;
case ZFS_IOCVER_EDBP:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_EDBP;
break;
case ZFS_IOCVER_ZCMD:
/* This version accepts a range of sizes rather than one size. */
if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_ZCMD;
break;
default:
error = SET_ERROR(EINVAL);
goto out;
/* NOTREACHED */
}
if (compat) {
/*
* Older layout: copy it into a scratch buffer of sizeof(zfs_cmd_t)
* bytes; it is converted into zc below.
*/
ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
bzero(compat_zc, sizeof(zfs_cmd_t));
error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
compat_zc, zc_iocparm->zfs_cmd_size, flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
} else {
/* Current layout: copy straight into zc. */
error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
zc, zc_iocparm->zfs_cmd_size, flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
}
}
if (compat) {
/*
* Convert the older command into zc.  For new-style requests the
* source is the scratch copy; otherwise it is arg itself.
*/
if (newioc) {
ASSERT(compat_zc != NULL);
zfs_cmd_compat_get(zc, compat_zc, cflag);
} else {
ASSERT(compat_zc == NULL);
zfs_cmd_compat_get(zc, arg, cflag);
}
/* The pre-hook may change vecnum; re-select the vector if it did. */
oldvecnum = vecnum;
error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
if (error != 0)
goto out;
if (oldvecnum != vecnum)
vec = &zfs_ioc_vec[vecnum];
}
#endif /* !illumos */
zc->zc_iflags = flag & FKIOCTL;
/* Unpack the input nvlist only when the caller supplied one. */
if (zc->zc_nvlist_src_size != 0) {
error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &innvl);
if (error != 0)
goto out;
}
/* rewrite innvl for backwards compatibility */
if (compat)
innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
/*
* Ensure that all pool/dataset names are valid before we pass down to
* the lower layers.
*/
zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
switch (vec->zvec_namecheck) {
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
error = SET_ERROR(EINVAL);
else
error = pool_status_check(zc->zc_name,
vec->zvec_namecheck, vec->zvec_pool_check);
break;
case DATASET_NAME:
if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
error = SET_ERROR(EINVAL);
else
error = pool_status_check(zc->zc_name,
vec->zvec_namecheck, vec->zvec_pool_check);
break;
case ENTITY_NAME:
if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) {
error = SET_ERROR(EINVAL);
} else {
error = pool_status_check(zc->zc_name,
vec->zvec_namecheck, vec->zvec_pool_check);
}
break;
case NO_NAME:
break;
}
/* The security policy runs only if the name checks passed. */
if (error == 0)
error = vec->zvec_secpolicy(zc, innvl, cr);
if (error != 0)
goto out;
/* legacy ioctls can modify zc_name */
/* Save the leading component of zc_name (up to the first '/', '@' or '#'). */
len = strcspn(zc->zc_name, "/@#") + 1;
saved_poolname = kmem_alloc(len, KM_SLEEP);
(void) strlcpy(saved_poolname, zc->zc_name, len);
if (vec->zvec_func != NULL) {
nvlist_t *outnvl;
int puterror = 0;
spa_t *spa;
nvlist_t *lognv = NULL;
ASSERT(vec->zvec_legacy_func == NULL);
/*
* Add the innvl to the lognv before calling the func,
* in case the func changes the innvl.
*/
if (vec->zvec_allow_log) {
lognv = fnvlist_alloc();
fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
vec->zvec_name);
if (!nvlist_empty(innvl)) {
fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
innvl);
}
}
outnvl = fnvlist_alloc();
error = vec->zvec_func(zc->zc_name, innvl, outnvl);
/*
* Some commands can partially execute, modify state, and still
* return an error. In these cases, attempt to record what
* was modified.
*/
if ((error == 0 ||
(cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
vec->zvec_allow_log &&
spa_open(zc->zc_name, &spa, FTAG) == 0) {
if (!nvlist_empty(outnvl)) {
fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
outnvl);
}
if (error != 0) {
fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
error);
}
(void) spa_history_log_nvl(spa, lognv);
spa_close(spa, FTAG);
}
fnvlist_free(lognv);
/* rewrite outnvl for backwards compatibility */
if (compat)
outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
cflag);
/*
* Hand the output nvlist back when there is something to return or
* the caller provided a destination buffer.  put_nvlist() is skipped
* if smushing was requested and failed.
*/
if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
int smusherror = 0;
if (vec->zvec_smush_outnvlist) {
smusherror = nvlist_smush(outnvl,
zc->zc_nvlist_dst_size);
}
if (smusherror == 0)
puterror = put_nvlist(zc, outnvl);
}
/* A put_nvlist() failure overrides the handler's result. */
if (puterror != 0)
error = puterror;
nvlist_free(outnvl);
} else {
error = vec->zvec_legacy_func(zc);
}
out:
/* Common exit: release the input nvlist and copy zc back to the caller. */
nvlist_free(innvl);
#ifdef illumos
rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
#else
if (compat) {
/* Convert zc back into the layout the caller used. */
zfs_ioctl_compat_post(zc, cmd, cflag);
if (newioc) {
ASSERT(compat_zc != NULL);
ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
rc = ddi_copyout(compat_zc,
(void *)(uintptr_t)zc_iocparm->zfs_cmd,
zc_iocparm->zfs_cmd_size, flag);
/* A copyout failure is reported only if nothing failed earlier. */
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
kmem_free(compat_zc, sizeof (zfs_cmd_t));
} else {
zfs_cmd_compat_put(zc, arg, vecnum, cflag);
}
} else {
ASSERT(newioc);
rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
}
#endif
/*
* On success of a loggable command, replace any previous string in the
* zfs_allow_log_key TSD slot with saved_poolname; otherwise free it.
*/
if (error == 0 && vec->zvec_allow_log) {
char *s = tsd_get(zfs_allow_log_key);
if (s != NULL)
strfree(s);
(void) tsd_set(zfs_allow_log_key, saved_poolname);
} else {
if (saved_poolname != NULL)
strfree(saved_poolname);
}
kmem_free(zc, sizeof (zfs_cmd_t));
return (error);
}
#ifdef illumos
/*
 * DDI attach entry point (illumos only).  Accepts only DDI_ATTACH, creates
 * the "zfs" character minor node, records the dev_info pointer in zfs_dip
 * and reports the device.
 */
static int
zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* The minor node is created only when cmd is DDI_ATTACH. */
	if (cmd != DDI_ATTACH ||
	    ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
	    DDI_PSEUDO, 0) == DDI_FAILURE) {
		return (DDI_FAILURE);
	}

	zfs_dip = dip;
	ddi_report_dev(dip);

	return (DDI_SUCCESS);
}
/*
 * DDI detach entry point (illumos only).  Fails while any of spa, zfs or
 * zvol reports busy, or when cmd is not DDI_DETACH; otherwise clears
 * zfs_dip and removes the node's properties and minor nodes.
 */
static int
zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	/* Busy checks are evaluated first, in the same order as before. */
	if (spa_busy() || zfs_busy() || zvol_busy() || cmd != DDI_DETACH)
		return (DDI_FAILURE);

	zfs_dip = NULL;

	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	return (DDI_SUCCESS);
}
/*
 * DDI getinfo entry point (illumos only): answers the dev_t -> dev_info
 * query with zfs_dip and the dev_t -> instance query with instance 0.
 * Any other query fails.
 */
/*ARGSUSED*/
static int
zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
		*result = zfs_dip;
		return (DDI_SUCCESS);
	}

	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
		*result = (void *)0;
		return (DDI_SUCCESS);
	}

	return (DDI_FAILURE);
}
#endif /* illumos */
/*
* OK, so this is a little weird.
*
* /dev/zfs is the control node, i.e. minor 0.
* /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
*
* /dev/zfs has basically nothing to do except serve up ioctls,
* so most of the standard driver entry points are in zvol.c.
*/
#ifdef illumos
/*
 * Character/block entry points (illumos only).  open, close and ioctl are
 * handled here; strategy, dump, read and write are the zvol routines.
 */
static struct cb_ops zfs_cb_ops = {
zfsdev_open, /* open */
zfsdev_close, /* close */
zvol_strategy, /* strategy */
nodev, /* print */
zvol_dump, /* dump */
zvol_read, /* read */
zvol_write, /* write */
zfsdev_ioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* prop_op */
NULL, /* streamtab */
D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
CB_REV, /* version */
nodev, /* async read */
nodev, /* async write */
};
/*
 * Device operations (illumos only): wires zfs_info/zfs_attach/zfs_detach
 * and the cb_ops table above into the driver framework.
 */
static struct dev_ops zfs_dev_ops = {
DEVO_REV, /* version */
0, /* refcnt */
zfs_info, /* info */
nulldev, /* identify */
nulldev, /* probe */
zfs_attach, /* attach */
zfs_detach, /* detach */
nodev, /* reset */
&zfs_cb_ops, /* driver operations */
NULL, /* no bus operations */
NULL, /* power */
ddi_quiesce_not_needed, /* quiesce */
};
/* Driver linkage record (illumos only): description string plus dev_ops. */
static struct modldrv zfs_modldrv = {
&mod_driverops,
"ZFS storage pool",
&zfs_dev_ops
};
/*
 * Module linkage (illumos only): a NULL-terminated list holding the
 * filesystem linkage (zfs_modlfs) and the driver linkage (zfs_modldrv).
 */
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&zfs_modlfs,
(void *)&zfs_modldrv,
NULL
};
#endif /* illumos */
/*
 * Character device switch for the control device.  Only open and ioctl are
 * provided; there is no d_close because per-open cleanup is registered as a
 * cdevpriv destructor (zfsdev_close) in zfs_ctldev_init().
 */
static struct cdevsw zfs_cdevsw = {
.d_version = D_VERSION,
.d_open = zfsdev_open,
.d_ioctl = zfsdev_ioctl,
.d_name = ZFS_DEV_NAME
};
/*
 * TSD destructor for zfs_allow_log_key: the slot holds a pool-name string,
 * which is released with strfree().
 */
static void
zfs_allow_log_destroy(void *arg)
{
	strfree((char *)arg);
}
/*
 * Create the control device node named ZFS_DEV_NAME (owner root, group
 * operator, mode 0666) and remember it in `zfsdev'.
 */
static void
zfsdev_init(void)
{
zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
ZFS_DEV_NAME);
}
/*
 * Remove the control device node, if zfsdev_init() created one.
 */
static void
zfsdev_fini(void)
{
	if (zfsdev == NULL)
		return;

	destroy_dev(zfsdev);
}
static struct root_hold_token *zfs_root_token;
#ifdef illumos
/*
 * Module load entry point (illumos only).  Initializes the SPA, ZFS, zvol
 * and ioctl layers, installs the module, and then creates the TSD keys,
 * the LDI identifier and zfs_share_lock.  If mod_install() fails the zvol,
 * zfs and spa layers are torn down again and its error is returned.
 */
int
_init(void)
{
int error;
spa_init(FREAD | FWRITE);
zfs_init();
zvol_init();
zfs_ioctl_init();
if ((error = mod_install(&modlinkage)) != 0) {
/* Undo the initialization above in reverse order. */
zvol_fini();
zfs_fini();
spa_fini();
return (error);
}
tsd_create(&zfs_fsyncer_key, NULL);
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
error = ldi_ident_from_mod(&modlinkage, &zfs_li);
ASSERT(error == 0);
mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
/*
 * Module unload entry point (illumos only).  Refuses with EBUSY while any
 * layer is busy or zio injection is enabled, removes the module, tears down
 * the zvol/zfs/spa layers, closes the sharing modules that were opened and
 * releases the remaining resources.
 */
int
_fini(void)
{
int error;
if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
return (SET_ERROR(EBUSY));
if ((error = mod_remove(&modlinkage)) != 0)
return (error);
zvol_fini();
zfs_fini();
spa_fini();
if (zfs_nfsshare_inited)
(void) ddi_modclose(nfs_mod);
if (zfs_smbshare_inited)
(void) ddi_modclose(smbsrv_mod);
/* sharefs is closed if either share type was initialized. */
if (zfs_nfsshare_inited || zfs_smbshare_inited)
(void) ddi_modclose(sharefs_mod);
/* NOTE(review): only zfs_fsyncer_key is destroyed here; _init() creates three keys. */
tsd_destroy(&zfs_fsyncer_key);
ldi_ident_release(zfs_li);
zfs_li = NULL;
mutex_destroy(&zfs_share_lock);
/* error is 0 here: mod_remove() succeeded above. */
return (error);
}
/* Module info entry point (illumos only): delegates to mod_info(). */
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
#endif /* illumos */
static int zfs__init(void);
static int zfs__fini(void);
static void zfs_shutdown(void *, int);
static eventhandler_tag zfs_shutdown_event_tag;
#ifdef __FreeBSD__
#define ZFS_MIN_KSTACK_PAGES 4
#endif
/*
 * Module initialization.  Holds off root mounting while the SPA, ZFS, zvol
 * and ioctl layers and the TSD keys are set up, releases the hold, and then
 * creates the control device.  Always returns 0.
 */
int
zfs__init(void)
{
#ifdef __FreeBSD__
#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
/* Compile-time check: warn when the kernel stack is below the ZFS minimum. */
printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
"overflow panic!\nPlease consider adding "
"'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
ZFS_MIN_KSTACK_PAGES);
#endif
#endif
zfs_root_token = root_mount_hold("ZFS");
mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
spa_init(FREAD | FWRITE);
zfs_init();
zvol_init();
zfs_ioctl_init();
/* Thread-specific data keys; two of them register destructors. */
tsd_create(&zfs_fsyncer_key, NULL);
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
tsd_create(&zfs_geom_probe_vdev_key, NULL);
printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
root_mount_rel(zfs_root_token);
zfsdev_init();
return (0);
}
int
zfs__fini(void)
{
if (spa_busy() || zfs_busy() || zvol_busy() ||
zio_injection_enabled) {
return (EBUSY);
}
zfsdev_fini();
zvol_fini();
zfs_fini();
spa_fini();
tsd_destroy(&zfs_fsyncer_key);
tsd_destroy(&rrw_tsd_key);
tsd_destroy(&zfs_allow_log_key);
mutex_destroy(&zfs_share_lock);
return (0);
}
/*
 * shutdown_post_sync event handler: runs the module teardown unless the
 * kernel has panicked.  Both arguments are unused.  The '-' and '+' lines
 * below are the change carried by this diff.
 */
static void
zfs_shutdown(void *arg __unused, int howto __unused)
{
/*
* ZFS fini routines can not properly work in a panic-ed system.
*/
- if (panicstr == NULL)
+ if (!KERNEL_PANICKED())
(void)zfs__fini();
}
/*
 * Module event handler.
 *
 * MOD_LOAD initializes ZFS and, on success, registers zfs_shutdown() for
 * the shutdown_post_sync event.  MOD_UNLOAD tears ZFS down and, on success,
 * deregisters that handler if one was registered.  MOD_SHUTDOWN is
 * accepted as a no-op; any other event yields EOPNOTSUPP.
 */
static int
zfs_modevent(module_t mod, int type, void *unused __unused)
{
	int rc = EOPNOTSUPP;

	switch (type) {
	case MOD_LOAD:
		rc = zfs__init();
		if (rc == 0) {
			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
			    shutdown_post_sync, zfs_shutdown, NULL,
			    SHUTDOWN_PRI_FIRST);
		}
		break;
	case MOD_UNLOAD:
		rc = zfs__fini();
		if (rc == 0 && zfs_shutdown_event_tag != NULL) {
			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
			    zfs_shutdown_event_tag);
		}
		break;
	case MOD_SHUTDOWN:
		rc = 0;
		break;
	default:
		break;
	}

	return (rc);
}
/* Module descriptor: name, event handler and (unused) private data. */
static moduledata_t zfs_mod = {
"zfsctrl",
zfs_modevent,
0
};
/* Register the module, its version and the modules it depends on. */
DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(zfsctrl, 1);
MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 356654)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 356655)
@@ -1,2794 +1,2794 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek .
* All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "zfs_comutil.h"
/* Mutex named "zfs_debug", initialized at boot through MTX_SYSINIT. */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
/* vfs.zfs.super_owner: read-write integer knob. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
"File system owner can perform privileged operation on his file systems");
/* vfs.zfs.debug: read-write integer that is also a loader tunable (CTLFLAG_RWTUN). */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
"Debug level");
/* vfs.zfs.version.*: read-only values exporting compile-time version constants. */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
"ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
"SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
"ZPL_VERSION");
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);
/*
 * VFS operations vector for ZFS.  Root lookups go through vfs_cache_root,
 * with zfs_root supplied as the vfs_cachedroot callback.
 */
struct vfsops zfs_vfsops = {
.vfs_mount = zfs_mount,
.vfs_unmount = zfs_umount,
.vfs_root = vfs_cache_root,
.vfs_cachedroot = zfs_root,
.vfs_statfs = zfs_statfs,
.vfs_vget = zfs_vget,
.vfs_sync = zfs_sync,
.vfs_checkexp = zfs_checkexp,
.vfs_fhtovp = zfs_fhtovp,
.vfs_quotactl = zfs_quotactl,
};
/* Register the "zfs" filesystem type with the jail and delegated-admin flags. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
/*
* We need to keep a count of active fs's.
* This is necessary to prevent our module
* from being unloaded after a umount -f
*/
static uint32_t zfs_active_fs_count = 0;
/*
 * Fill in `dqp' with the quota and current usage of user or group `id'.
 *
 * The id is formatted as a hexadecimal string and used as the key for two
 * ZAP lookups: the quota object (user or group, chosen by `isgroup') gives
 * the block limits and the matching "used" object gives the current usage.
 *
 * Returns EINVAL when no quota object exists or the filesystem is
 * replaying, otherwise the result of the ZAP lookups.
 */
static int
zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
	int error = 0;
	char buf[32];
	uint64_t usedobj, quotaobj;
	uint64_t quota, used = 0;
	timespec_t now;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay) {
		error = EINVAL;
		goto done;
	}

	/* Bounded formatting: never write past the end of buf. */
	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
	    buf, sizeof (quota), 1, &quota)) != 0) {
		dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__);
		goto done;
	}

	/*
	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
	 * So we set them to be the same.
	 */
	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);

	/*
	 * A missing usage entry (ENOENT) is tolerated here and leaves
	 * `used' at 0.
	 * NOTE(review): in that case ENOENT is still returned at `done',
	 * so the caller does not copy the result out -- confirm whether
	 * that is intended.
	 */
	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
	if (error && error != ENOENT) {
		dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error);
		goto done;
	}
	dqp->dqb_curblocks = btodb(used);
	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;

	vfs_timestamp(&now);
	/*
	 * Setting this to 0 causes FreeBSD quota(8) to print
	 * the number of days since the epoch, which isn't
	 * particularly useful.
	 */
	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
done:
	return (error);
}
/*
 * VFS quotactl entry point.
 *
 * `cmds' packs the quota command (upper bits, SUBCMDSHIFT) and the quota
 * type (SUBCMDMASK).  An id of -1 means "the caller": the real uid or gid
 * of the current thread, depending on the type.  `arg' is a user-space
 * buffer whose meaning depends on the command.
 *
 * Supported commands: Q_GETQUOTASIZE, Q_QUOTAON (accepted, no-op),
 * Q_QUOTAOFF (ENOTSUP), Q_SETQUOTA and Q_GETQUOTA.  Everything else
 * returns EINVAL.
 */
static int
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	ZFS_ENTER(zfsvfs);
	if (id == -1) {
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_fuid_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	if ((u_int)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	switch (cmd) {
	case Q_GETQUOTASIZE:
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof(int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
		vfs_unbusy(vfsp);
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
		vfs_unbusy(vfsp);
		break;
	case Q_SETQUOTA:
		/*
		 * copyin() takes the user address first and the kernel
		 * destination second: read the caller's dqblk from `arg'.
		 */
		error = copyin(arg, &dqblk, sizeof(dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof(dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
 * VFS sync entry point.  Does nothing after a panic or for lazy
 * (MNT_LAZY) requests.  With a specific filesystem it runs the standard
 * sync and commits that filesystem's intent log, if it has one; with a NULL
 * vfsp it syncs all pools.  The '-' and '+' lines below are the change
 * carried by this diff.
 */
/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{
/*
* Data integrity is job one. We don't want a compromised kernel
* writing to the storage pool, so we never sync during panic.
*/
- if (panicstr)
+ if (KERNEL_PANICKED())
return (0);
/*
* Ignore the system syncher. ZFS already commits async data
* at zfs_txg_timeout intervals.
*/
if (waitfor == MNT_LAZY)
return (0);
if (vfsp != NULL) {
/*
* Sync a specific filesystem.
*/
zfsvfs_t *zfsvfs = vfsp->vfs_data;
dsl_pool_t *dp;
int error;
error = vfs_stdsync(vfsp, waitfor);
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
dp = dmu_objset_pool(zfsvfs->z_os);
/*
* If the system is shutting down, then skip any
* filesystems which may exist on a suspended pool.
*/
if (sys_shutdown && spa_suspended(dp->dp_spa)) {
ZFS_EXIT(zfsvfs);
return (0);
}
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, 0);
ZFS_EXIT(zfsvfs);
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
* run sync(1M). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
spa_sync_allpools();
}
return (0);
}
#ifndef __FreeBSD_kernel__
/*
 * Pick a device number that is not currently mounted and store it in *dev.
 *
 * The inner loop advances zfs_minor (wrapping at MAXMIN32) until
 * vfs_devismounted() reports the device free or the search returns to its
 * starting minor.  If the whole minor space is in use, a new major number
 * is obtained with getudev() and the search restarts.
 *
 * Returns 0 on success, -1 if no new major number can be obtained.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
major_t new_major;
do {
ASSERT3U(zfs_minor, <=, MAXMIN32);
minor_t start = zfs_minor;
do {
/* zfs_major/zfs_minor are updated under zfs_dev_mtx. */
mutex_enter(&zfs_dev_mtx);
if (zfs_minor >= MAXMIN32) {
/*
* If we're still using the real major
* keep out of /dev/zfs and /dev/zvol minor
* number space. If we're using a getudev()'ed
* major number, we can use all of its minors.
*/
if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
zfs_minor = ZFS_MIN_MINOR;
else
zfs_minor = 0;
} else {
zfs_minor++;
}
*dev = makedevice(zfs_major, zfs_minor);
mutex_exit(&zfs_dev_mtx);
} while (vfs_devismounted(*dev) && zfs_minor != start);
if (zfs_minor == start) {
/*
* We are using all ~262,000 minor numbers for the
* current major number. Create a new major number.
*/
if ((new_major = getudev()) == (major_t)-1) {
cmn_err(CE_WARN,
"zfs_mount: Can't get unique major "
"device number.");
return (-1);
}
mutex_enter(&zfs_dev_mtx);
zfs_major = new_major;
zfs_minor = 0;
mutex_exit(&zfs_dev_mtx);
} else {
/* A free device number was found. */
break;
}
/* CONSTANTCONDITION */
} while (1);
return (0);
}
#endif /* !__FreeBSD_kernel__ */
/*
 * Property callback for "atime": mirror the new value into z_atime, the
 * MNT_NOATIME mount flag and the atime/noatime mount options.
 */
static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval != TRUE) {
		/* atime disabled */
		zv->z_atime = FALSE;
		zv->z_vfs->vfs_flag |= MNT_NOATIME;
		vfs_clearmntopt(zv->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zv->z_vfs, MNTOPT_NOATIME, NULL, 0);
		return;
	}

	/* atime enabled */
	zv->z_atime = TRUE;
	zv->z_vfs->vfs_flag &= ~MNT_NOATIME;
	vfs_clearmntopt(zv->z_vfs, MNTOPT_NOATIME);
	vfs_setmntopt(zv->z_vfs, MNTOPT_ATIME, NULL, 0);
}
/*
 * Property callback for "xattr": switch the xattr/noxattr mount options.
 * Updating the VFS_XATTR flag is compiled in only when TODO is defined.
 */
static void
xattr_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval != TRUE) {
		/* XXX locking on vfs_flag? */
#ifdef TODO
		zv->z_vfs->vfs_flag &= ~VFS_XATTR;
#endif
		vfs_clearmntopt(zv->z_vfs, MNTOPT_XATTR);
		vfs_setmntopt(zv->z_vfs, MNTOPT_NOXATTR, NULL, 0);
		return;
	}

	/* XXX locking on vfs_flag? */
#ifdef TODO
	zv->z_vfs->vfs_flag |= VFS_XATTR;
#endif
	vfs_clearmntopt(zv->z_vfs, MNTOPT_NOXATTR);
	vfs_setmntopt(zv->z_vfs, MNTOPT_XATTR, NULL, 0);
}
/*
 * Property callback for "recordsize": after asserting that the new value
 * is a power of two between SPA_MINBLOCKSIZE and the pool's maximum block
 * size, store it as the maximum block size and the reported I/O size.
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zv->z_os)));
	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
	ASSERT(ISP2(newval));

	zv->z_max_blksz = newval;
	zv->z_vfs->mnt_stat.f_iosize = newval;
}
/*
 * Property callback for "readonly": set or clear VFS_RDONLY and switch the
 * ro/rw mount options to match.
 */
static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval == 0) {
		/* XXX locking on vfs_flag? */
		zv->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zv->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zv->z_vfs, MNTOPT_RW, NULL, 0);
		return;
	}

	/* XXX locking on vfs_flag? */
	zv->z_vfs->vfs_flag |= VFS_RDONLY;
	vfs_clearmntopt(zv->z_vfs, MNTOPT_RW);
	vfs_setmntopt(zv->z_vfs, MNTOPT_RO, NULL, 0);
}
/*
 * Property callback for "setuid": set or clear VFS_NOSETUID and switch the
 * setuid/nosetuid mount options to match.
 */
static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval != FALSE) {
		zv->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zv->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zv->z_vfs, MNTOPT_SETUID, NULL, 0);
		return;
	}

	zv->z_vfs->vfs_flag |= VFS_NOSETUID;
	vfs_clearmntopt(zv->z_vfs, MNTOPT_SETUID);
	vfs_setmntopt(zv->z_vfs, MNTOPT_NOSETUID, NULL, 0);
}
/*
 * Property callback for "exec": set or clear VFS_NOEXEC and switch the
 * exec/noexec mount options to match.
 */
static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval != FALSE) {
		zv->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zv->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zv->z_vfs, MNTOPT_EXEC, NULL, 0);
		return;
	}

	zv->z_vfs->vfs_flag |= VFS_NOEXEC;
	vfs_clearmntopt(zv->z_vfs, MNTOPT_EXEC);
	vfs_setmntopt(zv->z_vfs, MNTOPT_NOEXEC, NULL, 0);
}
/*
 * The nbmand mount option may only be chosen at mount time.  Toggling it
 * on a live file system is not allowed, because cifs clients could then
 * observe incorrect behavior.
 *
 * This callback is not registered through dsl_prop_register(); it is
 * invoked when a file system is first mounted.
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zv = arg;

	if (newval != FALSE) {
		vfs_clearmntopt(zv->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zv->z_vfs, MNTOPT_NBMAND, NULL, 0);
		return;
	}

	vfs_clearmntopt(zv->z_vfs, MNTOPT_NBMAND);
	vfs_setmntopt(zv->z_vfs, MNTOPT_NONBMAND, NULL, 0);
}
/* Property callback for "snapdir": record the new value in z_show_ctldir. */
static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	((zfsvfs_t *)arg)->z_show_ctldir = newval;
}
/* Property callback for "vscan": record the new value in z_vscan. */
static void
vscan_changed_cb(void *arg, uint64_t newval)
{
	((zfsvfs_t *)arg)->z_vscan = newval;
}
/* Property callback for "aclmode": record the new value in z_acl_mode. */
static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	((zfsvfs_t *)arg)->z_acl_mode = newval;
}
/* Property callback for "aclinherit": record the new value in z_acl_inherit. */
static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	((zfsvfs_t *)arg)->z_acl_inherit = newval;
}
/*
 * Register the dataset property callbacks for a mounted filesystem.
 *
 * Mount options given explicitly (ro/rw, setuid, exec, xattr, atime) are
 * recorded first, because registering the callbacks overwrites them; they
 * are re-applied afterwards by calling the callbacks directly.  nbmand is
 * taken from the mount options or, failing that, read from the dataset.
 *
 * Returns 0 on success, EOPNOTSUPP for snapshots, or the error from the
 * nbmand lookup or from dsl_prop_register().  If any registration fails,
 * all callbacks registered for this zfsvfs are unregistered again.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
#ifdef illumos
	boolean_t devices = B_FALSE;
	boolean_t do_devices = B_FALSE;
#endif
	boolean_t xattr = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update snapshot's
	 * mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() can not be used, because it has to acquire
	 * spa_namespace_lock and we can not do that because we already hold
	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand",
	    &nbmand)) != 0) {
		/*
		 * The assignment is parenthesized so that `error' holds the
		 * errno from dsl_prop_get_int_ds() and not the result of
		 * the comparison.
		 */
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
#ifdef illumos
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
#endif
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	    zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}
/*
 * Extract the owning user and group ids from an object's bonus data.
 *
 * Only DMU_OT_ZNODE and DMU_OT_SA bonus types are handled.  For znodes the
 * ids are read from the znode_phys_t; for SA bonus buffers they are read at
 * fixed offsets past the SA header, byte-swapping when the header magic
 * shows the data is in the opposite byte order.
 *
 * Returns 0 with *userp and *groupp set, ENOENT for other bonus types, or
 * EEXIST when `data' is NULL.
 */
static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
uint64_t *userp, uint64_t *groupp)
{
/*
* Is it a valid type of object to track?
*/
if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
return (SET_ERROR(ENOENT));
/*
* If we have a NULL data pointer
* then assume the id's aren't changing and
* return EEXIST to the dmu to let it know to
* use the same ids
*/
if (data == NULL)
return (SET_ERROR(EEXIST));
if (bonustype == DMU_OT_ZNODE) {
znode_phys_t *znp = data;
*userp = znp->zp_uid;
*groupp = znp->zp_gid;
} else {
int hdrsize;
sa_hdr_phys_t *sap = data;
/* Work on a local copy of the header so the bonus buffer is not modified. */
sa_hdr_phys_t sa = *sap;
boolean_t swap = B_FALSE;
ASSERT(bonustype == DMU_OT_SA);
if (sa.sa_magic == 0) {
/*
* This should only happen for newly created
* files that haven't had the znode data filled
* in yet.
*/
*userp = 0;
*groupp = 0;
return (0);
}
/* A byte-swapped magic means the header fields need swapping. */
if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
sa.sa_magic = SA_MAGIC;
sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
swap = B_TRUE;
} else {
VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
}
hdrsize = sa_hdrsize(&sa);
VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
/* The ids are read at fixed offsets past the header. */
*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
SA_UID_OFFSET));
*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
SA_GID_OFFSET));
if (swap) {
*userp = BSWAP_64(*userp);
*groupp = BSWAP_64(*groupp);
}
}
return (0);
}
/*
 * Decode a FUID given as a numeric string into its domain name and RID.
 *
 * The domain looked up from the FUID's index is copied into domainbuf
 * (at most buflen bytes, via strlcpy); if the lookup yields no domain,
 * domainbuf is set to the empty string.  The RID part is stored in *ridp.
 */
static void
fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    char *domainbuf, int buflen, uid_t *ridp)
{
	uint64_t fuid = zfs_strtonum(fuidstr, NULL);
	const char *domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));

	if (domain == NULL)
		domainbuf[0] = '\0';
	else
		(void) strlcpy(domainbuf, domain, buflen);

	*ridp = FUID_RID(fuid);
}
/*
 * Map a user/group space or quota property to the object number that
 * stores it.  The "used" properties map to fixed DMU objects; the quota
 * properties map to the object numbers cached in the zfsvfs.  Any other
 * property type yields 0.
 */
static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
{
	if (type == ZFS_PROP_USERUSED)
		return (DMU_USERUSED_OBJECT);
	if (type == ZFS_PROP_GROUPUSED)
		return (DMU_GROUPUSED_OBJECT);
	if (type == ZFS_PROP_USERQUOTA)
		return (zfsvfs->z_userquota_obj);
	if (type == ZFS_PROP_GROUPQUOTA)
		return (zfsvfs->z_groupquota_obj);
	return (0);
}
/*
 * Copy out as many user/group accounting (or quota) entries as fit in
 * the caller's buffer.
 *
 * cookiep	in: serialized ZAP cursor position to resume from;
 *		out: serialized position after the last entry consumed.
 * vbuf		buffer filled with consecutive zfs_useracct_t records.
 * bufsizep	in: size of vbuf in bytes; out: number of bytes filled.
 *
 * Returns ENOTSUP if the objset has no userspace accounting, 0 when the
 * property has no backing object (with *bufsizep set to 0), otherwise
 * the cursor error with ENOENT (end of ZAP) mapped to 0.
 */
int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
	zfs_useracct_t *next = vbuf;
	zap_cursor_t cursor;
	zap_attribute_t attr;
	uint64_t zapobj;
	uintptr_t filled;
	int error;

	if (!dmu_objset_userspace_present(zfsvfs->z_os))
		return (SET_ERROR(ENOTSUP));

	zapobj = zfs_userquota_prop_to_obj(zfsvfs, type);
	if (zapobj == 0) {
		*bufsizep = 0;
		return (0);
	}

	zap_cursor_init_serialized(&cursor, zfsvfs->z_os, zapobj, *cookiep);
	while ((error = zap_cursor_retrieve(&cursor, &attr)) == 0) {
		/* Stop before writing a record that would overrun vbuf. */
		filled = (uintptr_t)next - (uintptr_t)vbuf;
		if (filled + sizeof (zfs_useracct_t) > *bufsizep)
			break;

		fuidstr_to_sid(zfsvfs, attr.za_name,
		    next->zu_domain, sizeof (next->zu_domain), &next->zu_rid);
		next->zu_space = attr.za_first_integer;
		next++;

		zap_cursor_advance(&cursor);
	}

	/* Running off the end of the ZAP is the normal termination. */
	if (error == ENOENT)
		error = 0;

	filled = (uintptr_t)next - (uintptr_t)vbuf;
	ASSERT3U(filled, <=, *bufsizep);
	*bufsizep = filled;
	*cookiep = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);
	return (error);
}
/*
 * Format the FUID built from (domain, rid) as a hexadecimal string.
 *
 * buf must be big enough (eg, 32 bytes); no length is passed in.
 * A NULL or empty domain uses domain index 0.  Otherwise the domain is
 * looked up (and, if addok is set, the lookup is allowed to add it);
 * a lookup result of -1 is reported as ENOENT.  Returns 0 on success.
 */
static int
id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    char *buf, boolean_t addok)
{
	int domainid = 0;
	uint64_t fuid;

	if (domain != NULL && domain[0] != '\0') {
		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
		if (domainid == -1)
			return (SET_ERROR(ENOENT));
	}

	fuid = FUID_ENCODE(domainid, rid);
	(void) sprintf(buf, "%llx", (longlong_t)fuid);
	return (0);
}
/*
 * Look up a single user/group accounting or quota value.
 *
 * *valp is zeroed up front and only overwritten by a successful ZAP
 * lookup, so a missing entry reads as 0.  Returns ENOTSUP if the objset
 * has no userspace accounting, 0 if the property has no backing object
 * or the entry does not exist, otherwise the error from the FUID
 * conversion or the ZAP lookup.
 */
int
zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t *valp)
{
	char fuidstr[32];
	uint64_t zapobj;
	int rc;

	*valp = 0;

	if (!dmu_objset_userspace_present(zfsvfs->z_os))
		return (SET_ERROR(ENOTSUP));

	zapobj = zfs_userquota_prop_to_obj(zfsvfs, type);
	if (zapobj == 0)
		return (0);

	/* B_FALSE: a pure query must not add a new domain. */
	rc = id_to_fuidstr(zfsvfs, domain, rid, fuidstr, B_FALSE);
	if (rc != 0)
		return (rc);

	rc = zap_lookup(zfsvfs->z_os, zapobj, fuidstr, 8, 1, valp);
	return (rc == ENOENT ? 0 : rc);
}
/*
 * Set or clear the quota of one user or group.
 *
 * type		must be ZFS_PROP_USERQUOTA or ZFS_PROP_GROUPQUOTA (else EINVAL).
 * domain, rid	identify the user/group; converted to a FUID string, with
 *		the domain being added if it is not yet known.
 * quota	new quota value; 0 removes the entry (a missing entry is
 *		not an error).
 *
 * The quota ZAP object is created on first use and recorded in the
 * master node under the property's prefix.  Returns ENOTSUP when the
 * file system version predates ZPL_VERSION_USERSPACE, or the error from
 * the FUID conversion, transaction assignment or ZAP update.
 */
int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t quota)
{
	char buf[32];
	int err;
	dmu_tx_t *tx;
	uint64_t *objp;
	boolean_t fuid_dirtied;

	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
		return (SET_ERROR(EINVAL));

	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
		return (SET_ERROR(ENOTSUP));

	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
	    &zfsvfs->z_groupquota_obj;

	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
	if (err)
		return (err);
	/* Sampled after the conversion, which may have added a domain. */
	fuid_dirtied = zfsvfs->z_fuid_dirty;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
	if (*objp == 0) {
		/* The new quota object must also be linked into the master node. */
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    zfs_userquota_prop_prefixes[type]);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* Re-check under z_lock before creating the quota object. */
	mutex_enter(&zfsvfs->z_lock);
	if (*objp == 0) {
		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
		    DMU_OT_NONE, 0, tx);
		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
	}
	mutex_exit(&zfsvfs->z_lock);

	if (quota == 0) {
		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
		if (err == ENOENT)
			err = 0;
	} else {
		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
	}
	ASSERT(err == 0);
	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);
	dmu_tx_commit(tx);
	return (err);
}
/*
 * Report whether the user (or group, if isgroup) identified by fuid has
 * used at least as much space as its quota allows.
 *
 * Returns B_FALSE when no quota object exists, while the intent log is
 * being replayed (z_replay), or when either the quota or the usage
 * entry cannot be looked up.
 */
boolean_t
zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
	char buf[32];
	uint64_t used, quota, usedobj, quotaobj;
	int err;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay)
		return (B_FALSE);

	/* Both ZAPs are keyed by the FUID printed in hexadecimal. */
	(void) sprintf(buf, "%llx", (longlong_t)fuid);
	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
	if (err != 0)
		return (B_FALSE);

	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
	if (err != 0)
		return (B_FALSE);
	return (used >= quota);
}
/*
 * Report whether the owning user (or owning group, if isgroup) of the
 * given znode is over quota.  Short-circuits to B_FALSE when there is no
 * quota object of the requested kind or while z_replay is set;
 * otherwise defers to zfs_fuid_overquota().
 */
boolean_t
zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
{
	uint64_t quotaobj;
	uint64_t owner;

	if (isgroup) {
		quotaobj = zfsvfs->z_groupquota_obj;
		owner = zp->z_gid;
	} else {
		quotaobj = zfsvfs->z_userquota_obj;
		owner = zp->z_uid;
	}

	if (quotaobj == 0 || zfsvfs->z_replay)
		return (B_FALSE);

	return (zfs_fuid_overquota(zfsvfs, isgroup, owner));
}
/*
 * Associate this zfsvfs with the given objset, which must be owned.
 * This will cache a bunch of on-disk state from the objset in the
 * zfsvfs: the ZPL version, normalization/utf8only/case settings, the SA
 * attribute table, and the object numbers of the root directory,
 * unlinked set, user/group quota objects, FUID table and shares
 * directory.
 *
 * Returns 0 on success.  Fails with ENOTSUP if the file system version
 * is newer than the pool version supports, or with the error of the
 * first property/ZAP lookup that fails.  The quota, FUID and shares
 * objects are optional: ENOENT for those just records object number 0.
 */
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;
	uint64_t val;

	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
	zfsvfs->z_os = os;

	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
	if (error != 0)
		return (error);
	if (zfsvfs->z_version >
	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
		(void) printf("Can't mount a version %llu file system "
		    "on a version %llu pool.\nPool must be upgraded to mount "
		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
		return (SET_ERROR(ENOTSUP));
	}
	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_norm = (int)val;

	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_utf8 = (val != 0);

	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_case = (uint_t)val;

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
	    zfsvfs->z_case == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

	uint64_t sa_obj = 0;
	if (zfsvfs->z_use_sa) {
		/* should either have both of these objects or none */
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
		    &sa_obj);
		if (error != 0)
			return (error);
	}

	/* sa_obj stays 0 when system attributes are not in use. */
	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);
	if (error != 0)
		return (error);

	if (zfsvfs->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(os, zfs_sa_upgrade);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
	    &zfsvfs->z_root);
	if (error != 0)
		return (error);
	ASSERT(zfsvfs->z_root != 0);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
	    &zfsvfs->z_unlinkedobj);
	if (error != 0)
		return (error);

	/* The remaining objects are optional; ENOENT means "not present". */
	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
	    8, 1, &zfsvfs->z_userquota_obj);
	if (error == ENOENT)
		zfsvfs->z_userquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
	    8, 1, &zfsvfs->z_groupquota_obj);
	if (error == ENOENT)
		zfsvfs->z_groupquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
	    &zfsvfs->z_fuid_obj);
	if (error == ENOENT)
		zfsvfs->z_fuid_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
	    &zfsvfs->z_shares_dir);
	if (error == ENOENT)
		zfsvfs->z_shares_dir = 0;
	else if (error != 0)
		return (error);

	/*
	 * Only use the name cache if we are looking for a
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name (which is always the case on
	 * FreeBSD).
	 */
	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));

	return (0);
}
#if defined(__FreeBSD__)
/* Task queue handle; not assigned anywhere in this part of the file. */
taskq_t *zfsvfs_taskq;

/*
 * Task handler that drains the unlinked set of the zfsvfs passed as the
 * task context.  The pending count is ignored.
 */
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{
	zfsvfs_t *zfsvfs = (zfsvfs_t *)context;

	zfs_unlinked_drain(zfsvfs);
}
#endif
/*
 * Allocate a zfsvfs for the named objset, take ownership of the objset
 * and initialize the zfsvfs from it.
 *
 * On success *zfvp is set by zfsvfs_create_impl() and 0 is returned.
 * Returns ENAMETOOLONG if osname does not fit in MNAMELEN, otherwise the
 * error from dmu_objset_own() or zfsvfs_create_impl().
 */
int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
	zfsvfs_t *zfsvfs;
	objset_t *os;
	int rc;

	/*
	 * XXX: Fix struct statfs so this isn't necessary!
	 *
	 * The 'osname' is used as the filesystem's special node, which means
	 * it must fit in statfs.f_mntfromname, or else it can't be
	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
	 * 'zfs unmount' to think it's not mounted when it is.
	 */
	if (strlen(osname) >= MNAMELEN)
		return (SET_ERROR(ENAMETOOLONG));

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

	/*
	 * We claim to always be readonly so we can open snapshots;
	 * other ZPL code will prevent us from writing to snapshots.
	 */
	rc = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
	if (rc != 0) {
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (rc);
	}

	/*
	 * zfsvfs_create_impl() frees the zfsvfs itself on failure; here
	 * the pointer is only passed on as the ownership tag.
	 */
	rc = zfsvfs_create_impl(zfvp, zfsvfs, os);
	if (rc != 0)
		dmu_objset_disown(os, zfsvfs);

	return (rc);
}
/*
 * Initialize the locks, znode list and (on FreeBSD) the unlinked-drain
 * task of an already allocated zfsvfs, then load its on-disk state from
 * the objset via zfsvfs_init().
 *
 * On success *zfvp is set to zfsvfs and 0 is returned.  On failure
 * *zfvp is set to NULL, the zfsvfs memory is freed here, and the error
 * from zfsvfs_init() is returned.
 */
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
	int rc;
	int idx;

	zfsvfs->z_vfs = NULL;
	zfsvfs->z_parent = zfsvfs;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
#if defined(__FreeBSD__)
	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
	    zfsvfs_task_unlinked_drain, zfsvfs);
#endif
#ifdef DIAGNOSTIC
	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
#else
	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
#endif
	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (idx = 0; idx < ZFS_OBJ_MTX_SZ; idx++)
		mutex_init(&zfsvfs->z_hold_mtx[idx], NULL, MUTEX_DEFAULT, NULL);

	rc = zfsvfs_init(zfsvfs, os);
	if (rc == 0) {
		*zfvp = zfsvfs;
		return (0);
	}

	*zfvp = NULL;
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
	return (rc);
}
/*
 * Finish bringing a zfsvfs online: register the property callbacks,
 * open the ZIL and, when 'mounting' is set, drain the unlinked set
 * (skipped for read-only mounts) and replay or destroy the intent log.
 * Finally the zfsvfs is recorded as the objset's user pointer.
 *
 * Returns 0, or the error from zfs_register_callbacks(), in which case
 * nothing else has been done.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
int error;
error = zfs_register_callbacks(zfsvfs->z_vfs);
if (error)
return (error);
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
/*
* If we are not mounting (ie: online recv), then we don't
* have to worry about replaying the log as we blocked all
* operations out since we closed the ZIL.
*/
if (mounting) {
boolean_t readonly;
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
/* 'readonly' keeps the raw VFS_RDONLY bit so it can be OR-ed back below. */
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
if (readonly != 0)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
else
zfs_unlinked_drain(zfsvfs);
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
* zfs_unlinked_drain(). (Further note: ziltest
* doesn't use readonly mounts, where
* zfs_unlinked_drain() isn't called.) This is because
* ziltest causes spa_sync() to think it's committed,
* but actually it is not, so the intent log contains
* many txg's worth of changes.
*
* In particular, if object N is in the unlinked set in
* the last txg to actually sync, then it could be
* actually freed in a later txg and then reallocated
* in a yet later txg. This would write a "create
* object N" record to the intent log. Normally, this
* would be fine because the spa_sync() would have
* written out the fact that object N is free, before
* we could write the "create object N" intent log
* record.
*
* But when we are in ziltest mode, we advance the "open
* txg" without actually spa_sync()-ing the changes to
* disk. So we would see that object N is still
* allocated and in the unlinked set, and there is an
* intent log record saying to allocate it.
*/
if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
if (zil_replay_disable) {
zil_destroy(zfsvfs->z_log, B_FALSE);
} else {
/* z_replay is set only for the duration of the replay. */
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
/*
* Set the objset user_ptr to track its zfsvfs.
*/
mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
return (0);
}
extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

/*
 * Tear down everything zfsvfs_create_impl() set up in the zfsvfs (FUID
 * state, locks, znode list) and release its memory.
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int idx;

	/*
	 * This is a barrier to prevent the filesystem from going away in
	 * zfs_znode_move() until we can safely ensure that the filesystem is
	 * not unmounted. We consider the filesystem valid before the barrier
	 * and invalid after the barrier.
	 */
	rw_enter(&zfsvfs_lock, RW_READER);
	rw_exit(&zfsvfs_lock);

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	rrm_destroy(&zfsvfs->z_teardown_lock);
	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (idx = 0; idx < ZFS_OBJ_MTX_SZ; idx++)
		mutex_destroy(&zfsvfs->z_hold_mtx[idx]);

	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
/*
 * Recompute z_use_fuids and z_use_sa from the file system version and,
 * if the zfsvfs is attached to a vfs, set or clear the group of VFS
 * features that accompany FUID support.
 */
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

	/* Nothing to advertise on until a vfs is attached. */
	if (zfsvfs->z_vfs == NULL)
		return;

	if (!zfsvfs->z_use_fuids) {
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
		return;
	}

	vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
	vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
	vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
	vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
	vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
	vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
}
/*
 * Do the actual work of mounting the dataset 'osname' on 'vfsp':
 * create the zfsvfs, fill in the generic mount fields and the fsid,
 * set the VFS features, and either finish a snapshot mount inline or
 * run zfsvfs_setup() for a regular file system.
 *
 * Returns 0 on success, in which case zfs_active_fs_count has been
 * incremented.  On any failure after the zfsvfs was created, the objset
 * is disowned and the zfsvfs freed before the error is returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT(vfsp);
	ASSERT(osname);

	error = zfsvfs_create(osname, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

#ifdef illumos
	/* Initialize the generic filesystem structure. */
	vfsp->vfs_bcount = 0;
	vfsp->vfs_data = NULL;

	if (zfs_create_unique_device(&mount_dev) == -1) {
		error = SET_ERROR(ENODEV);
		goto out;
	}
	ASSERT(vfs_devismounted(mount_dev) == 0);
#endif

	error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
	    NULL);
	if (error != 0)
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
	}
	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/* Snapshots are mounted read-only with atime off. */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		error = dsl_prop_get_integer(osname, "xattr", &pval, NULL);
		if (error != 0)
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		/*
		 * Bail out right away on failure: the zfsvfs is about to
		 * be freed, so it must not be published as mounted or
		 * given a control directory first.
		 */
		error = zfsvfs_setup(zfsvfs, B_TRUE);
		if (error != 0)
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
/*
 * Drop every property callback registered for this zfsvfs on its
 * dataset.  Nothing is done for snapshots.
 */
void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
	objset_t *os = zfsvfs->z_os;

	if (dmu_objset_is_snapshot(os))
		return;

	dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}
#ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns EINVAL (leaving *objnum untouched) if the string contains
 * anything other than '0'..'9'.  An empty string converts to 0.  No
 * overflow check is made.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t value = 0;
	char *cursor;

	for (cursor = str; *cursor != '\0'; cursor++) {
		if (*cursor < '0' || *cursor > '9')
			return (SET_ERROR(EINVAL));
		value = value * 10 + *cursor - '0';
	}

	*objnum = value;
	return (0);
}
/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number".  Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 *
 * outpath first receives a verbatim copy of bpath; that copy is the
 * result when bpath has no '/' or when what follows the first '/' is
 * not a decimal number.  Otherwise the object number is resolved to a
 * dataset name within the pool.  bpath is temporarily split at the '/'
 * and restored before returning.  An empty path or one starting with
 * '/' yields EINVAL.
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	uint64_t objnum;
	char *sep;
	int rc;

	if (*bpath == 0 || *bpath == '/')
		return (SET_ERROR(EINVAL));

	(void) strcpy(outpath, bpath);

	/*
	 * No '/' means just the pool name; a non-numeric suffix means the
	 * path already names the root dataset.  Either way the copy stands.
	 */
	sep = strchr(bpath, '/');
	if (sep == NULL || str_to_uint64(sep + 1, &objnum) != 0)
		return (0);

	*sep = '\0';
	rc = dsl_dsobj_to_dsname(bpath, objnum, outpath);
	*sep = '/';

	return (rc);
}
/*
 * Check that the hex label string is appropriate for the dataset being
 * mounted into the global_zone proper.
 *
 * Return an error if the hex label string is not default or
 * admin_low/admin_high.  For admin_low labels, the corresponding
 * dataset must be readonly; failing to read the readonly property is
 * treated as a denial.
 */
int
zfs_check_global_label(const char *dsname, const char *hexsl)
{
	uint64_t rdonly;

	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0 ||
	    strcasecmp(hexsl, ADMIN_HIGH) == 0)
		return (0);

	if (strcasecmp(hexsl, ADMIN_LOW) != 0)
		return (SET_ERROR(EACCES));

	/* admin_low: must be readonly */
	if (dsl_prop_get_integer(dsname,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
		return (SET_ERROR(EACCES));

	return (rdonly ? 0 : EACCES);
}
/*
* Determine whether the mount is allowed according to MAC check.
* by comparing (where appropriate) label of the dataset against
* the label of the zone being mounted into. If the dataset has
* no label, create one.
*
* Returns 0 if access allowed, error otherwise (e.g. EACCES)
*
* As a side effect the mount may be forced read-only (MNTOPT_RO is set
* on vfsp) when the mountpoint label dominates the dataset label.
*/
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
{
int error, retv;
zone_t *mntzone = NULL;
ts_label_t *mnt_tsl;
bslabel_t *mnt_sl;
bslabel_t ds_sl;
char ds_hexsl[MAXNAMELEN];
retv = EACCES; /* assume the worst */
/*
* Start by getting the dataset label if it exists.
*/
error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1, sizeof (ds_hexsl), &ds_hexsl, NULL);
if (error)
return (SET_ERROR(EACCES));
/*
* If labeling is NOT enabled, then disallow the mount of datasets
* which have a non-default label already. No other label checks
* are needed.
*/
if (!is_system_labeled()) {
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
return (0);
return (SET_ERROR(EACCES));
}
/*
* Get the label of the mountpoint. If mounting into the global
* zone (i.e. mountpoint is not within an active zone and the
* zoned property is off), the label must be default or
* admin_low/admin_high only; no other checks are needed.
*/
mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
if (mntzone->zone_id == GLOBAL_ZONEID) {
uint64_t zoned;
/* The zone is not needed on this path; release it before any return. */
zone_rele(mntzone);
if (dsl_prop_get_integer(osname,
zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
return (SET_ERROR(EACCES));
if (!zoned)
return (zfs_check_global_label(osname, ds_hexsl));
else
/*
* This is the case of a zone dataset being mounted
* initially, before the zone has been fully created;
* allow this mount into global zone.
*/
return (0);
}
/* Non-global zone: hold its label for the comparison; released below. */
mnt_tsl = mntzone->zone_slabel;
ASSERT(mnt_tsl != NULL);
label_hold(mnt_tsl);
mnt_sl = label2bslabel(mnt_tsl);
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
/*
* The dataset doesn't have a real label, so fabricate one.
*/
char *str = NULL;
/* Access is granted only if the label is both rendered and stored. */
if (l_to_str_internal(mnt_sl, &str) == 0 &&
dsl_prop_set_string(osname,
zfs_prop_to_name(ZFS_PROP_MLSLABEL),
ZPROP_SRC_LOCAL, str) == 0)
retv = 0;
if (str != NULL)
kmem_free(str, strlen(str) + 1);
} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
/*
* Now compare labels to complete the MAC check. If the
* labels are equal then allow access. If the mountpoint
* label dominates the dataset label, allow readonly access.
* Otherwise, access is denied.
*/
if (blequal(mnt_sl, &ds_sl))
retv = 0;
else if (bldominates(mnt_sl, &ds_sl)) {
vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
retv = 0;
}
}
/* A dataset label that fails to parse leaves retv at EACCES. */
label_rele(mnt_tsl);
zone_rele(mntzone);
return (retv);
}
#endif /* SECLABEL */
#ifdef OPENSOLARIS_MOUNTROOT
/*
 * Root file system mount entry point (only built when
 * OPENSOLARIS_MOUNTROOT is defined).  Dispatches on 'why':
 *
 *  ROOT_INIT     import the root pool, resolve the "zfs-bootfs" boot
 *                property to a dataset name, mount it and install its
 *                root vnode as rootvp.  A second ROOT_INIT call fails
 *                with EBUSY.
 *  ROOT_REMOUNT  switch to read-write and re-register the property
 *                callbacks.
 *  ROOT_UNMOUNT  unregister the property callbacks and sync.
 *
 * Any other reason returns ENOTSUP.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
int error = 0;
static int zfsrootdone = 0;
zfsvfs_t *zfsvfs = NULL;
znode_t *zp = NULL;
vnode_t *vp = NULL;
char *zfs_bootfs;
char *zfs_devid;
ASSERT(vfsp);
/*
* The filesystem that we mount as root is defined in the
* boot property "zfs-bootfs" with a format of
* "poolname/root-dataset-objnum".
*/
if (why == ROOT_INIT) {
if (zfsrootdone++)
return (SET_ERROR(EBUSY));
/*
* the process of doing a spa_load will require the
* clock to be set before we could (for example) do
* something better by looking at the timestamp on
* an uberblock, so just set it to -1.
*/
clkset(-1);
if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
"bootfs name");
return (SET_ERROR(EINVAL));
}
/* "diskdevid" is optional; it is freed below only when present. */
zfs_devid = spa_get_bootprop("diskdevid");
error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
if (zfs_devid)
spa_free_bootprop(zfs_devid);
if (error) {
spa_free_bootprop(zfs_bootfs);
cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
error);
return (error);
}
/* Rewrite rootfs.bo_name from "pool/objnum" to the dataset name. */
if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
spa_free_bootprop(zfs_bootfs);
cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
error);
return (error);
}
spa_free_bootprop(zfs_bootfs);
/* From here on every exit goes through 'out' to drop the vfs lock. */
if (error = vfs_lock(vfsp))
return (error);
if (error = zfs_domount(vfsp, rootfs.bo_name)) {
cmn_err(CE_NOTE, "zfs_domount: error %d", error);
goto out;
}
zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
ASSERT(zfsvfs);
if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
cmn_err(CE_NOTE, "zfs_zget: error %d", error);
goto out;
}
vp = ZTOV(zp);
mutex_enter(&vp->v_lock);
vp->v_flag |= VROOT;
mutex_exit(&vp->v_lock);
rootvp = vp;
/*
* Leave rootvp held. The root file system is never unmounted.
*/
vfs_add((struct vnode *)0, vfsp,
(vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
vfs_unlock(vfsp);
return (error);
} else if (why == ROOT_REMOUNT) {
readonly_changed_cb(vfsp->vfs_data, B_FALSE);
vfsp->vfs_flag |= VFS_REMOUNT;
/* refresh mount options */
zfs_unregister_callbacks(vfsp->vfs_data);
return (zfs_register_callbacks(vfsp));
} else if (why == ROOT_UNMOUNT) {
zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
(void) zfs_sync(vfsp, 0, 0);
return (0);
}
/*
* if "why" is equal to anything else other than ROOT_INIT,
* ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
*/
return (SET_ERROR(ENOTSUP));
}
#endif /* OPENSOLARIS_MOUNTROOT */
/*
 * Extract the pool name (everything before the first '/', or the whole
 * string if there is none) from a dataset name.
 *
 * poolname must have room for MAXNAMELEN bytes.  Returns ENAMETOOLONG,
 * without writing to poolname, if the pool name would not fit together
 * with its terminating NUL; 0 otherwise.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	char *slash = strchr(osname, '/');

	if (slash != NULL) {
		if (slash - osname >= MAXNAMELEN)
			return (ENAMETOOLONG);
		/* strncpy does not terminate here, so do it by hand. */
		(void) strncpy(poolname, osname, slash - osname);
		poolname[slash - osname] = '\0';
		return (0);
	}

	if (strlen(osname) >= MAXNAMELEN)
		return (ENAMETOOLONG);
	(void) strcpy(poolname, osname);
	return (0);
}
/*
 * VFS mount entry point.
 *
 * On non-illumos builds the dataset name comes from the "from" mount
 * option.  The caller must pass secpolicy_fs_mount(); failing that, it
 * must hold the delegated "mount" permission on the dataset and, for a
 * new mount, own the mount point or be able to write to it.  A caller
 * outside the global zone is refused (EPERM) unless the dataset is
 * visible and writable there.
 *
 * A remount only re-registers the property callbacks while holding
 * z_teardown_lock as writer.  An initial root mount first imports the
 * root pool.  Everything else is done by zfs_domount(), called with
 * Giant dropped.
 *
 * Returns 0 on success or an errno value.
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
kthread_t *td = curthread;
vnode_t *mvp = vfsp->mnt_vnodecovered;
cred_t *cr = td->td_ucred;
char *osname;
int error = 0;
int canwrite;
#ifdef illumos
if (mvp->v_type != VDIR)
return (SET_ERROR(ENOTDIR));
mutex_enter(&mvp->v_lock);
if ((uap->flags & MS_REMOUNT) == 0 &&
(uap->flags & MS_OVERLAY) == 0 &&
(mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
mutex_exit(&mvp->v_lock);
return (SET_ERROR(EBUSY));
}
mutex_exit(&mvp->v_lock);
/*
* ZFS does not support passing unparsed data in via MS_DATA.
* Users should use the MS_OPTIONSTR interface; this means
* that all option parsing is already done and the options struct
* can be interrogated.
*/
if ((uap->flags & MS_DATA) && uap->datalen > 0)
return (SET_ERROR(EINVAL));
/*
* Get the objset name (the "special" mount argument).
*/
if (error = pn_get(uap->spec, fromspace, &spn))
return (error);
osname = spn.pn_path;
#else /* !illumos */
if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
return (SET_ERROR(EINVAL));
/*
* If full-owner-access is enabled and delegated administration is
* turned on, we must set nosuid.
*/
if (zfs_super_owner &&
dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
secpolicy_fs_mount_clearopts(cr, vfsp);
}
#endif /* illumos */
/*
* Check for mount privilege?
*
* If we don't have privilege then see if
* we have local permission to allow it
*/
error = secpolicy_fs_mount(cr, mvp, vfsp);
if (error) {
/* Every 'goto out' in this block returns the secpolicy_fs_mount() error. */
if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
goto out;
if (!(vfsp->vfs_flag & MS_REMOUNT)) {
vattr_t vattr;
/*
* Make sure user is the owner of the mount point
* or has sufficient privileges.
*/
vattr.va_mask = AT_UID;
vn_lock(mvp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(mvp, &vattr, cr)) {
VOP_UNLOCK(mvp);
goto out;
}
if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
VOP_UNLOCK(mvp);
goto out;
}
VOP_UNLOCK(mvp);
}
secpolicy_fs_mount_clearopts(cr, vfsp);
}
/*
* Refuse to mount a filesystem if we are in a local zone and the
* dataset is not visible.
*/
if (!INGLOBALZONE(curthread) &&
(!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
error = SET_ERROR(EPERM);
goto out;
}
#ifdef SECLABEL
error = zfs_mount_label_policy(vfsp, osname);
if (error)
goto out;
#endif
vfsp->vfs_flag |= MNT_NFS4ACLS;
/*
* When doing a remount, we simply refresh our temporary properties
* according to those options set in the current VFS options.
*/
if (vfsp->vfs_flag & MS_REMOUNT) {
zfsvfs_t *zfsvfs = vfsp->vfs_data;
/*
* Refresh mount options with z_teardown_lock blocking I/O while
* the filesystem is in an inconsistent state.
* The lock also serializes this code with filesystem
* manipulations between entry to zfs_suspend_fs() and return
* from zfs_resume_fs().
*/
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
zfs_unregister_callbacks(zfsvfs);
error = zfs_register_callbacks(vfsp);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
goto out;
}
/* Initial root mount: try hard to import the requested root pool. */
if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
(vfsp->vfs_flag & MNT_UPDATE) == 0) {
char pname[MAXNAMELEN];
error = getpoolname(osname, pname);
if (error == 0)
error = spa_import_rootpool(pname);
if (error)
goto out;
}
/* Giant is released around the mount itself and re-taken afterwards. */
DROP_GIANT();
error = zfs_domount(vfsp, osname);
PICKUP_GIANT();
#ifdef illumos
/*
* Add an extra VFS_HOLD on our parent vfs so that it can't
* disappear due to a forced unmount.
*/
if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
VFS_HOLD(mvp->v_vfsp);
#endif
out:
return (error);
}
/*
 * Fill in a struct statfs for the mounted file system from the
 * objset's space accounting.  Returns 0 once the file system has been
 * entered via ZFS_ENTER().
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t bytes_refd, bytes_avail, objs_used, objs_avail;

	statp->f_version = STATFS_VERSION;

	ZFS_ENTER(zfsvfs);

	dmu_objset_space(zfsvfs->z_os,
	    &bytes_refd, &bytes_avail, &objs_used, &objs_avail);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * Block counts are expressed in units of the minimum block size:
	 * the total is referenced plus available space.
	 */
	statp->f_blocks = (bytes_refd + bytes_avail) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = bytes_avail / statp->f_bsize;
	statp->f_bavail = statp->f_bfree;	/* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(objs_avail, statp->f_bfree);
	statp->f_files = statp->f_ffree + objs_used;

	/* Identify ourselves and echo the mount's from/on names. */
	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof(statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof(statp->f_mntonname));

	statp->f_namemax = MAXNAMELEN - 1;

	ZFS_EXIT(zfsvfs);
	return (0);
}
/*
 * Return the root vnode of the file system in *vpp, locked according
 * to 'flags'.
 *
 * If the root znode cannot be obtained, the error is returned and *vpp
 * is left untouched.  If locking the vnode fails, the vnode is released
 * and *vpp is set to NULL.
 */
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (rc != 0) {
		ZFS_EXIT(zfsvfs);
		return (rc);
	}
	*vpp = ZTOV(rootzp);
	ZFS_EXIT(zfsvfs);

	/* The vnode lock is taken only after leaving the file system. */
	rc = vn_lock(*vpp, flags);
	if (rc != 0) {
		VN_RELE(*vpp);
		*vpp = NULL;
	}
	return (rc);
}
/*
* Teardown the zfsvfs::z_os.
*
* Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
* and 'z_teardown_inactive_lock' held.
*
* The exception is the early EIO return below, which drops both locks
* before returning. Otherwise the return value is 0.
*/
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) {
/*
* We purge the parent filesystem's vfsp as the parent
* filesystem and all of its snapshots have their vnode's
* v_vfsp set to the parent's filesystem's vfsp. Note,
* 'z_parent' is self referential for non-snapshots.
*/
(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
#ifdef FREEBSD_NAMECACHE
cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
}
/*
* Close the zil. NB: Can't close the zil while zfs_inactive
* threads are blocked as zil_close can call zfs_inactive.
*/
if (zfsvfs->z_log) {
zil_close(zfsvfs->z_log);
zfsvfs->z_log = NULL;
}
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
/*
* If we are not unmounting (ie: online recv) and someone already
* unmounted this file system while we were doing the switcheroo,
* or a reopen of z_os failed then just bail out now.
*/
if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
return (SET_ERROR(EIO));
}
/*
* At this point there are no vops active, and any new vops will
* fail with EIO since we have z_teardown_lock for writer (only
* relevant for forced unmount).
*
* Release all holds on dbufs.
*/
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp))
if (zp->z_sa_hdl) {
ASSERT(ZTOV(zp)->v_count >= 0);
zfs_znode_dmu_fini(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
/*
* If we are unmounting, set the unmounted flag and let new vops
* unblock. zfs_inactive will have the unmounted behavior, and all
* other vops will fail with EIO.
*/
if (unmounting) {
zfsvfs->z_unmounted = B_TRUE;
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
}
/*
* z_os will be NULL if there was an error in attempting to reopen
* zfsvfs, so just return as the properties had already been
* unregistered and cached data had been evicted before.
*/
if (zfsvfs->z_os == NULL)
return (0);
/*
* Unregister properties.
*/
zfs_unregister_callbacks(zfsvfs);
/*
* Evict cached data
*/
/* A dirty dataset on a writable mount is synced before eviction. */
if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
/*
 * Unmount the filesystem described by 'vfsp'.
 *
 * 'fflag' carries the unmount flags; MS_FORCE requests a forced unmount.
 * Returns 0 on success, or the error from the first step that failed
 * (permission check, snapshot unmount, or vflush()).
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
kthread_t *td = curthread;
zfsvfs_t *zfsvfs = vfsp->vfs_data;
objset_t *os;
cred_t *cr = td->td_ucred;
int ret;
/*
 * If the security policy refuses the unmount, fall back to the delegated
 * "mount" permission; the original policy error is returned only when
 * that check fails as well.
 */
ret = secpolicy_fs_unmount(cr, vfsp);
if (ret) {
if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
ZFS_DELEG_PERM_MOUNT, cr))
return (ret);
}
/*
* We purge the parent filesystem's vfsp as the parent filesystem
* and all of its snapshots have their vnode's v_vfsp set to the
* parent's filesystem's vfsp. Note, 'z_parent' is self
* referential for non-snapshots.
*/
(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
/*
* Unmount any snapshots mounted under .zfs before unmounting the
* dataset itself.
*/
if (zfsvfs->z_ctldir != NULL) {
if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
return (ret);
}
if (fflag & MS_FORCE) {
/*
* Mark file system as unmounted before calling
* vflush(FORCECLOSE). This way we ensure no future vnops
* will be called and risk operating on DOOMED vnodes.
*/
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
zfsvfs->z_unmounted = B_TRUE;
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
}
/*
* Flush all the files.
*/
ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
if (ret != 0)
return (ret);
#ifdef illumos
if (!(fflag & MS_FORCE)) {
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
*/
if (zfsvfs->z_ctldir == NULL) {
if (vfsp->vfs_count > 1)
return (SET_ERROR(EBUSY));
} else {
if (vfsp->vfs_count > 2 ||
zfsvfs->z_ctldir->v_count > 1)
return (SET_ERROR(EBUSY));
}
}
#endif
/*
 * Cancel the unlinked-drain task; whenever taskqueue_cancel() reports a
 * non-zero result, drain the task and try again.
 */
while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
&zfsvfs->z_unlinked_drain_task, NULL) != 0)
taskqueue_drain(zfsvfs_taskq->tq_queue,
&zfsvfs->z_unlinked_drain_task);
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;
/*
* z_os will be NULL if there was an error in
* attempting to reopen zfsvfs.
*/
if (os != NULL) {
/*
* Unset the objset user_ptr.
*/
mutex_enter(&os->os_user_ptr_lock);
dmu_objset_set_user(os, NULL);
mutex_exit(&os->os_user_ptr_lock);
/*
* Finally release the objset
*/
dmu_objset_disown(os, zfsvfs);
}
/*
* We can now safely destroy the '.zfs' directory node.
*/
if (zfsvfs->z_ctldir != NULL)
zfsctl_destroy(zfsvfs);
/* Frees the zfsvfs itself; it must not be touched after this call. */
zfs_freevfs(vfsp);
return (0);
}
/*
 * Look up the vnode for inode number 'ino' and return it locked in *vpp.
 * Unlinked znodes are rejected with EINVAL; on any failure after the
 * filesystem has been entered *vpp is set to NULL.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfvp = vfsp->vfs_data;
	znode_t *zp;
	int rc;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfvp->z_shares_dir != 0 && ino == zfvp->z_shares_dir))
		return (EOPNOTSUPP);

	ZFS_ENTER(zfvp);
	rc = zfs_zget(zfvp, ino, &zp);
	if (rc == 0) {
		if (zp->z_unlinked) {
			/* An unlinked object must not be handed out. */
			vrele(ZTOV(zp));
			rc = EINVAL;
		} else {
			*vpp = ZTOV(zp);
		}
	}
	ZFS_EXIT(zfvp);

	if (rc == 0) {
		rc = vn_lock(*vpp, flags);
		if (rc != 0)
			vrele(*vpp);
	}
	if (rc != 0)
		*vpp = NULL;
	return (rc);
}
/*
 * Export check: forwards to vfs_stdcheckexp() using the parent
 * filesystem's vfs.
 */
static int
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
{
	zfsvfs_t *zfvp = vfsp->vfs_data;
	vfs_t *export_vfs;

	/*
	 * For a regular file system vfsp and z_parent->z_vfs are the same.
	 * For a snapshot, z_parent->z_vfs is the parent file system, and
	 * that is the one we must consult because only it has mnt_export
	 * configured.
	 */
	export_vfs = zfvp->z_parent->z_vfs;

	return (vfs_stdcheckexp(export_vfs, nam, extflagsp,
	    credanonp, numsecflavors, secflavors));
}
CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
CTASSERT(LONG_FID_LEN <= sizeof(struct fid));

/*
 * Convert a file identifier (fid) into a locked vnode.
 *
 * vfsp  - mount the fid was presented to.
 * fidp  - short or long ZFS fid; any other length yields EINVAL.
 * flags - lock flags passed to vn_lock() and to the control-directory
 *         lookup.
 * vpp   - receives the locked vnode on success, NULL on failure.
 *
 * Returns 0 on success, EINVAL for a malformed or stale fid, or the error
 * reported by zfs_zget() / vn_lock().
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	vnode_t *dvp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;
		uint64_t setgen = 0;

		/* Both fields are stored as little-endian byte arrays. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		/* Continue with the zfsvfs of the objset named in the fid. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		/* 'i' is left at sizeof (zf_gen); gen_mask below relies on it. */
		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree. If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		ZFS_EXIT(zfsvfs);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		/*
		 * Every step above is VERIFY0()'d, so this path can only
		 * succeed.  'err' is not necessarily initialized here and
		 * must not be returned.
		 */
		return (0);
	}

	/* Mask covering exactly the generation bytes stored in the fid. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen, (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	ZFS_EXIT(zfsvfs);
	err = vn_lock(*vpp, flags);
	if (err == 0) {
		vnode_create_vobject(*vpp, zp->z_size, curthread);
	} else {
		/*
		 * Drop the reference obtained from zfs_zget(), as zfs_vget()
		 * does, so a failed lock does not leak the vnode.
		 */
		vrele(*vpp);
		*vpp = NULL;
	}
	return (err);
}
/*
 * Block out VOPs and close zfsvfs_t::z_os.
 *
 * On success we return with both 'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held.  Ownership of the underlying
 * dataset and objset is left intact so that it can be handed off atomically
 * during a subsequent rollback or recv operation and the resume thereafter.
 *
 * Returns the result of zfsvfs_teardown() in non-unmounting mode.
 */
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
	return (zfsvfs_teardown(zfsvfs, B_FALSE));
}
/*
 * Rebuild SA and release VOPs.
 *
 * Ownership of the underlying dataset is invariant across everything that
 * can be done while the filesystem was suspended.  Whether that operation
 * succeeded or failed, the preconditions are the same: the relevant objset
 * and associated dataset are owned by zfsvfs, held, and long held on entry.
 *
 * Both teardown locks must be write held on entry; they are released before
 * returning.  Returns 0 or the error from zfsvfs_init(), in which case a
 * forced unmount is attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	objset_t *os;
	znode_t *zp;
	int err;

	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	VERIFY0(dmu_objset_from_ds(ds, &os));

	err = zfsvfs_init(zfsvfs, os);
	if (err == 0) {
		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

		zfs_set_fuid_feature(zfsvfs);

		/*
		 * Attempt to re-establish all the active znodes with
		 * their dbufs. If a zfs_rezget() fails, then we'll let
		 * any potential callers discover that via
		 * ZFS_ENTER_VERIFY_VP when they try to use their znode.
		 */
		mutex_enter(&zfsvfs->z_znodes_lock);
		zp = list_head(&zfsvfs->z_all_znodes);
		while (zp != NULL) {
			(void) zfs_rezget(zp);
			zp = list_next(&zfsvfs->z_all_znodes, zp);
		}
		mutex_exit(&zfsvfs->z_znodes_lock);
	}

	/* Release the VOPs, on success and failure alike. */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

	/*
	 * Since we couldn't setup the sa framework, try to force
	 * unmount this file system.
	 */
	if (err != 0 &&
	    vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
		vfs_ref(zfsvfs->z_vfs);
		(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
	}
	return (err);
}
/*
 * Free the zfsvfs attached to 'vfsp' and decrement the count of active
 * ZFS filesystems.
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfvp = vfsp->vfs_data;

#ifdef illumos
	/*
	 * A snapshot holds an extra VFS_HOLD on its parent, taken in
	 * zfs_mount(); drop it here.  The zfs_mountroot() path takes no
	 * such hold, so rootvfs is skipped.
	 */
	if (zfvp->z_issnap && (vfsp != rootvfs))
		VFS_RELE(zfvp->z_parent->z_vfs);
#endif

	zfsvfs_free(zfvp);

	atomic_dec_32(&zfs_active_fs_count);
}
#ifdef __i386__
/* Value of desiredvnodes saved by zfs_vnodes_adjust(). */
static int desiredvnodes_backup;
#endif

/*
 * On i386 only: remember desiredvnodes and, if it still has its computed
 * default, lower it to three quarters of that value.  No-op elsewhere.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int computed;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  A match with
	 * desiredvnodes means the administrator has not tuned it, so we are
	 * free to tune it down.
	 */
	computed = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (computed == desiredvnodes)
		desiredvnodes = (3 * computed) / 4;
#endif
}
/*
 * On i386 only: restore desiredvnodes from desiredvnodes_backup.
 * No-op elsewhere.
 */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
/*
 * Module-wide ZFS filesystem initialization; undone by zfs_fini().
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/* Set up the .zfs directory structures. */
	zfsctl_init();

	/* Set up the znode cache, vnode ops, etc. */
	zfs_znode_init();

	/*
	 * The default vnode count is sized with UFS inodes in mind, which
	 * is too large for ZFS on i386, so trim it.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);

#ifdef __FreeBSD__
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
#endif
}
/*
 * Undo zfs_init(): destroy the zfsvfs task queue (FreeBSD), tear down the
 * .zfs and znode subsystems and restore the vnode count.
 */
void
zfs_fini(void)
{

#ifdef __FreeBSD__
	taskq_destroy(zfsvfs_taskq);
#endif

	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
/*
 * Return 1 while at least one ZFS filesystem is counted as active,
 * 0 otherwise.
 */
int
zfs_busy(void)
{
	if (zfs_active_fs_count != 0)
		return (1);
	return (0);
}
/*
 * Upgrade the ZPL version recorded in the master node to 'newvers'.
 *
 * Returns EINVAL if 'newvers' lies outside
 * [ZPL_VERSION_INITIAL, ZPL_VERSION] or is lower than the current version,
 * ENOTSUP if the pool's SPA version is too old for it, the error from
 * dmu_tx_assign() or zap_update() if either fails, and 0 on success.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
int error;
objset_t *os = zfsvfs->z_os;
dmu_tx_t *tx;
if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
return (SET_ERROR(EINVAL));
/* Downgrades are not allowed. */
if (newvers < zfsvfs->z_version)
return (SET_ERROR(EINVAL));
if (zfs_spa_version_map(newvers) >
spa_version(dmu_objset_spa(zfsvfs->z_os)))
return (SET_ERROR(ENOTSUP));
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
/* Extra holds for the SA master node created further down. */
if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
ZFS_SA_ATTRS);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
}
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &newvers, tx);
if (error) {
/* The tx is already assigned, so it is committed, not aborted. */
dmu_tx_commit(tx);
return (error);
}
/* First move to an SA-capable version: create and register the SA object. */
if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
uint64_t sa_obj;
ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
SPA_VERSION_SA);
sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
DMU_OT_NONE, 0, tx);
error = zap_add(os, MASTER_NODE_OBJ,
ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
ASSERT0(error);
VERIFY(0 == sa_set_sa_object(os, sa_obj));
sa_register_update_callback(os, zfs_sa_upgrade);
}
spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
"from %llu to %llu", zfsvfs->z_version, newvers);
dmu_tx_commit(tx);
/* Update the in-memory copies of the version. */
zfsvfs->z_version = newvers;
os->os_version = newvers;
zfs_set_fuid_feature(zfsvfs);
return (0);
}
/*
 * Read a property stored within the master node.
 *
 * The value is taken from the objset's cached copy when one exists and is
 * initialized, otherwise from the master node ZAP, otherwise from a
 * built-in default.  'os' may be NULL, in which case only the defaults
 * apply.  Returns 0 with *value set, or an error.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	uint64_t *cache_slot = NULL;
	const char *zap_name;
	int rc = ENOENT;

	/*
	 * Locate the slot in the objset_t that caches this property, if
	 * the property is one of those that has a slot.
	 */
	if (os != NULL) {
		switch (prop) {
		case ZFS_PROP_VERSION:
			cache_slot = &os->os_version;
			break;
		case ZFS_PROP_NORMALIZE:
			cache_slot = &os->os_normalization;
			break;
		case ZFS_PROP_UTF8ONLY:
			cache_slot = &os->os_utf8only;
			break;
		case ZFS_PROP_CASE:
			cache_slot = &os->os_casesensitivity;
			break;
		default:
			break;
		}
	}

	/* Fast path: the cached copy is already populated. */
	if (cache_slot != NULL &&
	    *cache_slot != OBJSET_PROP_UNINITIALIZED) {
		*value = *cache_slot;
		return (0);
	}

	/*
	 * Not cached: look the value up in the file system.  The version
	 * property is stored under a slightly different string.
	 */
	zap_name = (prop == ZFS_PROP_VERSION) ?
	    ZPL_VERSION_STR : zfs_prop_to_name(prop);

	if (os != NULL) {
		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
		rc = zap_lookup(os, MASTER_NODE_OBJ, zap_name, 8, 1, value);
	}

	if (rc == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		default:
			return (rc);
		}
		rc = 0;
	}

	/* Whichever method produced the value, remember it in the cache. */
	if (rc == 0 && cache_slot != NULL)
		*cache_slot = *value;

	return (rc);
}
/*
 * Return B_TRUE if the vfs associated with 'os' has MNTK_UNMOUNT set,
 * meaning that a VFS unmount has been initiated; otherwise return B_FALSE.
 */
boolean_t
zfs_get_vfs_flag_unmounted(objset_t *os)
{
	zfsvfs_t *zfvp;
	boolean_t result;

	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);

	/* The user pointer is read under os_user_ptr_lock. */
	mutex_enter(&os->os_user_ptr_lock);
	zfvp = dmu_objset_get_user(os);
	result = (zfvp != NULL && zfvp->z_vfs != NULL &&
	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) ? B_TRUE : B_FALSE;
	mutex_exit(&os->os_user_ptr_lock);

	return (result);
}
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of every mount on the mount list after a rename
 * from 'oldname' to 'newname'.  An exact match is replaced outright; a
 * name that has 'oldname' as a prefix followed by '/' or '@' gets the
 * prefix swapped and the remainder kept.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char tmpbuf[MAXPATHLEN];
	struct mount *mp;
	char *from;
	size_t prefix_len = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		from = mp->mnt_stat.f_mntfromname;

		if (strcmp(from, oldname) == 0) {
			(void)strlcpy(from, newname,
			    sizeof(mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefix_len) == 0 &&
		    (from[prefix_len] == '/' || from[prefix_len] == '@')) {
			/* Build in a scratch buffer; source and target overlap. */
			(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
			    newname, from + prefix_len);
			(void)strlcpy(from, tmpbuf,
			    sizeof(mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
Index: head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c (revision 356654)
+++ head/sys/cddl/contrib/opensolaris/uts/common/os/fm.c (revision 356655)
@@ -1,1399 +1,1399 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Fault Management Architecture (FMA) Resource and Protocol Support
*
* The routines contained herein provide services to support kernel subsystems
* in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
*
* Name-Value Pair Lists
*
* The embodiment of an FMA protocol element (event, fmri or authority) is a
* name-value pair list (nvlist_t). FMA-specific nvlist constructor and
* destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
* to create an nvpair list using custom allocators. Callers may choose to
* allocate either from the kernel memory allocator, or from a preallocated
* buffer, useful in constrained contexts like high-level interrupt routines.
*
* Protocol Event and FMRI Construction
*
* Convenience routines are provided to construct nvlist events according to
* the FMA Event Protocol and Naming Schema specification for ereports and
* FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
*
* ENA Manipulation
*
* Routines to generate ENA formats 0, 1 and 2 are available as well as
* routines to increment formats 1 and 2. Individual fields within the
* ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
* fm_ena_format_get() and fm_ena_gen_get().
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
* URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These
* values must be kept in sync with the FMA source code in usr/src/cmd/fm.
*/
static const char *fm_url = "http://www.sun.com/msg";
static const char *fm_msgid = "SUNOS-8000-0G";
/* Set once by fm_panic() to its format string; non-NULL marks an FMA panic. */
static char *volatile fm_panicstr = NULL;
#ifdef illumos
/* Error queue created in fm_init() and drained by fm_drain(). */
errorq_t *ereport_errorq;
#endif
/* Buffer of ereport_dumplen bytes allocated in fm_init(). */
void *ereport_dumpbuf;
size_t ereport_dumplen;
static uint_t ereport_chanlen = ERPT_EVCH_MAX;
static evchan_t *ereport_chan = NULL;
/* Zero means "unset": fm_init() then fills in a default. */
static ulong_t ereport_qlen = 0;
static size_t ereport_size = 0;
/* Column width passed to fm_printf()/fm_nvprintr() by fm_nvprint(). */
static int ereport_cols = 80;
extern void fastreboot_disable_highpil(void);
/*
* Common fault management kstats to record ereport generation
* failures
*/
struct erpt_kstat {
kstat_named_t erpt_dropped; /* num erpts dropped on post */
kstat_named_t erpt_set_failed; /* num erpt set failures */
kstat_named_t fmri_set_failed; /* num fmri set failures */
kstat_named_t payload_set_failed; /* num payload set failures */
};
/*
 * Backing storage for the "fm" kstat installed by fm_init().  The entries
 * are positional and must stay in the same order as the members of
 * struct erpt_kstat.
 */
static struct erpt_kstat erpt_kstat_data = {
{ "erpt-dropped", KSTAT_DATA_UINT64 },
{ "erpt-set-failed", KSTAT_DATA_UINT64 },
{ "fmri-set-failed", KSTAT_DATA_UINT64 },
{ "payload-set-failed", KSTAT_DATA_UINT64 }
};
#ifdef illumos
/*
 * Drain callback for ereport_errorq (registered in fm_init()).  Fetches the
 * nvlist for queue element 'eep' and posts it as an ereport, or prints it
 * with fm_nvprint() when the kernel has panicked.  'private' and 'data' are
 * unused.
 */
/*ARGSUSED*/
static void
fm_drain(void *private, void *data, errorq_elem_t *eep)
{
nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep);
- if (!panicstr)
+ if (!KERNEL_PANICKED())
(void) fm_ereport_post(nvl, EVCH_TRYHARD);
else
fm_nvprint(nvl);
}
#endif
/*
 * Initialize the fault management support: pick default queue length and
 * ereport size if unset, allocate the dump buffer and install the "fm"
 * kstat.  On illumos the error channel and ereport error queue are set up
 * as well.
 */
void
fm_init(void)
{
	kstat_t *kp;

#ifdef illumos
	(void) sysevent_evc_bind(FM_ERROR_CHAN,
	    &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);

	(void) sysevent_evc_control(ereport_chan,
	    EVCH_SET_CHAN_LEN, &ereport_chanlen);
#endif

	/* Only fill in values that were left at zero. */
	if (ereport_qlen == 0)
		ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
	if (ereport_size == 0)
		ereport_size = ERPT_DATA_SZ;

#ifdef illumos
	ereport_errorq = errorq_nvcreate("fm_ereport_queue",
	    (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
	    FM_ERR_PIL, ERRORQ_VITAL);
	if (ereport_errorq == NULL)
		panic("failed to create required ereport error queue");
#endif

	ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
	ereport_dumplen = ereport_size;

	/* Initialize ereport allocation and generation kstats */
	kp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
	    sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (kp == NULL) {
		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
		return;
	}
	kp->ks_data = &erpt_kstat_data;
	kstat_install(kp);
}
#ifdef illumos
/*
 * Formatting helper for fm_nvprintr().  Output is emitted in chunks; a chunk
 * that would reach column 'cols' is moved to a fresh console line so it is
 * not split.  'c' is the current column; the return value is the column
 * after printing.
 */
/*PRINTFLIKE4*/
static int
fm_printf(int depth, int c, int cols, const char *format, ...)
{
	va_list args;
	char probe;
	int len;

	/* Format into a one-byte buffer just to learn the output length. */
	va_start(args, format);
	len = vsnprintf(&probe, sizeof (probe), format, args);
	va_end(args);

	if (c + len >= cols) {
		console_printf("\n\r");
		c = 0;
		/* Indent nested continuation lines that lack a leading blank. */
		if (depth > 0 && format[0] != ' ') {
			console_printf(" ");
			c++;
		}
	}

	va_start(args, format);
	console_vprintf(format, args);
	va_end(args);

	return ((c + len) % cols);
}
/*
* Recursively print a nvlist in the specified column width and return the
* column we end up in. This function is called recursively by fm_nvprint(),
* below. We generically format the entire nvpair using hexadecimal
* integers and strings, and elide any integer arrays. Arrays are basically
* used for cache dumps right now, so we suppress them so as not to overwhelm
* the amount of console output we produce at panic time. This can be further
* enhanced as FMA technology grows based upon the needs of consumers. All
* FMA telemetry is logged using the dump device transport, so the console
* output serves only as a fallback in case this procedure is unsuccessful.
*
* nvl  - list to print
* d    - nesting depth, passed to fm_printf() and incremented for values
*        and nested lists
* c    - current output column
* cols - console width in columns
*/
static int
fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
{
nvpair_t *nvp;
for (nvp = nvlist_next_nvpair(nvl, NULL);
nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
data_type_t type = nvpair_type(nvp);
const char *name = nvpair_name(nvp);
boolean_t b;
uint8_t i8;
uint16_t i16;
uint32_t i32;
uint64_t i64;
char *str;
nvlist_t *cnv;
if (strcmp(name, FM_CLASS) == 0)
continue; /* already printed by caller */
c = fm_printf(d, c, cols, " %s=", name);
switch (type) {
case DATA_TYPE_BOOLEAN:
c = fm_printf(d + 1, c, cols, " 1");
break;
case DATA_TYPE_BOOLEAN_VALUE:
(void) nvpair_value_boolean_value(nvp, &b);
c = fm_printf(d + 1, c, cols, b ? "1" : "0");
break;
case DATA_TYPE_BYTE:
(void) nvpair_value_byte(nvp, &i8);
c = fm_printf(d + 1, c, cols, "%x", i8);
break;
/* Signed values are read into the unsigned temporaries and shown in hex. */
case DATA_TYPE_INT8:
(void) nvpair_value_int8(nvp, (void *)&i8);
c = fm_printf(d + 1, c, cols, "%x", i8);
break;
case DATA_TYPE_UINT8:
(void) nvpair_value_uint8(nvp, &i8);
c = fm_printf(d + 1, c, cols, "%x", i8);
break;
case DATA_TYPE_INT16:
(void) nvpair_value_int16(nvp, (void *)&i16);
c = fm_printf(d + 1, c, cols, "%x", i16);
break;
case DATA_TYPE_UINT16:
(void) nvpair_value_uint16(nvp, &i16);
c = fm_printf(d + 1, c, cols, "%x", i16);
break;
case DATA_TYPE_INT32:
(void) nvpair_value_int32(nvp, (void *)&i32);
c = fm_printf(d + 1, c, cols, "%x", i32);
break;
case DATA_TYPE_UINT32:
(void) nvpair_value_uint32(nvp, &i32);
c = fm_printf(d + 1, c, cols, "%x", i32);
break;
case DATA_TYPE_INT64:
(void) nvpair_value_int64(nvp, (void *)&i64);
c = fm_printf(d + 1, c, cols, "%llx",
(u_longlong_t)i64);
break;
case DATA_TYPE_UINT64:
(void) nvpair_value_uint64(nvp, &i64);
c = fm_printf(d + 1, c, cols, "%llx",
(u_longlong_t)i64);
break;
case DATA_TYPE_HRTIME:
(void) nvpair_value_hrtime(nvp, (void *)&i64);
c = fm_printf(d + 1, c, cols, "%llx",
(u_longlong_t)i64);
break;
case DATA_TYPE_STRING:
(void) nvpair_value_string(nvp, &str);
c = fm_printf(d + 1, c, cols, "\"%s\"",
str ? str : "");
break;
/* Nested lists are printed recursively between brackets. */
case DATA_TYPE_NVLIST:
c = fm_printf(d + 1, c, cols, "[");
(void) nvpair_value_nvlist(nvp, &cnv);
c = fm_nvprintr(cnv, d + 1, c, cols);
c = fm_printf(d + 1, c, cols, " ]");
break;
case DATA_TYPE_NVLIST_ARRAY: {
nvlist_t **val;
uint_t i, nelem;
c = fm_printf(d + 1, c, cols, "[");
(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
for (i = 0; i < nelem; i++) {
c = fm_nvprintr(val[i], d + 1, c, cols);
}
c = fm_printf(d + 1, c, cols, " ]");
}
break;
/* Scalar and string arrays are elided; only a placeholder is shown. */
case DATA_TYPE_BOOLEAN_ARRAY:
case DATA_TYPE_BYTE_ARRAY:
case DATA_TYPE_INT8_ARRAY:
case DATA_TYPE_UINT8_ARRAY:
case DATA_TYPE_INT16_ARRAY:
case DATA_TYPE_UINT16_ARRAY:
case DATA_TYPE_INT32_ARRAY:
case DATA_TYPE_UINT32_ARRAY:
case DATA_TYPE_INT64_ARRAY:
case DATA_TYPE_UINT64_ARRAY:
case DATA_TYPE_STRING_ARRAY:
c = fm_printf(d + 1, c, cols, "[...]");
break;
/*
 * NOTE(review): the format string below is empty, so nothing is printed
 * for unknown types; confirm the literal has not lost its text.
 */
case DATA_TYPE_UNKNOWN:
c = fm_printf(d + 1, c, cols, "");
break;
}
}
return (c);
}
/*
 * Print an nvlist on the console: the FM class string first (when the list
 * has one), then every other pair via fm_nvprintr(), wrapped at
 * ereport_cols columns.
 */
void
fm_nvprint(nvlist_t *nvl)
{
	char *cls;
	int col = 0;

	console_printf("\r");

	if (nvlist_lookup_string(nvl, FM_CLASS, &cls) == 0)
		col = fm_printf(0, col, ereport_cols, "%s", cls);

	/* Finish a partially filled line before the trailing blank line. */
	col = fm_nvprintr(nvl, 0, col, ereport_cols);
	if (col != 0)
		console_printf("\n");

	console_printf("\n");
}
/*
 * Wrapper for panic() that first produces an FMA-style message for admins.
 * Such messages are normally generated by fmd(1M)'s syslog-msgs agent; this
 * is the one exception and the only error that gets messaged here.  It is
 * meant for subsystems that have detected a fatal error, enqueued the
 * appropriate ereports, and now want to force a panic.
 */
/*PRINTFLIKE1*/
void
fm_panic(const char *format, ...)
{
	va_list args;

	/* Record the format string only if no earlier fm_panic() did. */
	(void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);

#if defined(__i386) || defined(__amd64)
	fastreboot_disable_highpil();
#endif /* __i386 || __amd64 */

	va_start(args, format);
	vpanic(format, args);
	va_end(args);
}
/*
 * Tell the caller whether fm_panicstr is set, i.e. whether an FMA event
 * caused the panic.  If so, something other than the default panic
 * diagnosis method will diagnose the cause.  Returns 1 or 0.
 */
int
is_fm_panic()
{
	return (fm_panicstr ? 1 : 0);
}
/*
* Print any appropriate FMA banner message before the panic message. This
* function is called by panicsys() and prints the message for fm_panic().
* We print the message here so that it comes after the system is quiesced.
* A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
* The rest of the message is for the console only and not needed in the log,
* so it is printed using console_printf(). We break it up into multiple
* chunks so as to avoid overflowing any small legacy prom_printf() buffers.
*/
void
fm_banner(void)
{
timespec_t tod;
hrtime_t now;
if (!fm_panicstr)
return; /* panic was not initiated by fm_panic(); do nothing */
/* After a panic use the recorded panic timestamps, else read the clocks. */
- if (panicstr) {
+ if (KERNEL_PANICKED()) {
tod = panic_hrestime;
now = panic_hrtime;
} else {
gethrestime(&tod);
now = gethrtime_waitfree();
}
cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
"TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
console_printf(
"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
console_printf(
"PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
"SOURCE: %s, REV: %s %s\n",
platform, utsname.nodename, utsname.sysname,
utsname.release, utsname.version);
console_printf(
"DESC: Errors have been detected that require a reboot to ensure system\n"
"integrity. See %s/%s for more information.\n",
fm_url, fm_msgid);
console_printf(
"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
console_printf("\n");
}
/*
* Utility function to write all of the pending ereports to the dump device.
* This function is called at either normal reboot or panic time, and simply
* iterates over the in-transit messages in the ereport sysevent channel.
*/
void
fm_ereport_dump(void)
{
evchanq_t *chq;
sysevent_t *sep;
erpt_dump_t ed;
timespec_t tod;
hrtime_t now;
char *buf;
size_t len;
/* The error queue is drained only when the kernel has not panicked. */
- if (panicstr) {
+ if (KERNEL_PANICKED()) {
tod = panic_hrestime;
now = panic_hrtime;
} else {
if (ereport_errorq != NULL)
errorq_drain(ereport_errorq);
gethrestime(&tod);
now = gethrtime_waitfree();
}
/*
* In the panic case, sysevent_evc_walk_init() will return NULL.
*/
if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
- !panicstr)
+ !KERNEL_PANICKED())
return; /* event channel isn't initialized yet */
/* Each event is written as an erpt_dump_t header followed by its payload. */
while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
break;
ed.ed_magic = ERPT_MAGIC;
ed.ed_chksum = checksum32(buf, len);
ed.ed_size = (uint32_t)len;
ed.ed_pad = 0;
ed.ed_hrt_nsec = SE_TIME(sep);
ed.ed_hrt_base = now;
ed.ed_tod_base.sec = tod.tv_sec;
ed.ed_tod_base.nsec = tod.tv_nsec;
dumpvp_write(&ed, sizeof (ed));
dumpvp_write(buf, len);
}
sysevent_evc_walk_fini(chq);
}
#endif
/*
 * Post an error report (ereport) to the sysevent error channel.  The error
 * channel must have been established by a prior call to
 * sysevent_evc_create() before publication may occur.
 *
 * An ereport whose native encoding is empty or larger than ERPT_DATA_SZ is
 * dropped and counted in the erpt-dropped kstat.
 */
void
fm_ereport_post(nvlist_t *ereport, int evc_flag)
{
	size_t encoded_len = 0;
	evchan_t *error_chan;
	sysevent_id_t eid;

	(void) nvlist_size(ereport, &encoded_len, NV_ENCODE_NATIVE);
	if (encoded_len == 0 || encoded_len > ERPT_DATA_SZ) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		return;
	}

#ifdef illumos
	if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
	    EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		return;
	}

	/* A failed publish is counted as a drop; unbind in either case. */
	if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
	    SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0)
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
	(void) sysevent_evc_unbind(error_chan);
#else
	(void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
#endif
}
/*
 * Wrappers for FM nvlist allocators
 */

/*
 * Allocation hook: returns 'size' bytes from kmem_zalloc() with KM_SLEEP.
 * 'nva' is unused.
 */
/* ARGSUSED */
static void *
i_fm_alloc(nv_alloc_t *nva, size_t size)
{
	void *mem = kmem_zalloc(size, KM_SLEEP);

	return (mem);
}
/*
 * Free hook: hands 'buf' of 'size' bytes back to kmem_free().
 * 'nva' is unused.
 */
/* ARGSUSED */
static void
i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
{

	kmem_free(buf, size);
}
/*
 * Allocator operations handed to nv_alloc_init() by fm_nvlist_create() when
 * the caller does not supply its own nv_alloc_t.  Only the third and fourth
 * slots are populated (i_fm_alloc and i_fm_free).
 * NOTE(review): slot order presumably follows nv_alloc_ops_t (init, fini,
 * alloc, free, reset) -- confirm against the nvpair header.
 */
const nv_alloc_ops_t fm_mem_alloc_ops = {
NULL,
NULL,
i_fm_alloc,
i_fm_free,
NULL
};
/*
 * Create and initialize a new nv_alloc_t that allocates out of the fixed
 * buffer buf of bufsz bytes (nv_fixed_ops).
 *
 * Returns the new nv_alloc_t on success.  Returns NULL, after releasing the
 * handle again, when bufsz is zero or nv_alloc_init() fails.
 */
nv_alloc_t *
fm_nva_xcreate(char *buf, size_t bufsz)
{
	nv_alloc_t *hdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);

	if (bufsz != 0 && nv_alloc_init(hdl, nv_fixed_ops, buf, bufsz) == 0)
		return (hdl);

	kmem_free(hdl, sizeof (nv_alloc_t));
	return (NULL);
}
/*
 * Destroy a previously allocated nv_alloc structure: nv_alloc_fini() is run
 * on it and the nv_alloc_t itself is then returned to kmem.  The fixed
 * buffer associated with nva must be freed by the caller.
 */
void
fm_nva_xdestroy(nv_alloc_t *nva)
{
nv_alloc_fini(nva);
kmem_free(nva, sizeof (nv_alloc_t));
}
/*
 * Create a new nv list.  A pointer to the new nv list is returned on
 * success; NULL is returned if it could not be created.  The list is
 * created and managed by the operations installed in nva.  If nva is NULL,
 * a private nv_alloc_t using the default FMA operations (fm_mem_alloc_ops)
 * is allocated for the list; it is torn down again if list creation fails.
 *
 * When called from the kernel with nva == NULL, this function must be
 * called from passive kernel context with no locks held that can prevent a
 * sleeping memory allocation from occurring.  Otherwise, this function may
 * be called from other kernel contexts as long as a valid nva created via
 * fm_nva_xcreate() is supplied.
 */
nvlist_t *
fm_nvlist_create(nv_alloc_t *nva)
{
	nv_alloc_t *hdl = nva;
	nvlist_t *list;
	int own_hdl = (nva == NULL);

	if (own_hdl) {
		hdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
		if (nv_alloc_init(hdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
			kmem_free(hdl, sizeof (nv_alloc_t));
			return (NULL);
		}
	}

	if (nvlist_xalloc(&list, NV_UNIQUE_NAME, hdl) == 0)
		return (list);

	/* Only undo the allocator we created ourselves. */
	if (own_hdl) {
		nv_alloc_fini(hdl);
		kmem_free(hdl, sizeof (nv_alloc_t));
	}
	return (NULL);
}
/*
 * Destroy a previously allocated nvlist.  flag selects whether the nv_alloc
 * structure associated with the list is destroyed as well (FM_NVA_FREE) or
 * kept (any other value, normally FM_NVA_RETAIN) so it can be re-used for
 * future nvlist creation.
 */
void
fm_nvlist_destroy(nvlist_t *nvl, int flag)
{
	/* Look the allocator up before the list that references it is freed. */
	nv_alloc_t *hdl = nvlist_lookup_nv_alloc(nvl);

	nvlist_free(nvl);

	if (hdl == NULL || flag != FM_NVA_FREE)
		return;
	fm_nva_xdestroy(hdl);
}
/*
 * Add a NULL-terminated sequence of members to payload.
 *
 * name is the first member name.  For every member, ap supplies a
 * data_type_t followed by the value; array types supply an int element
 * count and then the array pointer.  After each member the next name is
 * read from ap, and a NULL name ends the sequence.  Processing stops after
 * the first member that fails to be added.
 *
 * Returns 0 on success, the error returned by the failing nvlist_add_*()
 * call, or EINVAL for a data type that has no case below.
 */
int
i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
{
int nelem, ret = 0;
data_type_t type;
while (ret == 0 && name != NULL) {
type = va_arg(ap, data_type_t);
switch (type) {
case DATA_TYPE_BYTE:
/* Types narrower than int are fetched as int/uint_t. */
ret = nvlist_add_byte(payload, name,
va_arg(ap, uint_t));
break;
case DATA_TYPE_BYTE_ARRAY:
/* Arrays: the element count precedes the array pointer. */
nelem = va_arg(ap, int);
ret = nvlist_add_byte_array(payload, name,
va_arg(ap, uchar_t *), nelem);
break;
case DATA_TYPE_BOOLEAN_VALUE:
ret = nvlist_add_boolean_value(payload, name,
va_arg(ap, boolean_t));
break;
case DATA_TYPE_BOOLEAN_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_boolean_array(payload, name,
va_arg(ap, boolean_t *), nelem);
break;
case DATA_TYPE_INT8:
ret = nvlist_add_int8(payload, name,
va_arg(ap, int));
break;
case DATA_TYPE_INT8_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_int8_array(payload, name,
va_arg(ap, int8_t *), nelem);
break;
case DATA_TYPE_UINT8:
ret = nvlist_add_uint8(payload, name,
va_arg(ap, uint_t));
break;
case DATA_TYPE_UINT8_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_uint8_array(payload, name,
va_arg(ap, uint8_t *), nelem);
break;
case DATA_TYPE_INT16:
ret = nvlist_add_int16(payload, name,
va_arg(ap, int));
break;
case DATA_TYPE_INT16_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_int16_array(payload, name,
va_arg(ap, int16_t *), nelem);
break;
case DATA_TYPE_UINT16:
ret = nvlist_add_uint16(payload, name,
va_arg(ap, uint_t));
break;
case DATA_TYPE_UINT16_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_uint16_array(payload, name,
va_arg(ap, uint16_t *), nelem);
break;
case DATA_TYPE_INT32:
ret = nvlist_add_int32(payload, name,
va_arg(ap, int32_t));
break;
case DATA_TYPE_INT32_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_int32_array(payload, name,
va_arg(ap, int32_t *), nelem);
break;
case DATA_TYPE_UINT32:
ret = nvlist_add_uint32(payload, name,
va_arg(ap, uint32_t));
break;
case DATA_TYPE_UINT32_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_uint32_array(payload, name,
va_arg(ap, uint32_t *), nelem);
break;
case DATA_TYPE_INT64:
ret = nvlist_add_int64(payload, name,
va_arg(ap, int64_t));
break;
case DATA_TYPE_INT64_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_int64_array(payload, name,
va_arg(ap, int64_t *), nelem);
break;
case DATA_TYPE_UINT64:
ret = nvlist_add_uint64(payload, name,
va_arg(ap, uint64_t));
break;
case DATA_TYPE_UINT64_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_uint64_array(payload, name,
va_arg(ap, uint64_t *), nelem);
break;
case DATA_TYPE_STRING:
ret = nvlist_add_string(payload, name,
va_arg(ap, char *));
break;
case DATA_TYPE_STRING_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_string_array(payload, name,
va_arg(ap, char **), nelem);
break;
case DATA_TYPE_NVLIST:
ret = nvlist_add_nvlist(payload, name,
va_arg(ap, nvlist_t *));
break;
case DATA_TYPE_NVLIST_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_nvlist_array(payload, name,
va_arg(ap, nvlist_t **), nelem);
break;
default:
/* Unhandled type: no value is consumed from ap for it. */
ret = EINVAL;
}
/* Next member name; NULL (or a non-zero ret) ends the loop. */
name = va_arg(ap, char *);
}
return (ret);
}
/*
 * Variadic front end to i_fm_payload_set(): add a NULL-terminated list of
 * (name, type, value) members to payload.  A failure is not reported to the
 * caller; it is counted in the payload_set_failed kstat.
 */
void
fm_payload_set(nvlist_t *payload, ...)
{
	va_list args;
	const char *first;
	int err;

	va_start(args, payload);
	first = va_arg(args, char *);
	err = i_fm_payload_set(payload, first, args);
	va_end(args);

	if (err != 0)
		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
}
/*
 * Set-up and validate the members of an ereport event according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	class			string		ereport
 *	version			uint8_t		0
 *	ena			uint64_t
 *	detector		nvlist_t
 *	ereport-payload		nvlist_t
 *
 * We don't actually add a 'version' member to the payload.  Really,
 * the version quoted to us by our caller is that of the category 1
 * "ereport" event class (and we require FM_EREPORT_VERS0) but
 * the payload version of the actual leaf class event under construction
 * may be something else.  Callers should supply a version in the varargs,
 * or (better) we could take two version arguments - one for the
 * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
 * for the leaf class.
 *
 * A wrong version or a failure to add the class aborts the set-up; a
 * failure to add the ena or the detector is counted but the remaining
 * members are still attempted.  Every failure bumps the erpt_set_failed
 * kstat.
 */
void
fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
    uint64_t ena, const nvlist_t *detector, ...)
{
	uint64_t *failedp = &erpt_kstat_data.erpt_set_failed.value.ui64;
	char full_class[FM_MAX_CLASS];
	const char *first;
	va_list args;
	int err;

	if (version != FM_EREPORT_VERS0) {
		atomic_inc_64(failedp);
		return;
	}

	/* The stored class is "<FM_EREPORT_CLASS>.<erpt_class>". */
	(void) snprintf(full_class, FM_MAX_CLASS, "%s.%s",
	    FM_EREPORT_CLASS, erpt_class);
	if (nvlist_add_string(ereport, FM_CLASS, full_class) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena) != 0)
		atomic_inc_64(failedp);

	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
	    (nvlist_t *)detector) != 0)
		atomic_inc_64(failedp);

	/* The trailing varargs carry the ereport payload members. */
	va_start(args, detector);
	first = va_arg(args, const char *);
	err = i_fm_payload_set(ereport, first, args);
	va_end(args);

	if (err != 0)
		atomic_inc_64(failedp);
}
/*
* Set-up and validate the members of an hc fmri according to:
*
* Member name Type Value
* ===================================================
* version uint8_t 0
* auth nvlist_t
* hc-name string
* hc-id string
*
* Note that auth and hc-id are optional members.
*/
/* Capacity of the on-stack pairs[] arrays used to assemble an hc-list. */
#define HC_MAXPAIRS 20
/* NOTE(review): not referenced by the code visible here -- confirm it is still needed. */
#define HC_MAXNAMELEN 50
/*
 * Add the members shared by every hc fmri: the version, the "hc" scheme
 * name and, when auth is not NULL, the authority.
 *
 * Returns 1 on success.  Returns 0, after bumping the fmri_set_failed
 * kstat once, if version is not FM_HC_SCHEME_VERSION or any member cannot
 * be added; nothing further is attempted after the first failure.
 */
static int
fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
{
	int ok;

	ok = version == FM_HC_SCHEME_VERSION &&
	    nvlist_add_uint8(fmri, FM_VERSION, version) == 0 &&
	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) == 0 &&
	    (auth == NULL || nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) == 0);

	if (!ok)
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);

	return (ok);
}
/*
 * Fill in an hc fmri.  After the common members, the varargs are consumed
 * as npairs (const char *name, uint32_t id) tuples; at most HC_MAXPAIRS of
 * them are used.  Each tuple becomes an nvlist holding hc-name and hc-id
 * (the id formatted in decimal) and the resulting array is stored as the
 * hc-list.  snvl, when not NULL, is added as the hc-specific member.
 *
 * Failures are counted in the fmri_set_failed kstat; apart from a failure
 * in the common members they do not stop the remaining steps.
 */
void
fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, int npairs, ...)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	va_list ap;
	int idx;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	npairs = MIN(npairs, HC_MAXPAIRS);

	va_start(ap, npairs);
	for (idx = 0; idx < npairs; idx++) {
		const char *hc_name = va_arg(ap, const char *);
		uint32_t hc_id = va_arg(ap, uint32_t);
		char idbuf[11];	/* "4294967295" plus the terminating NUL */

		(void) snprintf(idbuf, sizeof (idbuf), "%u", hc_id);

		pairs[idx] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[idx], FM_FMRI_HC_NAME,
		    hc_name) != 0 ||
		    nvlist_add_string(pairs[idx], FM_FMRI_HC_ID, idbuf) != 0)
			atomic_inc_64(failedp);
	}
	va_end(ap);

	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
		atomic_inc_64(failedp);

	/* The temporary pair lists are released; the shared nva is kept. */
	for (idx = 0; idx < npairs; idx++)
		fm_nvlist_destroy(pairs[idx], FM_NVA_RETAIN);

	if (snvl != NULL &&
	    nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0)
		atomic_inc_64(failedp);
}
/*
 * Set-up and validate the members of a dev fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t
 *	devpath			string
 *	[devid]			string
 *	[target-port-l0id]	string
 *
 * auth, devid and target-port-l0id are optional: each is skipped when the
 * corresponding argument is NULL.  A wrong version aborts the set-up;
 * otherwise every member is attempted and the fmri_set_failed kstat is
 * bumped once if any of them could not be added.
 */
void
fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
    const char *devpath, const char *devid, const char *tpl0)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
	int failed = 0;

	if (version != DEV_SCHEME_VERSION0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint8(fmri_dev, FM_VERSION, version) != 0)
		failed = 1;
	if (nvlist_add_string(fmri_dev, FM_FMRI_SCHEME,
	    FM_FMRI_SCHEME_DEV) != 0)
		failed = 1;
	if (auth != NULL && nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0)
		failed = 1;
	if (nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath) != 0)
		failed = 1;
	if (devid != NULL &&
	    nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid) != 0)
		failed = 1;
	if (tpl0 != NULL &&
	    nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0) != 0)
		failed = 1;

	if (failed)
		atomic_inc_64(failedp);
}
/*
 * Set-up and validate the members of a cpu fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t
 *	cpuid			uint32_t
 *	cpumask			uint8_t
 *	serial			string
 *
 * auth, cpumask and serial are optional members: each is skipped, without
 * being counted as a failure, when the corresponding argument is NULL.
 *
 * A version below CPU_SCHEME_VERSION1, or a failure to add the version or
 * the scheme, aborts the set-up.  Failures on the remaining members are
 * counted in the fmri_set_failed kstat but do not stop processing.
 */
void
fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
    uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;

	if (version < CPU_SCHEME_VERSION1) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
	    FM_FMRI_SCHEME_CPU) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0)
		atomic_inc_64(failedp);

	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
		atomic_inc_64(failedp);

	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
	    *cpu_maskp) != 0)
		atomic_inc_64(failedp);

	/*
	 * serial is optional, like cpumask above: an absent serial is
	 * skipped rather than recorded as a failed set.
	 */
	if (serial_idp != NULL && nvlist_add_string(fmri_cpu,
	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
		atomic_inc_64(failedp);
}
/*
 * Set-up and validate the members of a mem fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	[optional]
 *	unum			string
 *	serial			string		[optional*]
 *	offset			uint64_t	[optional]
 *
 *	* serial is required if offset is present
 *
 * An offset of (uint64_t)-1 means "no offset".  A wrong version, an offset
 * without a serial, or a failure to add the version or the scheme aborts
 * the set-up; later failures are counted in the fmri_set_failed kstat and
 * processing continues.  The serial is stored as a one-element string
 * array.
 */
void
fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    const char *unum, const char *serial, uint64_t offset)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
	const uint64_t no_offset = (uint64_t)-1;

	/* Fatal conditions: the first one that trips is counted once. */
	if (version != MEM_SCHEME_VERSION0 ||
	    (serial == NULL && offset != no_offset) ||
	    nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0)
		atomic_inc_64(failedp);

	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0)
		atomic_inc_64(failedp);

	if (serial == NULL)
		return;

	if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
	    (char **)&serial, 1) != 0)
		atomic_inc_64(failedp);

	if (offset != no_offset &&
	    nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET, offset) != 0)
		atomic_inc_64(failedp);
}
/*
 * Set-up the members of a zfs fmri: version, scheme, pool guid and, when
 * vdev_guid is non-zero, the vdev guid.
 *
 * A version other than ZFS_SCHEME_VERSION0, or a failure to add the version
 * or the scheme, aborts the set-up.  Failures to add either guid are
 * counted in the fmri_set_failed kstat and processing continues.
 */
void
fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
    uint64_t vdev_guid)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;

	if (version != ZFS_SCHEME_VERSION0 ||
	    nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0)
		atomic_inc_64(failedp);

	if (vdev_guid != 0 &&
	    nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0)
		atomic_inc_64(failedp);
}
/*
 * Return ena with its generation field advanced by one, by adding one unit
 * at the generation shift of the ENA's format.  Returns 0 for an ENA whose
 * format is neither FM_ENA_FMT1 nor FM_ENA_FMT2.
 */
uint64_t
fm_ena_increment(uint64_t ena)
{
	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		return (ena + (1 << ENA_FMT1_GEN_SHFT));
	case FM_ENA_FMT2:
		return (ena + (1 << ENA_FMT2_GEN_SHFT));
	default:
		return (0);
	}
}
/*
 * Build an ENA of the requested format.
 *
 * FM_ENA_FMT1 packs the format, cpuid and a time value; a timestamp of 0
 * means "use the current gethrtime_waitfree() value".  FM_ENA_FMT2 packs
 * the format and the timestamp exactly as given.  Any other format yields
 * 0.
 */
uint64_t
fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
{
	uint64_t ena = 0;
	uint64_t when;

	switch (format) {
	case FM_ENA_FMT1:
		/*
		 * Convert the clock reading to uint64_t before it is
		 * shifted: gethrtime_waitfree() presumably returns a signed
		 * hrtime_t (TODO confirm), and left-shifting a signed value
		 * past its range is undefined.  The unsigned shift simply
		 * discards the high bits, which the time mask would drop
		 * anyway.
		 */
		when = (timestamp != 0) ? timestamp :
		    (uint64_t)gethrtime_waitfree();
		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
		    ((cpuid << ENA_FMT1_CPUID_SHFT) &
		    ENA_FMT1_CPUID_MASK) |
		    ((when << ENA_FMT1_TIME_SHFT) &
		    ENA_FMT1_TIME_MASK));
		break;
	case FM_ENA_FMT2:
		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
		    ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
		break;
	default:
		break;
	}

	return (ena);
}
/*
 * Generate an ENA for the current CPU: calls fm_ena_generate_cpu() with the
 * cpuid obtained from PCPU_GET(cpuid).
 */
uint64_t
fm_ena_generate(uint64_t timestamp, uchar_t format)
{
return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
}
/*
 * Extract the generation field from ena using the mask and shift of its
 * format.  Returns 0 for a format other than FM_ENA_FMT1 or FM_ENA_FMT2.
 */
uint64_t
fm_ena_generation_get(uint64_t ena)
{
	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		return ((ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT);
	case FM_ENA_FMT2:
		return ((ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT);
	default:
		return (0);
	}
}
/*
 * Return the format field of ena, as computed by ENA_FORMAT().
 */
uchar_t
fm_ena_format_get(uint64_t ena)
{
return (ENA_FORMAT(ena));
}
/*
 * Extract the id field from ena using the mask and shift of its format.
 * Returns 0 for a format other than FM_ENA_FMT1 or FM_ENA_FMT2.
 */
uint64_t
fm_ena_id_get(uint64_t ena)
{
	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		return ((ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT);
	case FM_ENA_FMT2:
		return ((ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT);
	default:
		return (0);
	}
}
/*
 * Extract the time field from ena using the mask and shift of its format.
 * Returns 0 for a format other than FM_ENA_FMT1 or FM_ENA_FMT2.
 */
uint64_t
fm_ena_time_get(uint64_t ena)
{
	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		return ((ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT);
	case FM_ENA_FMT2:
		return ((ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT);
	default:
		return (0);
	}
}
#ifdef illumos
/*
 * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
 * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
 *
 * At most FM_STK_DEPTH frames are recorded; deeper traces are truncated.
 * Frames with no symbol are recorded as a bare hexadecimal address.
 */
void
fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
{
	int i;
	char *sym;
	ulong_t off;
	char *stkpp[FM_STK_DEPTH];
	char buf[FM_STK_DEPTH * FM_SYM_SZ];
	char *stkp = buf;

	for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
		if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
			(void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
		else
			(void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
		stkpp[i] = stkp;
	}

	/*
	 * Publish only the i entries that were actually filled in.  Passing
	 * depth here would make the nvlist code read beyond stkpp[] whenever
	 * depth exceeds FM_STK_DEPTH.
	 */
	fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
	    DATA_TYPE_STRING_ARRAY, i, stkpp, NULL);
}
#endif
#ifdef illumos
/*
 * Report through uprintf() that process p, a member of contract ct_id, was
 * killed because of a hardware error.
 */
void
print_msg_hwerr(ctid_t ct_id, proc_t *p)
{
uprintf("Killed process %d (%s) in contract id %d "
"due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
}
#endif
/*
 * Build an hc fmri whose hc-list starts with the (hc-name, hc-id) pairs of
 * bboard's hc-list and continues with npairs (const char *name,
 * uint32_t id) tuples taken from the varargs.  snvl, when not NULL, is
 * added as the hc-specific member.
 *
 * The combined list is limited to HC_MAXPAIRS entries: a bboard list that
 * is already longer than that is rejected, and surplus vararg tuples are
 * ignored.  A negative npairs is treated as zero.
 *
 * Any failure bumps the fmri_set_failed kstat once and abandons the
 * set-up; the temporary pair lists are released on every path.
 */
void
fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	nvlist_t **hcl;
	uint_t n, i, extra;
	uint_t count = 0;	/* number of pairs[] slots assigned so far */
	int ok = 0;
	va_list ap;
	char *hcname, *hcid;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	/*
	 * copy the bboard nvpairs to the pairs array; refuse a list that
	 * would not fit
	 */
	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
	    != 0 || n > HC_MAXPAIRS) {
		atomic_inc_64(failedp);
		return;
	}

	for (i = 0; i < n; i++) {
		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
		    &hcname) != 0 ||
		    nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0)
			goto out;

		/* Count the slot first so cleanup also covers a failed add. */
		pairs[count] = fm_nvlist_create(nva);
		count++;
		if (nvlist_add_string(pairs[count - 1], FM_FMRI_HC_NAME,
		    hcname) != 0 ||
		    nvlist_add_string(pairs[count - 1], FM_FMRI_HC_ID,
		    hcid) != 0)
			goto out;
	}

	/*
	 * create the pairs from passed in pairs, limited to the room left
	 * in pairs[]
	 */
	extra = (npairs > 0) ? (uint_t)npairs : 0;
	if (extra > HC_MAXPAIRS - n)
		extra = HC_MAXPAIRS - n;

	va_start(ap, npairs);
	for (i = 0; i < extra; i++) {
		const char *name = va_arg(ap, const char *);
		uint32_t id = va_arg(ap, uint32_t);
		char idstr[11];	/* "4294967295" plus the terminating NUL */

		(void) snprintf(idstr, sizeof (idstr), "%u", id);

		pairs[count] = fm_nvlist_create(nva);
		count++;
		if (nvlist_add_string(pairs[count - 1], FM_FMRI_HC_NAME,
		    name) != 0 ||
		    nvlist_add_string(pairs[count - 1], FM_FMRI_HC_ID,
		    idstr) != 0)
			break;
	}
	/* va_end() is reached on the error path too. */
	va_end(ap);
	if (i < extra)
		goto out;

	/*
	 * Create the fmri hc list
	 */
	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, count) == 0)
		ok = 1;

out:
	/* The temporary pair lists are released; the shared nva is kept. */
	for (i = 0; i < count; i++) {
		if (pairs[i] != NULL)
			fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
	}

	if (!ok) {
		atomic_inc_64(failedp);
		return;
	}

	if (snvl != NULL &&
	    nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0)
		atomic_inc_64(failedp);
}
Index: head/sys/ddb/db_textdump.c
===================================================================
--- head/sys/ddb/db_textdump.c (revision 356654)
+++ head/sys/ddb/db_textdump.c (revision 356655)
@@ -1,563 +1,563 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2007 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Kernel text-dump support: write a series of text files to the dump
* partition for later recovery, including captured DDB output, kernel
* configuration, message buffer, and panic message. This allows for a more
* compact representation of critical debugging information than traditional
* binary dumps, as well as allowing dump information to be used without
* access to kernel symbols, source code, etc.
*
* Storage Layout
* --------------
*
* Crash dumps are aligned to the end of the dump or swap partition in order
* to minimize the chances of swap during fsck eating into the dump. However,
* unlike a memory dump, we don't know the size of the textdump a priori, so
* can't just write it out sequentially in order from a known starting point
* calculated with respect to the end of the partition. In order to address
* this, we actually write out the textdump in reverse block order, allowing
* us to directly align it to the end of the partition and then write out the
* dump header and trailer before and after it once done. savecore(8) must
* know to reverse the order of the blocks in order to produce a readable
* file.
*
* Data is written out in the ustar file format so that we can write data
* incrementally as a stream without reference to previous files.
*
* TODO
* ----
*
* - Allow subsystems to register to submit files for inclusion in the text
* dump in a generic way.
*/
#include
__FBSDID("$FreeBSD$");
#include "opt_config.h"
#include "opt_ddb.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
 * Parent sysctl node "textdump" under _debug_ddb; the textdump knobs in
 * this file are attached to it.
 */
static SYSCTL_NODE(_debug_ddb, OID_AUTO, textdump, CTLFLAG_RW, 0,
"DDB textdump options");
/*
 * Don't touch the first SIZEOF_METADATA bytes on the dump device.  This is
 * to protect us from metadata and metadata from us.  textdump_writeblock()
 * rejects any offset below this value.
 */
#define SIZEOF_METADATA (64*1024)
/*
 * Data is written out as a series of files in the ustar tar format.  ustar
 * is a simple streamed format consisting of a series of files prefixed with
 * headers, and all padded to 512-byte block boundaries, which maps
 * conveniently to our requirements.
 *
 * The per-field notes below describe what textdump_mkustar() stores; fields
 * without a note are left zeroed by it.
 */
struct ustar_header {
char uh_filename[100];	/* File name. */
char uh_mode[8];	/* TAR_MODE. */
char uh_tar_owner[8];	/* TAR_UID. */
char uh_tar_group[8];	/* TAR_GID. */
char uh_size[12];	/* File size, printed in octal. */
char uh_mtime[12];	/* time_second, printed in octal. */
char uh_sum[8];	/* Header checksum; see ustar_checksum(). */
char uh_type;	/* Always 0. */
char uh_linkfile[100];
char uh_ustar[6];	/* TAR_USTAR. */
char uh_version[2];
char uh_owner[32];	/* TAR_USER. */
char uh_group[32];	/* TAR_GROUP. */
char uh_major[8];
char uh_minor[8];
char uh_filenameprefix[155];
char uh_zeropad[12];	/* Brings the structure up to a full block. */
} __packed;
/*
 * Various size assertions -- pretty much everything must be one block in
 * size.  Both the kernel dump header and the ustar header are written with
 * textdump_writenextblock(), which always writes TEXTDUMP_BLOCKSIZE bytes.
 */
CTASSERT(sizeof(struct kerneldumpheader) == TEXTDUMP_BLOCKSIZE);
CTASSERT(sizeof(struct ustar_header) == TEXTDUMP_BLOCKSIZE);
/*
 * Is a textdump scheduled?  If so, the shutdown code will invoke our dumpsys
 * routine instead of the machine-dependent kernel dump routine.  Defaults to
 * 1 when the kernel is built with TEXTDUMP_PREFERRED, otherwise 0, and can
 * be changed at run time through the "pending" sysctl below.
 */
#ifdef TEXTDUMP_PREFERRED
int textdump_pending = 1;
#else
int textdump_pending = 0;
#endif
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, pending, CTLFLAG_RW,
&textdump_pending, 0,
"Perform textdump instead of regular kernel dump.");
/*
 * Various constants for tar headers and contents: the owner/group names and
 * ids, mode and magic stored in every header by textdump_mkustar(), and the
 * names of the files placed in the archive.
 */
#define TAR_USER "root"
#define TAR_GROUP "wheel"
#define TAR_UID "0"
#define TAR_GID "0"
#define TAR_MODE "0600"
#define TAR_USTAR "ustar"
#define TAR_CONFIG_FILENAME "config.txt" /* Kernel configuration. */
#define TAR_MSGBUF_FILENAME "msgbuf.txt" /* Kernel message buffer. */
#define TAR_PANIC_FILENAME "panic.txt" /* Panic message. */
#define TAR_VERSION_FILENAME "version.txt" /* Kernel version. */
/*
 * Configure which files will be dumped.  Each flag defaults to 1 (include
 * the file) and is exported read/write under the textdump sysctl node; the
 * flags are consulted by textdump_dumpsys().  The config knob only exists
 * when the kernel is built with INCLUDE_CONFIG_FILE.
 */
#ifdef INCLUDE_CONFIG_FILE
static int textdump_do_config = 1;
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_config, CTLFLAG_RW,
&textdump_do_config, 0, "Dump kernel configuration in textdump");
#endif
static int textdump_do_ddb = 1;
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_ddb, CTLFLAG_RW,
&textdump_do_ddb, 0, "Dump DDB captured output in textdump");
static int textdump_do_msgbuf = 1;
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_msgbuf, CTLFLAG_RW,
&textdump_do_msgbuf, 0, "Dump kernel message buffer in textdump");
static int textdump_do_panic = 1;
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_panic, CTLFLAG_RW,
&textdump_do_panic, 0, "Dump kernel panic message in textdump");
static int textdump_do_version = 1;
SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_version, CTLFLAG_RW,
&textdump_do_version, 0, "Dump kernel version string in textdump");
/*
 * State related to incremental writing of blocks to disk.  textdump_error
 * is sticky: once dump_write() has failed, textdump_writeblock() returns
 * the stored error without attempting further I/O.
 */
static off_t textdump_offset; /* Offset of next sequential write. */
static int textdump_error; /* Carried write error, if any. */
/*
 * Statically allocate space to prepare block-sized headers and data.
 */
char textdump_block_buffer[TEXTDUMP_BLOCKSIZE];
static struct kerneldumpheader kdh;
/*
* Calculate and fill in the checksum for a ustar header.
*/
static void
ustar_checksum(struct ustar_header *uhp)
{
u_int sum;
int i;
for (i = 0; i < sizeof(uhp->uh_sum); i++)
uhp->uh_sum[i] = ' ';
sum = 0;
for (i = 0; i < sizeof(*uhp); i++)
sum += ((u_char *)uhp)[i];
snprintf(uhp->uh_sum, sizeof(uhp->uh_sum), "%6o", sum);
}
/*
 * Each file in the tarball has a block-sized header with its name and other,
 * largely hard-coded, properties.
 *
 * The header for a file called filename with size bytes of data is built in
 * block_buffer: the block is zeroed, the fields are filled in (the
 * modification time is taken from time_second) and the checksum is computed
 * last, once every other field is final.  With TEXTDUMP_VERBOSE, a progress
 * message is printed as long as no write error has been recorded.
 */
void
textdump_mkustar(char *block_buffer, const char *filename, u_int size)
{
	struct ustar_header *hdr = (struct ustar_header *)block_buffer;

#ifdef TEXTDUMP_VERBOSE
	if (textdump_error == 0)
		printf("textdump: creating '%s'.\n", filename);
#endif

	bzero(hdr, sizeof(*hdr));

	/* Name, permissions and numeric ownership. */
	strlcpy(hdr->uh_filename, filename, sizeof(hdr->uh_filename));
	strlcpy(hdr->uh_mode, TAR_MODE, sizeof(hdr->uh_mode));
	strlcpy(hdr->uh_tar_owner, TAR_UID, sizeof(hdr->uh_tar_owner));
	strlcpy(hdr->uh_tar_group, TAR_GID, sizeof(hdr->uh_tar_group));

	/* Size and modification time, both in octal. */
	snprintf(hdr->uh_size, sizeof(hdr->uh_size), "%o", size);
	snprintf(hdr->uh_mtime, sizeof(hdr->uh_mtime), "%lo",
	    (unsigned long)time_second);

	/* Entry type, magic and symbolic ownership. */
	hdr->uh_type = 0;
	strlcpy(hdr->uh_ustar, TAR_USTAR, sizeof(hdr->uh_ustar));
	strlcpy(hdr->uh_owner, TAR_USER, sizeof(hdr->uh_owner));
	strlcpy(hdr->uh_group, TAR_GROUP, sizeof(hdr->uh_group));

	ustar_checksum(hdr);
}
/*
 * textdump_writeblock() writes TEXTDUMP_BLOCKSIZE-sized blocks of data to
 * the space between di->mediaoffset and di->mediaoffset + di->mediasize.  It
 * accepts an offset relative to di->mediaoffset.  If we're carrying any
 * error from previous I/O, return that error and don't continue to try to
 * write.  Most writers ignore the error and forge ahead on the basis that
 * there's not much you can do.
 *
 * Returns EIO if the block would extend past the end of the media and
 * ENOSPC if it would fall inside the first SIZEOF_METADATA bytes; neither
 * of those is recorded as the carried error.  Otherwise returns the result
 * of dump_write(), which is recorded and reported with printf() when it is
 * non-zero.
 */
static int
textdump_writeblock(struct dumperinfo *di, off_t offset, char *buffer)
{
	if (textdump_error != 0)
		return (textdump_error);

	/* Range checks: stay inside the media and clear of the metadata. */
	if (offset + TEXTDUMP_BLOCKSIZE > di->mediasize)
		return (EIO);
	if (offset < SIZEOF_METADATA)
		return (ENOSPC);

	textdump_error = dump_write(di, buffer, 0, offset + di->mediaoffset,
	    TEXTDUMP_BLOCKSIZE);
	if (textdump_error == 0)
		return (0);

	printf("textdump_writeblock: offset %jd, error %d\n", (intmax_t)offset,
	    textdump_error);
	return (textdump_error);
}
/*
 * Interfaces to save and restore the dump offset, so that printers can go
 * back to rewrite a header if required, while avoiding their knowing about
 * the global layout of the blocks.
 *
 * If we ever want to support writing textdumps to tape or other
 * stream-oriented target, we'll need to remove this.
 *
 * textdump_saveoff() stores the current write offset in *offsetp.
 */
void
textdump_saveoff(off_t *offsetp)
{
*offsetp = textdump_offset;
}
/*
 * Set the current write offset to offset, normally a value previously
 * obtained with textdump_saveoff().
 */
void
textdump_restoreoff(off_t offset)
{
textdump_offset = offset;
}
/*
 * Interface to write the "next block" relative to the current offset; since
 * we write backwards from the end of the partition, we subtract, but there's
 * no reason for the caller to know this.
 *
 * The offset is moved back by one block whether or not the write succeeded.
 * Returns the result of textdump_writeblock().
 */
int
textdump_writenextblock(struct dumperinfo *di, char *buffer)
{
	int rv = textdump_writeblock(di, textdump_offset, buffer);

	textdump_offset -= TEXTDUMP_BLOCKSIZE;
	return (rv);
}
#ifdef INCLUDE_CONFIG_FILE
extern char kernconfstring[];
/*
 * Dump kernel configuration: write a ustar header for TAR_CONFIG_FILENAME
 * followed by the contents of kernconfstring.  Write errors are ignored
 * here; they are carried in the textdump error state.
 */
static void
textdump_dump_config(struct dumperinfo *di)
{
	char *cursor = kernconfstring;
	u_int remaining, total;

	total = strlen(kernconfstring);
	textdump_mkustar(textdump_block_buffer, TAR_CONFIG_FILENAME, total);
	(void)textdump_writenextblock(di, textdump_block_buffer);

	/*
	 * Whole blocks are written straight out of the string; whatever is
	 * left over is copied into the local buffer and zero-padded to a
	 * full block.
	 */
	for (remaining = total; remaining >= TEXTDUMP_BLOCKSIZE;
	    remaining -= TEXTDUMP_BLOCKSIZE) {
		(void)textdump_writenextblock(di, cursor);
		cursor += TEXTDUMP_BLOCKSIZE;
	}
	if (remaining != 0) {
		bzero(textdump_block_buffer, TEXTDUMP_BLOCKSIZE);
		bcopy(cursor, textdump_block_buffer, remaining);
		(void)textdump_writenextblock(di, textdump_block_buffer);
	}
}
#endif /* INCLUDE_CONFIG_FILE */
/*
 * Dump kernel message buffer as TAR_MSGBUF_FILENAME.
 *
 * The amount of data is not known up front (nul bytes are skipped), so a
 * placeholder header with size 0 is written first, the data blocks follow,
 * and the header is then rewritten in place with the real length.  Write
 * errors are ignored here; they are carried in the textdump error state.
 */
static void
textdump_dump_msgbuf(struct dumperinfo *di)
{
off_t end_offset, tarhdr_offset;
u_int i, len, offset, seq, total_len;
char buf[16];
/*
* Write out a dummy tar header to advance the offset; we'll rewrite
* it later once we know the true size.
*/
textdump_saveoff(&tarhdr_offset);
textdump_mkustar(textdump_block_buffer, TAR_MSGBUF_FILENAME, 0);
(void)textdump_writenextblock(di, textdump_block_buffer);
/*
* Copy out the data in small chunks, but don't copy nuls that may be
* present if the message buffer has not yet completely filled at
* least once.
*/
total_len = 0;
offset = 0;
/* NOTE(review): presumably initializes seq for the peek loop below -- confirm against msgbuf_peekbytes(). */
msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
while ((len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq)) > 0) {
for (i = 0; i < len; i++) {
if (buf[i] == '\0')
continue;
textdump_block_buffer[offset] = buf[i];
offset++;
/* Keep accumulating until the block buffer is full. */
if (offset != sizeof(textdump_block_buffer))
continue;
(void)textdump_writenextblock(di,
textdump_block_buffer);
total_len += offset;
offset = 0;
}
}
total_len += offset; /* Without the zero-padding. */
if (offset != 0) {
/* Zero-pad and flush the final, partially filled block. */
bzero(textdump_block_buffer + offset,
sizeof(textdump_block_buffer) - offset);
(void)textdump_writenextblock(di, textdump_block_buffer);
}
/*
* Rewrite tar header to reflect how much was actually written.
*/
textdump_saveoff(&end_offset);
textdump_restoreoff(tarhdr_offset);
textdump_mkustar(textdump_block_buffer, TAR_MSGBUF_FILENAME,
total_len);
(void)textdump_writenextblock(di, textdump_block_buffer);
/* Continue after the data blocks, not after the rewritten header. */
textdump_restoreoff(end_offset);
}
/*
 * Dump the panic message as TAR_PANIC_FILENAME: a ustar header followed by
 * a single zero-padded block holding at most TEXTDUMP_BLOCKSIZE bytes of
 * panicstr.  panicstr is dereferenced unconditionally, so it must not be
 * NULL.
 */
static void
textdump_dump_panic(struct dumperinfo *di)
{
	u_int msglen = min(strlen(panicstr), TEXTDUMP_BLOCKSIZE);

	/* Header first -- we store up to one block of panic message. */
	textdump_mkustar(textdump_block_buffer, TAR_PANIC_FILENAME, msglen);
	(void)textdump_writenextblock(di, textdump_block_buffer);

	/* Then the message itself, zero-padded to a full block. */
	bzero(textdump_block_buffer, sizeof(textdump_block_buffer));
	bcopy(panicstr, textdump_block_buffer, msglen);
	(void)textdump_writenextblock(di, textdump_block_buffer);
}
/*
 * Dump the kernel version string as TAR_VERSION_FILENAME: a ustar header
 * followed by a single zero-padded block holding at most
 * TEXTDUMP_BLOCKSIZE bytes of version.
 */
static void
textdump_dump_version(struct dumperinfo *di)
{
	u_int verlen = min(strlen(version), TEXTDUMP_BLOCKSIZE);

	/* Header first -- at most one block of version information. */
	textdump_mkustar(textdump_block_buffer, TAR_VERSION_FILENAME, verlen);
	(void)textdump_writenextblock(di, textdump_block_buffer);

	/* Then the version string itself, zero-padded to a full block. */
	bzero(textdump_block_buffer, sizeof(textdump_block_buffer));
	bcopy(version, textdump_block_buffer, verlen);
	(void)textdump_writenextblock(di, textdump_block_buffer);
}
/*
* Commit text dump to disk.
*
* Writes a kernel dump header and trailer around a series of ustar members
* (DDB capture, kernel config, message buffer, panic message, version),
* each gated by its own textdump_do_* flag.  Returns early, before any
* write, when the dump device block size differs from TEXTDUMP_BLOCKSIZE
* or the media is too small.  di->kdcrypto is cleared for the duration of
* the dump and restored before returning.  The outcome is reported on the
* console, and textdump_pending is cleared on every path that gets past
* the two initial checks.
*/
void
textdump_dumpsys(struct dumperinfo *di)
{
struct kerneldumpcrypto *kdc;
off_t dumplen, trailer_offset;
/*
* NOTE(review): the message below has no trailing newline, unlike the
* other diagnostics printed by this function.
*/
if (di->blocksize != TEXTDUMP_BLOCKSIZE) {
printf("Dump partition block size (%ju) not textdump "
"block size (%ju)", (uintmax_t)di->blocksize,
(uintmax_t)TEXTDUMP_BLOCKSIZE);
return;
}
/*
* We don't know a priori how large the dump will be, but we do know
* that we need to reserve space for metadata and that we need two
* dump headers. Also leave room for one ustar header and one block
* of data.
*
* NOTE(review): the comparison below only accounts for the metadata
* and the two dump headers; the ustar header and data block mentioned
* above are not part of it -- confirm whether that is intended.
*/
if (di->mediasize < SIZEOF_METADATA + 2 * sizeof(kdh)) {
printf("Insufficient space on dump partition for minimal textdump.\n");
return;
}
/* Start with a clean error state; it is examined after all writes. */
textdump_error = 0;
/*
* Disable EKCD because we don't provide encrypted textdumps.
*/
kdc = di->kdcrypto;
di->kdcrypto = NULL;
/*
* Position the start of the dump so that we'll write the kernel dump
* trailer immediately before the end of the partition, and then work
* our way back. We will rewrite this header later to reflect the
* true size if things go well.
*/
textdump_offset = di->mediasize - sizeof(kdh);
textdump_saveoff(&trailer_offset);
dump_init_header(di, &kdh, TEXTDUMPMAGIC, KERNELDUMP_TEXT_VERSION, 0);
(void)textdump_writenextblock(di, (char *)&kdh);
/*
* Write a series of files in ustar format.
*/
if (textdump_do_ddb)
db_capture_dump(di);
#ifdef INCLUDE_CONFIG_FILE
if (textdump_do_config)
textdump_dump_config(di);
#endif
if (textdump_do_msgbuf)
textdump_dump_msgbuf(di);
- if (textdump_do_panic && panicstr != NULL)
+ if (textdump_do_panic && KERNEL_PANICKED())
textdump_dump_panic(di);
if (textdump_do_version)
textdump_dump_version(di);
/*
* Now that we know the true size, we can write out the header, then
* seek back to the end and rewrite the trailer with the correct
* size.
*/
/*
* The length is the distance between the saved trailer position and the
* block following the current write position.
*/
dumplen = trailer_offset - (textdump_offset + TEXTDUMP_BLOCKSIZE);
dump_init_header(di, &kdh, TEXTDUMPMAGIC, KERNELDUMP_TEXT_VERSION,
dumplen);
(void)textdump_writenextblock(di, (char *)&kdh);
textdump_restoreoff(trailer_offset);
(void)textdump_writenextblock(di, (char *)&kdh);
/*
* Terminate the dump, report any errors, and clear the pending flag.
*/
/* The zero-length dump_write() is only issued when no error was recorded. */
if (textdump_error == 0)
(void)dump_write(di, NULL, 0, 0, 0);
if (textdump_error == ENOSPC)
printf("Textdump: Insufficient space on dump partition\n");
else if (textdump_error != 0)
printf("Textdump: Error %d writing dump\n", textdump_error);
else
printf("Textdump complete.\n");
textdump_pending = 0;
/*
* Restore EKCD status.
*/
di->kdcrypto = kdc;
}
/*-
* DDB(4) command to manage textdumps:
*
* textdump set - request a textdump
* textdump status - print DDB output textdump status
* textdump unset - clear textdump request
* textdump dump - request a textdump and call doadump() right away
*/
/*
* Print the one-line usage summary for the "textdump" DDB command.
*/
static void
db_textdump_usage(void)
{
db_printf("textdump [unset|set|status|dump]\n");
}
/*
 * Handler for the DDB "textdump" command.
 *
 * The command line must consist of exactly one identifier followed by
 * end-of-line; anything else prints the usage summary.  Recognised
 * subcommands are "set", "status", "unset" and "dump".  The addr,
 * have_addr, count and modif arguments are not used.
 */
void
db_textdump_cmd(db_expr_t addr, bool have_addr, db_expr_t count, char *modif)
{
	const char *subcmd;

	/* Tokens are consumed in order; the second read is skipped on error. */
	if (db_read_token() != tIDENT || db_read_token() != tEOL) {
		db_textdump_usage();
		return;
	}

	subcmd = db_tok_string;

	if (strcmp(subcmd, "set") == 0) {
		textdump_pending = 1;
		db_printf("textdump set\n");
		return;
	}
	if (strcmp(subcmd, "status") == 0) {
		if (!textdump_pending)
			db_printf("textdump is not set\n");
		else
			db_printf("textdump is set\n");
		return;
	}
	if (strcmp(subcmd, "unset") == 0) {
		textdump_pending = 0;
		db_printf("textdump unset\n");
		return;
	}
	if (strcmp(subcmd, "dump") == 0) {
		/* Mark the dump as a textdump, then trigger it immediately. */
		textdump_pending = 1;
		doadump(true);
		return;
	}

	db_textdump_usage();
}
Index: head/sys/dev/acpica/acpi.c
===================================================================
--- head/sys/dev/acpica/acpi.c (revision 356654)
+++ head/sys/dev/acpica/acpi.c (revision 356655)
@@ -1,4298 +1,4298 @@
/*-
* Copyright (c) 2000 Takanori Watanabe
* Copyright (c) 2000 Mitsuru IWASAKI
* Copyright (c) 2000, 2001 Michael Smith
* Copyright (c) 2000 BSDi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include
__FBSDID("$FreeBSD$");
#include "opt_acpi.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(__i386__) || defined(__amd64__)
#include
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Malloc type used for the per-child struct acpi_device allocations. */
static MALLOC_DEFINE(M_ACPIDEV, "acpidev", "ACPI devices");
/* Hooks for the ACPI CA debugging infrastructure */
#define _COMPONENT ACPI_BUS
ACPI_MODULE_NAME("ACPI")
/* Entry points of the "acpi" control device created in acpi_attach(). */
static d_open_t acpiopen;
static d_close_t acpiclose;
static d_ioctl_t acpiioctl;
static struct cdevsw acpi_cdevsw = {
.d_version = D_VERSION,
.d_open = acpiopen,
.d_close = acpiclose,
.d_ioctl = acpiioctl,
.d_name = "acpi",
};
/*
* A list of ACPI strings together with its element count.
* NOTE(review): presumably holds the interface names parsed from the
* hw.acpi.install_interface / remove_interface tunables -- confirm.
*/
struct acpi_interface {
ACPI_STRING *data;
int num;
};
/*
* NULL-terminated PNP ID lists.
* NOTE(review): the names suggest system-resource and PCI link devices;
* their users are outside this part of the file.
*/
static char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL };
static char *pcilink_ids[] = { "PNP0C0F", NULL };
/* Global mutex for locking access to the ACPI subsystem. */
struct mtx acpi_mutex;
/* Callout armed at the end of acpi_attach() to run acpi_sleep_enable(). */
struct callout acpi_sleep_timer;
/* Bitmap of device quirks. */
int acpi_quirks;
/* Supported sleep states. */
static BOOLEAN acpi_sleep_states[ACPI_S_STATE_COUNT];
/*
* Forward declarations for the file-local functions used below, including
* the device/bus method implementations referenced from acpi_methods[].
*/
static void acpi_lookup(void *arg, const char *name, device_t *dev);
static int acpi_modevent(struct module *mod, int event, void *junk);
static int acpi_probe(device_t dev);
static int acpi_attach(device_t dev);
static int acpi_suspend(device_t dev);
static int acpi_resume(device_t dev);
static int acpi_shutdown(device_t dev);
static device_t acpi_add_child(device_t bus, u_int order, const char *name,
int unit);
static int acpi_print_child(device_t bus, device_t child);
static void acpi_probe_nomatch(device_t bus, device_t child);
static void acpi_driver_added(device_t dev, driver_t *driver);
static int acpi_read_ivar(device_t dev, device_t child, int index,
uintptr_t *result);
static int acpi_write_ivar(device_t dev, device_t child, int index,
uintptr_t value);
static struct resource_list *acpi_get_rlist(device_t dev, device_t child);
static void acpi_reserve_resources(device_t dev);
static int acpi_sysres_alloc(device_t dev);
static int acpi_set_resource(device_t dev, device_t child, int type,
int rid, rman_res_t start, rman_res_t count);
static struct resource *acpi_alloc_resource(device_t bus, device_t child,
int type, int *rid, rman_res_t start, rman_res_t end,
rman_res_t count, u_int flags);
static int acpi_adjust_resource(device_t bus, device_t child, int type,
struct resource *r, rman_res_t start, rman_res_t end);
static int acpi_release_resource(device_t bus, device_t child, int type,
int rid, struct resource *r);
static void acpi_delete_resource(device_t bus, device_t child, int type,
int rid);
static uint32_t acpi_isa_get_logicalid(device_t dev);
static int acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count);
static int acpi_device_id_probe(device_t bus, device_t dev, char **ids, char **match);
static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev,
ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters,
ACPI_BUFFER *ret);
static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level,
void *context, void **retval);
static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev,
int max_depth, acpi_scan_cb_t user_fn, void *arg);
static int acpi_set_powerstate(device_t child, int state);
static int acpi_isa_pnp_probe(device_t bus, device_t child,
struct isa_pnp_id *ids);
static void acpi_probe_children(device_t bus);
static void acpi_probe_order(ACPI_HANDLE handle, int *order);
static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level,
void *context, void **status);
static void acpi_sleep_enable(void *arg);
static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc);
static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state);
static void acpi_shutdown_final(void *arg, int howto);
static void acpi_enable_fixed_events(struct acpi_softc *sc);
static BOOLEAN acpi_has_hid(ACPI_HANDLE handle);
static void acpi_resync_clock(struct acpi_softc *sc);
static int acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate);
static int acpi_wake_run_prep(ACPI_HANDLE handle, int sstate);
static int acpi_wake_prep_walk(int sstate);
static int acpi_wake_sysctl_walk(device_t dev);
static int acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS);
static void acpi_system_eventhandler_sleep(void *arg, int state);
static void acpi_system_eventhandler_wakeup(void *arg, int state);
static int acpi_sname2sstate(const char *sname);
static const char *acpi_sstate2sname(int sstate);
static int acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS);
static int acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS);
static int acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS);
static int acpi_pm_func(u_long cmd, void *arg, ...);
static int acpi_child_location_str_method(device_t acdev, device_t child,
char *buf, size_t buflen);
static int acpi_child_pnpinfo_str_method(device_t acdev, device_t child,
char *buf, size_t buflen);
static void acpi_enable_pcie(void);
static void acpi_hint_device_unit(device_t acdev, device_t child,
const char *name, int *unitp);
static void acpi_reset_interfaces(device_t dev);
/*
* Method table for the ACPI root bus driver.  Entries are grouped by
* interface (device, bus, ACPI bus, ISA emulation).  Methods named
* bus_generic_* are referenced as-is rather than implemented in this file.
*/
static device_method_t acpi_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, acpi_probe),
DEVMETHOD(device_attach, acpi_attach),
DEVMETHOD(device_shutdown, acpi_shutdown),
DEVMETHOD(device_detach, bus_generic_detach),
DEVMETHOD(device_suspend, acpi_suspend),
DEVMETHOD(device_resume, acpi_resume),
/* Bus interface */
DEVMETHOD(bus_add_child, acpi_add_child),
DEVMETHOD(bus_print_child, acpi_print_child),
DEVMETHOD(bus_probe_nomatch, acpi_probe_nomatch),
DEVMETHOD(bus_driver_added, acpi_driver_added),
DEVMETHOD(bus_read_ivar, acpi_read_ivar),
DEVMETHOD(bus_write_ivar, acpi_write_ivar),
DEVMETHOD(bus_get_resource_list, acpi_get_rlist),
DEVMETHOD(bus_set_resource, acpi_set_resource),
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_alloc_resource, acpi_alloc_resource),
DEVMETHOD(bus_adjust_resource, acpi_adjust_resource),
DEVMETHOD(bus_release_resource, acpi_release_resource),
DEVMETHOD(bus_delete_resource, acpi_delete_resource),
DEVMETHOD(bus_child_pnpinfo_str, acpi_child_pnpinfo_str_method),
DEVMETHOD(bus_child_location_str, acpi_child_location_str_method),
DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
DEVMETHOD(bus_hint_device_unit, acpi_hint_device_unit),
DEVMETHOD(bus_get_cpus, acpi_get_cpus),
DEVMETHOD(bus_get_domain, acpi_get_domain),
/* ACPI bus */
DEVMETHOD(acpi_id_probe, acpi_device_id_probe),
DEVMETHOD(acpi_evaluate_object, acpi_device_eval_obj),
DEVMETHOD(acpi_pwr_for_sleep, acpi_device_pwr_for_sleep),
DEVMETHOD(acpi_scan_children, acpi_device_scan_children),
/* ISA emulation */
DEVMETHOD(isa_pnp_probe, acpi_isa_pnp_probe),
DEVMETHOD_END
};
/* Driver description: name, method table and per-device softc size. */
static driver_t acpi_driver = {
"acpi",
acpi_methods,
sizeof(struct acpi_softc),
};
static devclass_t acpi_devclass;
/* Register the driver under nexus, with acpi_modevent() as event handler. */
DRIVER_MODULE(acpi, nexus, acpi_driver, acpi_devclass, acpi_modevent, 0);
MODULE_VERSION(acpi, 1);
ACPI_SERIAL_DECL(acpi, "ACPI root bus");
/* Local pools for managing system resources for ACPI child devices. */
static struct rman acpi_rman_io, acpi_rman_mem;
/*
* Multiplied by hz in acpi_attach() to delay the callout that runs
* acpi_sleep_enable(), i.e. the time after attach before sleep requests
* are allowed.
*/
#define ACPI_MINIMUM_AWAKETIME 5
/* Holds the description of the acpi0 device. */
static char acpi_desc[ACPI_OEM_ID_SIZE + ACPI_OEM_TABLE_ID_SIZE + 2];
SYSCTL_NODE(_debug, OID_AUTO, acpi, CTLFLAG_RD, NULL, "ACPI debugging");
/* Filled in by acpi_identify() with ACPI_CA_VERSION formatted as hex. */
static char acpi_ca_version[12];
SYSCTL_STRING(_debug_acpi, OID_AUTO, acpi_ca_version, CTLFLAG_RD,
acpi_ca_version, 0, "Version of Intel ACPI-CA");
/*
* Allow overriding _OSI methods.
*/
static char acpi_install_interface[256];
TUNABLE_STR("hw.acpi.install_interface", acpi_install_interface,
sizeof(acpi_install_interface));
static char acpi_remove_interface[256];
TUNABLE_STR("hw.acpi.remove_interface", acpi_remove_interface,
sizeof(acpi_remove_interface));
/* Allow users to dump Debug objects without ACPI debugger. */
static int acpi_debug_objects;
TUNABLE_INT("debug.acpi.enable_debug_objects", &acpi_debug_objects);
SYSCTL_PROC(_debug_acpi, OID_AUTO, enable_debug_objects,
CTLFLAG_RW | CTLTYPE_INT, NULL, 0, acpi_debug_objects_sysctl, "I",
"Enable Debug objects");
/* Allow the interpreter to ignore common mistakes in BIOS. */
static int acpi_interpreter_slack = 1;
TUNABLE_INT("debug.acpi.interpreter_slack", &acpi_interpreter_slack);
SYSCTL_INT(_debug_acpi, OID_AUTO, interpreter_slack, CTLFLAG_RDTUN,
&acpi_interpreter_slack, 1, "Turn on interpreter slack mode.");
/* Ignore register widths set by FADT and use default widths instead. */
static int acpi_ignore_reg_width = 1;
TUNABLE_INT("debug.acpi.default_register_width", &acpi_ignore_reg_width);
SYSCTL_INT(_debug_acpi, OID_AUTO, default_register_width, CTLFLAG_RDTUN,
&acpi_ignore_reg_width, 1, "Ignore register widths set by FADT");
/* Allow users to override quirks. */
TUNABLE_INT("debug.acpi.quirks", &acpi_quirks);
/* When set, devices are exercised for suspend without actually sleeping. */
int acpi_susp_bounce;
SYSCTL_INT(_debug_acpi, OID_AUTO, suspend_bounce, CTLFLAG_RW,
&acpi_susp_bounce, 0, "Don't actually suspend, just test devices.");
/*
 * Module event handler.
 *
 * ACPI can only be brought in by the loader: loading it once the system
 * has finished bootstrapping is refused with EPERM.  Unloading is refused
 * with EBUSY once the system is running with ACPI as the registered power
 * management type, since the whole bus hierarchy hangs off this driver.
 * Every other event is accepted without action.
 */
static int
acpi_modevent(struct module *mod, int event, void *junk)
{

    if (event == MOD_LOAD) {
        if (!cold) {
            printf("The ACPI driver cannot be loaded after boot.\n");
            return (EPERM);
        }
    } else if (event == MOD_UNLOAD) {
        if (!cold && power_pm_get_type() == POWER_PM_TYPE_ACPI)
            return (EBUSY);
    }

    return (0);
}
/*
 * One-time early initialisation of ACPICA.
 *
 * Initialises the ACPICA subsystem and its table manager, then works out
 * the quirks for this machine.  Later calls return AE_OK without doing any
 * work.  Returns the failing ACPICA status on error, or AE_SUPPORT when
 * the machine is marked as broken and the user has not overridden that
 * with an explicit "disabled=0" hint.
 */
ACPI_STATUS
acpi_Startup(void)
{
    static int already_run = 0;
    ACPI_STATUS status;
    int disabled_hint;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    /* Only run the startup code once.  The MADT driver also calls this. */
    if (already_run)
        return_VALUE (AE_OK);
    already_run = 1;

    /* Bring up the ACPICA core. */
    status = AcpiInitializeSubsystem();
    if (ACPI_FAILURE(status)) {
        printf("ACPI: Could not initialize Subsystem: %s\n",
            AcpiFormatException(status));
        return_VALUE (status);
    }

    /*
     * Pre-allocate room for the RSDT/XSDT and DSDT; the table array is
     * allowed to grow if more tables turn up.
     */
    status = AcpiInitializeTables(NULL, 2, TRUE);
    if (ACPI_FAILURE(status)) {
        printf("ACPI: Table initialisation failed: %s\n",
            AcpiFormatException(status));
        return_VALUE (status);
    }

    /* Derive quirks from the tables unless some were already set. */
    if (acpi_quirks == ACPI_Q_OK)
        acpi_table_quirks(&acpi_quirks);

    /* An explicit "disabled=0" hint force-enables ACPI. */
    if (resource_int_value("acpi", 0, "disabled", &disabled_hint) == 0 &&
        disabled_hint == 0)
        acpi_quirks &= ~ACPI_Q_BROKEN;

    if (acpi_quirks & ACPI_Q_BROKEN) {
        printf("ACPI disabled by blacklist.  Contact your BIOS vendor.\n");
        status = AE_SUPPORT;
    }

    return_VALUE (status);
}
/*
* Detect ACPI and perform early initialisation.
*/
int
acpi_identify(void)
{
ACPI_TABLE_RSDP *rsdp;
ACPI_TABLE_HEADER *rsdt;
ACPI_PHYSICAL_ADDRESS paddr;
struct sbuf sb;
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
if (!cold)
return (ENXIO);
/* Check that we haven't been disabled with a hint. */
if (resource_disabled("acpi", 0))
return (ENXIO);
/* Check for other PM systems. */
if (power_pm_get_type() != POWER_PM_TYPE_NONE &&
power_pm_get_type() != POWER_PM_TYPE_ACPI) {
printf("ACPI identify failed, other PM system enabled.\n");
return (ENXIO);
}
/* Initialize root tables. */
if (ACPI_FAILURE(acpi_Startup())) {
printf("ACPI: Try disabling either ACPI or apic support.\n");
return (ENXIO);
}
if ((paddr = AcpiOsGetRootPointer()) == 0 ||
(rsdp = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_RSDP))) == NULL)
return (ENXIO);
if (rsdp->Revision > 1 && rsdp->XsdtPhysicalAddress != 0)
paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->XsdtPhysicalAddress;
else
paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->RsdtPhysicalAddress;
AcpiOsUnmapMemory(rsdp, sizeof(ACPI_TABLE_RSDP));
if ((rsdt = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_HEADER))) == NULL)
return (ENXIO);
sbuf_new(&sb, acpi_desc, sizeof(acpi_desc), SBUF_FIXEDLEN);
sbuf_bcat(&sb, rsdt->OemId, ACPI_OEM_ID_SIZE);
sbuf_trim(&sb);
sbuf_putc(&sb, ' ');
sbuf_bcat(&sb, rsdt->OemTableId, ACPI_OEM_TABLE_ID_SIZE);
sbuf_trim(&sb);
sbuf_finish(&sb);
sbuf_delete(&sb);
AcpiOsUnmapMemory(rsdt, sizeof(ACPI_TABLE_HEADER));
snprintf(acpi_ca_version, sizeof(acpi_ca_version), "%x", ACPI_CA_VERSION);
return (0);
}
/*
* Fetch some descriptive data from ACPI to put in our attach message.
*
* Sets the device description to acpi_desc, the string that
* acpi_identify() builds, and always returns BUS_PROBE_NOWILDCARD.
*/
static int
acpi_probe(device_t dev)
{
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
device_set_desc(dev, acpi_desc);
return_VALUE (BUS_PROBE_NOWILDCARD);
}
/*
* Attach the ACPI root bus device.
*
* Initialises the resource managers and the global ACPI mutex, copies the
* tunables into ACPICA globals, loads the namespace, enables the subsystem
* and initialises the ACPI objects.  It then builds the sysctl tree under
* hw.<device name>, picks default sleep states, registers the shutdown and
* sleep/wakeup event handlers, creates the "acpi" control device, probes
* the children and arms the callout that later runs acpi_sleep_enable().
*
* Returns 0 on success.  ENXIO is returned when loading the namespace,
* enabling the subsystem or initialising objects fails; if
* acpi_machdep_init() fails, its error is returned.
*
* NOTE(review): the failure paths jump straight to "out" without undoing
* the setup done so far (rmans, mutex, sysctl context, control device) --
* confirm that this is intended.
*/
static int
acpi_attach(device_t dev)
{
struct acpi_softc *sc;
ACPI_STATUS status;
int error, state;
UINT32 flags;
UINT8 TypeA, TypeB;
char *env;
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
sc = device_get_softc(dev);
sc->acpi_dev = dev;
callout_init(&sc->susp_force_to, 1);
/* Default result for every early "goto out" below. */
error = ENXIO;
/* Initialize resource manager. */
acpi_rman_io.rm_type = RMAN_ARRAY;
acpi_rman_io.rm_start = 0;
acpi_rman_io.rm_end = 0xffff;
acpi_rman_io.rm_descr = "ACPI I/O ports";
if (rman_init(&acpi_rman_io) != 0)
panic("acpi rman_init IO ports failed");
acpi_rman_mem.rm_type = RMAN_ARRAY;
acpi_rman_mem.rm_descr = "ACPI I/O memory addresses";
if (rman_init(&acpi_rman_mem) != 0)
panic("acpi rman_init memory failed");
/* Initialise the ACPI mutex */
mtx_init(&acpi_mutex, "ACPI global lock", NULL, MTX_DEF);
/*
* Set the globals from our tunables. This is needed because ACPI-CA
* uses UINT8 for some values and we have no tunable_byte.
*/
AcpiGbl_EnableInterpreterSlack = acpi_interpreter_slack ? TRUE : FALSE;
AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE;
AcpiGbl_UseDefaultRegisterWidths = acpi_ignore_reg_width ? TRUE : FALSE;
#ifndef ACPI_DEBUG
/*
* Disable all debugging layers and levels.
*/
AcpiDbgLayer = 0;
AcpiDbgLevel = 0;
#endif
/* Override OS interfaces if the user requested. */
acpi_reset_interfaces(dev);
/* Load ACPI name space. */
status = AcpiLoadTables();
if (ACPI_FAILURE(status)) {
device_printf(dev, "Could not load Namespace: %s\n",
AcpiFormatException(status));
goto out;
}
/* Handle MCFG table if present. */
acpi_enable_pcie();
/*
* Note that some systems (specifically, those with namespace evaluation
* issues that require the avoidance of parts of the namespace) must
* avoid running _INI and _STA on everything, as well as dodging the final
* object init pass.
*
* For these devices, we set ACPI_NO_DEVICE_INIT and ACPI_NO_OBJECT_INIT.
*
* XXX We should arrange for the object init pass after we have attached
* all our child devices, but on many systems it works here.
*/
flags = 0;
if (testenv("debug.acpi.avoid"))
flags = ACPI_NO_DEVICE_INIT | ACPI_NO_OBJECT_INIT;
/* Bring the hardware and basic handlers online. */
if (ACPI_FAILURE(status = AcpiEnableSubsystem(flags))) {
device_printf(dev, "Could not enable ACPI: %s\n",
AcpiFormatException(status));
goto out;
}
/*
* Call the ECDT probe function to provide EC functionality before
* the namespace has been evaluated.
*
* XXX This happens before the sysresource devices have been probed and
* attached so its resources come from nexus0. In practice, this isn't
* a problem but should be addressed eventually.
*/
acpi_ec_ecdt_probe(dev);
/* Bring device objects and regions online. */
if (ACPI_FAILURE(status = AcpiInitializeObjects(flags))) {
device_printf(dev, "Could not initialize ACPI objects: %s\n",
AcpiFormatException(status));
goto out;
}
/*
* Setup our sysctl tree.
*
* XXX: This doesn't check to make sure that none of these fail.
*/
sysctl_ctx_init(&sc->acpi_sysctl_ctx);
sc->acpi_sysctl_tree = SYSCTL_ADD_NODE(&sc->acpi_sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO,
device_get_name(dev), CTLFLAG_RD, 0, "");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "supported_sleep_state", CTLTYPE_STRING | CTLFLAG_RD,
0, 0, acpi_supported_sleep_state_sysctl, "A",
"List supported ACPI sleep states.");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "power_button_state", CTLTYPE_STRING | CTLFLAG_RW,
&sc->acpi_power_button_sx, 0, acpi_sleep_state_sysctl, "A",
"Power button ACPI sleep state.");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "sleep_button_state", CTLTYPE_STRING | CTLFLAG_RW,
&sc->acpi_sleep_button_sx, 0, acpi_sleep_state_sysctl, "A",
"Sleep button ACPI sleep state.");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "lid_switch_state", CTLTYPE_STRING | CTLFLAG_RW,
&sc->acpi_lid_switch_sx, 0, acpi_sleep_state_sysctl, "A",
"Lid ACPI sleep state. Set to S3 if you want to suspend your laptop when close the Lid.");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "standby_state", CTLTYPE_STRING | CTLFLAG_RW,
&sc->acpi_standby_sx, 0, acpi_sleep_state_sysctl, "A", "");
SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "suspend_state", CTLTYPE_STRING | CTLFLAG_RW,
&sc->acpi_suspend_sx, 0, acpi_sleep_state_sysctl, "A", "");
SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "sleep_delay", CTLFLAG_RW, &sc->acpi_sleep_delay, 0,
"sleep delay in seconds");
SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "s4bios", CTLFLAG_RW, &sc->acpi_s4bios, 0, "S4BIOS mode");
SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "verbose", CTLFLAG_RW, &sc->acpi_verbose, 0, "verbose mode");
SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "disable_on_reboot", CTLFLAG_RW,
&sc->acpi_do_disable, 0, "Disable ACPI when rebooting/halting system");
SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
OID_AUTO, "handle_reboot", CTLFLAG_RW,
&sc->acpi_handle_reboot, 0, "Use ACPI Reset Register to reboot");
/*
* Default to 1 second before sleeping to give some machines time to
* stabilize.
*/
sc->acpi_sleep_delay = 1;
if (bootverbose)
sc->acpi_verbose = 1;
/* Any value other than "0" for hw.acpi.verbose turns verbose mode on. */
if ((env = kern_getenv("hw.acpi.verbose")) != NULL) {
if (strcmp(env, "0") != 0)
sc->acpi_verbose = 1;
freeenv(env);
}
/* Only enable reboot by default if the FADT says it is available. */
if (AcpiGbl_FADT.Flags & ACPI_FADT_RESET_REGISTER)
sc->acpi_handle_reboot = 1;
#if !ACPI_REDUCED_HARDWARE
/* Only enable S4BIOS by default if the FACS says it is available. */
if (AcpiGbl_FACS != NULL && AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT)
sc->acpi_s4bios = 1;
#endif
/* Probe all supported sleep states. */
acpi_sleep_states[ACPI_STATE_S0] = TRUE;
/* A state counts as supported only when both ACPICA calls succeed. */
for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++)
if (ACPI_SUCCESS(AcpiEvaluateObject(ACPI_ROOT_OBJECT,
__DECONST(char *, AcpiGbl_SleepStateNames[state]), NULL, NULL)) &&
ACPI_SUCCESS(AcpiGetSleepTypeData(state, &TypeA, &TypeB)))
acpi_sleep_states[state] = TRUE;
/*
* Dispatch the default sleep state to devices. The lid switch is set
* to UNKNOWN by default to avoid surprising users.
*/
sc->acpi_power_button_sx = acpi_sleep_states[ACPI_STATE_S5] ?
ACPI_STATE_S5 : ACPI_STATE_UNKNOWN;
sc->acpi_lid_switch_sx = ACPI_STATE_UNKNOWN;
sc->acpi_standby_sx = acpi_sleep_states[ACPI_STATE_S1] ?
ACPI_STATE_S1 : ACPI_STATE_UNKNOWN;
sc->acpi_suspend_sx = acpi_sleep_states[ACPI_STATE_S3] ?
ACPI_STATE_S3 : ACPI_STATE_UNKNOWN;
/* Pick the first valid sleep state for the sleep button default. */
sc->acpi_sleep_button_sx = ACPI_STATE_UNKNOWN;
for (state = ACPI_STATE_S1; state <= ACPI_STATE_S4; state++)
if (acpi_sleep_states[state]) {
sc->acpi_sleep_button_sx = state;
break;
}
acpi_enable_fixed_events(sc);
/*
* Scan the namespace and attach/initialise children.
*/
/* Register our shutdown handler. */
EVENTHANDLER_REGISTER(shutdown_final, acpi_shutdown_final, sc,
SHUTDOWN_PRI_LAST);
/*
* Register our acpi event handlers.
* XXX should be configurable eg. via userland policy manager.
*/
EVENTHANDLER_REGISTER(acpi_sleep_event, acpi_system_eventhandler_sleep,
sc, ACPI_EVENT_PRI_LAST);
EVENTHANDLER_REGISTER(acpi_wakeup_event, acpi_system_eventhandler_wakeup,
sc, ACPI_EVENT_PRI_LAST);
/* Flag our initial states. */
sc->acpi_enabled = TRUE;
sc->acpi_sstate = ACPI_STATE_S0;
sc->acpi_sleep_disabled = TRUE;
/* Create the control device */
sc->acpi_dev_t = make_dev(&acpi_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0664,
"acpi");
sc->acpi_dev_t->si_drv1 = sc;
if ((error = acpi_machdep_init(dev)))
goto out;
/* Register ACPI again to pass the correct argument of pm_func. */
power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, sc);
/* Children are only probed when the "bus" component is not disabled. */
if (!acpi_disabled("bus")) {
EVENTHANDLER_REGISTER(dev_lookup, acpi_lookup, NULL, 1000);
acpi_probe_children(dev);
}
/* Update all GPEs and enable runtime GPEs. */
status = AcpiUpdateAllGpes();
/* A failure here is only reported; attach still succeeds. */
if (ACPI_FAILURE(status))
device_printf(dev, "Could not update all GPEs: %s\n",
AcpiFormatException(status));
/* Allow sleep request after a while. */
callout_init_mtx(&acpi_sleep_timer, &acpi_mutex, 0);
callout_reset(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME,
acpi_sleep_enable, sc);
error = 0;
out:
return_VALUE (error);
}
/*
 * Move every attached child of dev to the D-state appropriate for the
 * given target state.  The state is offered to
 * acpi_device_pwr_for_sleep(), which may adjust it (e.g. when _SxD is
 * present); the child's power state is only changed when that call
 * succeeds.  Children that are not attached are skipped because they are
 * handled separately.
 */
static void
acpi_set_power_children(device_t dev, int state)
{
    device_t *kids;
    int dstate, idx, nkids;

    if (device_get_children(dev, &kids, &nkids) != 0)
        return;

    for (idx = 0; idx < nkids; idx++) {
        if (!device_is_attached(kids[idx]))
            continue;

        /* Start from the requested state and let the lookup refine it. */
        dstate = state;
        if (acpi_device_pwr_for_sleep(dev, kids[idx], &dstate) != 0)
            continue;
        acpi_set_powerstate(kids[idx], dstate);
    }

    free(kids, M_TEMP);
}
/*
 * Suspend method: suspend all children through the generic bus code and,
 * only if that succeeded, move the attached children towards D3.
 * Returns the result of bus_generic_suspend().  Giant must be held.
 */
static int
acpi_suspend(device_t dev)
{
    int error;

    GIANT_REQUIRED;

    error = bus_generic_suspend(dev);
    if (error != 0)
        return (error);

    acpi_set_power_children(dev, ACPI_STATE_D3);
    return (0);
}
/*
* Resume method: move the attached children back towards D0 first, then
* resume them through the generic bus code and return its result.
* Giant must be held.
*/
static int
acpi_resume(device_t dev)
{
GIANT_REQUIRED;
acpi_set_power_children(dev, ACPI_STATE_D0);
return (bus_generic_resume(dev));
}
/*
* Shutdown method: shut the children down, then prepare the wake GPEs for
* S5.  Always returns 0; the results of both calls are ignored.  Giant
* must be held.
*/
static int
acpi_shutdown(device_t dev)
{
GIANT_REQUIRED;
/* Allow children to shutdown first. */
bus_generic_shutdown(dev);
/*
* Enable any GPEs that are able to power-on the system (i.e., RTC).
* Also, disable any that are not valid for this state (most).
*/
acpi_wake_prep_walk(ACPI_STATE_S5);
return (0);
}
/*
 * Handle a new device being added.
 *
 * Allocates a zeroed struct acpi_device to serve as the child's ivars,
 * initialises its resource list and adds the child to the bus in the
 * requested order.  Returns the new child, or NULL if either the
 * allocation or the device creation fails (the ivars are released in the
 * latter case).
 */
static device_t
acpi_add_child(device_t bus, u_int order, const char *name, int unit)
{
    struct acpi_device *ivars;
    device_t child;

    ivars = malloc(sizeof(*ivars), M_ACPIDEV, M_NOWAIT | M_ZERO);
    if (ivars == NULL)
        return (NULL);
    resource_list_init(&ivars->ad_rl);

    child = device_add_child_ordered(bus, order, name, unit);
    if (child == NULL) {
        /* Nobody owns the ivars yet, so give them back. */
        free(ivars, M_ACPIDEV);
        return (NULL);
    }

    device_set_ivars(child, ivars);
    return (child);
}
/*
 * Print the attach line for a child: the standard header, its port,
 * iomem, irq and drq resources, any non-zero device flags, the domain and
 * the standard footer.  Returns the sum of the values returned by the
 * individual print routines.
 */
static int
acpi_print_child(device_t bus, device_t child)
{
    struct acpi_device *ad;
    struct resource_list *res;
    int printed;

    ad = device_get_ivars(child);
    res = &ad->ad_rl;

    printed = bus_print_child_header(bus, child);
    printed += resource_list_print_type(res, "port", SYS_RES_IOPORT, "%#jx");
    printed += resource_list_print_type(res, "iomem", SYS_RES_MEMORY, "%#jx");
    printed += resource_list_print_type(res, "irq", SYS_RES_IRQ, "%jd");
    printed += resource_list_print_type(res, "drq", SYS_RES_DRQ, "%jd");
    if (device_get_flags(child) != 0)
        printed += printf(" flags %#x", device_get_flags(child));
    printed += bus_print_child_domain(bus, child);
    printed += bus_print_child_footer(bus, child);

    return (printed);
}
/*
* If this device is an ACPI child but no one claimed it, attempt
* to power it off. We'll power it back up when a driver is added.
*
* XXX Disabled for now since many necessary devices (like fdc and
* ATA) don't claim the devices we created for them but still expect
* them to be powered up.
*
* The body is only compiled when ACPI_ENABLE_POWERDOWN_NODRIVER is
* defined; otherwise this function does nothing.
*/
static void
acpi_probe_nomatch(device_t bus, device_t child)
{
#ifdef ACPI_ENABLE_POWERDOWN_NODRIVER
acpi_set_powerstate(child, ACPI_STATE_D3);
#endif
}
/*
 * A new driver was added to the bus: let it identify devices, then give
 * every child that is still in the DS_NOTPRESENT state a chance to probe
 * and attach.
 *
 * When ACPI_ENABLE_POWERDOWN_NODRIVER is defined, the child is powered up
 * to D0 before probing and put back into D3 if nothing attached.
 *
 * XXX That power handling is disabled for now (see acpi_probe_nomatch for
 * details).
 */
static void
acpi_driver_added(device_t dev, driver_t *driver)
{
    device_t *kids, kid;
    int idx, nkids;

    DEVICE_IDENTIFY(driver, dev);

    if (device_get_children(dev, &kids, &nkids))
        return;

    for (idx = 0; idx < nkids; idx++) {
        kid = kids[idx];
        if (device_get_state(kid) != DS_NOTPRESENT)
            continue;
#ifdef ACPI_ENABLE_POWERDOWN_NODRIVER
        acpi_set_powerstate(kid, ACPI_STATE_D0);
        if (device_probe_and_attach(kid) != 0)
            acpi_set_powerstate(kid, ACPI_STATE_D3);
#else
        device_probe_and_attach(kid);
#endif
    }

    free(kids, M_TEMP);
}
/*
 * Location hint for devctl(8).
 *
 * Writes "handle=<name>" into buf for children that have an ACPI handle,
 * followed by " _PXM=<n>" when the _PXM integer can be read.  Children
 * without a handle get an empty string.  Always returns 0.
 */
static int
acpi_child_location_str_method(device_t cbdev, device_t child, char *buf,
    size_t buflen)
{
    struct acpi_device *ad;
    char pxmstr[32];
    int pxm;

    ad = device_get_ivars(child);

    if (ad->ad_handle == NULL) {
        snprintf(buf, buflen, "");
        return (0);
    }

    snprintf(buf, buflen, "handle=%s", acpi_name(ad->ad_handle));
    if (ACPI_SUCCESS(acpi_GetInteger(ad->ad_handle, "_PXM", &pxm))) {
        snprintf(pxmstr, sizeof(pxmstr), " _PXM=%d", pxm);
        strlcat(buf, pxmstr, buflen);
    }

    return (0);
}
/*
 * PnP information for devctl(8).
 *
 * Writes "_HID=<id> _UID=<n> _CID=<id>" into buf.  An _HID or _CID that is
 * not valid is reported as "none", and a _UID that is not valid as 0.
 * Only the first compatible ID is reported.  If the object information
 * cannot be fetched, buf is set to "unknown".  Always returns 0.
 */
static int
acpi_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf,
    size_t buflen)
{
    struct acpi_device *ad;
    ACPI_DEVICE_INFO *info;
    const char *hid, *cid;
    unsigned long uid;

    ad = device_get_ivars(child);

    if (ACPI_FAILURE(AcpiGetObjectInfo(ad->ad_handle, &info))) {
        snprintf(buf, buflen, "unknown");
        return (0);
    }

    hid = "none";
    if (info->Valid & ACPI_VALID_HID)
        hid = info->HardwareId.String;

    /* The _UID string is parsed as a decimal number. */
    uid = 0UL;
    if (info->Valid & ACPI_VALID_UID)
        uid = strtoul(info->UniqueId.String, NULL, 10);

    cid = "none";
    if ((info->Valid & ACPI_VALID_CID) && info->CompatibleIdList.Count > 0)
        cid = info->CompatibleIdList.Ids[0].String;

    snprintf(buf, buflen, "_HID=%s _UID=%lu _CID=%s", hid, uid, cid);

    AcpiOsFree(info);
    return (0);
}
/*
* Handle per-device ivars
*
* Bus read_ivar method: stores the requested instance variable of child
* through result.  The value is written through a pointer cast to the
* ivar's own type, so only that many bytes of *result are stored.
* Returns 0 on success and ENOENT when the child has no ivars or the
* index is not handled here.
*/
static int
acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
{
struct acpi_device *ad;
if ((ad = device_get_ivars(child)) == NULL) {
device_printf(child, "device has no ivars\n");
return (ENOENT);
}
/* ACPI and ISA compatibility ivars */
switch(index) {
case ACPI_IVAR_HANDLE:
*(ACPI_HANDLE *)result = ad->ad_handle;
break;
case ACPI_IVAR_PRIVATE:
*(void **)result = ad->ad_private;
break;
case ACPI_IVAR_FLAGS:
*(int *)result = ad->ad_flags;
break;
/* These ISA ivars are always reported as -1. */
case ISA_IVAR_VENDORID:
case ISA_IVAR_SERIAL:
case ISA_IVAR_COMPATID:
*(int *)result = -1;
break;
case ISA_IVAR_LOGICALID:
*(int *)result = acpi_isa_get_logicalid(child);
break;
/* ad_cls_class holds class, subclass and progif in bits 23:16, 15:8, 7:0. */
case PCI_IVAR_CLASS:
*(uint8_t*)result = (ad->ad_cls_class >> 16) & 0xff;
break;
case PCI_IVAR_SUBCLASS:
*(uint8_t*)result = (ad->ad_cls_class >> 8) & 0xff;
break;
case PCI_IVAR_PROGIF:
*(uint8_t*)result = (ad->ad_cls_class >> 0) & 0xff;
break;
default:
return (ENOENT);
}
return (0);
}
/*
* Bus write_ivar method: updates the handle, private pointer or flags
* stored in the child's struct acpi_device.  Returns 0 on success and
* ENOENT when the child has no ivars.  Any other index is treated as a
* programming error and panics.
*/
static int
acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
{
struct acpi_device *ad;
if ((ad = device_get_ivars(child)) == NULL) {
device_printf(child, "device has no ivars\n");
return (ENOENT);
}
switch(index) {
case ACPI_IVAR_HANDLE:
ad->ad_handle = (ACPI_HANDLE)value;
break;
case ACPI_IVAR_PRIVATE:
ad->ad_private = (void *)value;
break;
case ACPI_IVAR_FLAGS:
ad->ad_flags = (int)value;
break;
default:
panic("bad ivar write request (%d)", index);
/* Presumably not reached, since panic() is not expected to return. */
return (ENOENT);
}
return (0);
}
/*
* Handle child resource allocation/removal
*/
/* Return the resource list embedded in the child's ACPI ivars. */
static struct resource_list *
acpi_get_rlist(device_t dev, device_t child)
{
    struct acpi_device *ivars = device_get_ivars(child);

    return (&ivars->ad_rl);
}
/*
 * Return 1 if the device has a resource of the given type whose
 * [start, end] range contains 'value', 0 otherwise.
 */
static int
acpi_match_resource_hint(device_t dev, int type, long value)
{
    struct acpi_device *ivars;
    struct resource_list_entry *entry;

    ivars = device_get_ivars(dev);
    STAILQ_FOREACH(entry, &ivars->ad_rl, link) {
        if (entry->type == type &&
            entry->start <= value && entry->end >= value)
            return (1);
    }
    return (0);
}
/*
 * Wire device unit numbers based on resource matches in hints.
 *
 * Walks every hint for devices called 'name' that is "at" acpi/acpi0 or
 * isa/isa0 and compares the hinted port, maddr, irq and drq values with
 * the resources of 'child'.  On the first hint that matches, *unitp is
 * set to that hint's unit; if nothing matches *unitp is left untouched.
 */
static void
acpi_hint_device_unit(device_t acdev, device_t child, const char *name,
int *unitp)
{
const char *s;
long value;
int line, matches, unit;
/*
* Iterate over all the hints for the devices with the specified
* name to see if one's resources are a subset of this device.
*/
line = 0;
while (resource_find_dev(&line, name, &unit, "at", NULL) == 0) {
/* Must have an "at" for acpi or isa. */
resource_string_value(name, unit, "at", &s);
if (!(strcmp(s, "acpi0") == 0 || strcmp(s, "acpi") == 0 ||
strcmp(s, "isa0") == 0 || strcmp(s, "isa") == 0))
continue;
/*
* Check for matching resources. We must have at least one match.
* Since I/O and memory resources cannot be shared, if we get a
* match on either of those, ignore any mismatches in IRQs or DRQs.
*
* XXX: We may want to revisit this to be more lenient and wire
* as long as it gets one match.
*/
matches = 0;
if (resource_long_value(name, unit, "port", &value) == 0) {
/*
* Floppy drive controllers are notorious for having a
* wide variety of resources not all of which include the
* first port that is specified by the hint (typically
* 0x3f0) (see the comment above fdc_isa_alloc_resources()
* in fdc_isa.c). However, they do all seem to include
* port + 2 (e.g. 0x3f2) so for a floppy device, look for
* 'value + 2' in the port resources instead of the hint
* value.
*/
if (strcmp(name, "fdc") == 0)
value += 2;
/* A hinted port that the device lacks disqualifies this hint. */
if (acpi_match_resource_hint(child, SYS_RES_IOPORT, value))
matches++;
else
continue;
}
if (resource_long_value(name, unit, "maddr", &value) == 0) {
if (acpi_match_resource_hint(child, SYS_RES_MEMORY, value))
matches++;
else
continue;
}
/* An I/O port or memory match is decisive; skip the IRQ/DRQ checks. */
if (matches > 0)
goto matched;
if (resource_long_value(name, unit, "irq", &value) == 0) {
if (acpi_match_resource_hint(child, SYS_RES_IRQ, value))
matches++;
else
continue;
}
if (resource_long_value(name, unit, "drq", &value) == 0) {
if (acpi_match_resource_hint(child, SYS_RES_DRQ, value))
matches++;
else
continue;
}
matched:
if (matches > 0) {
/* We have a winner! */
*unitp = unit;
break;
}
}
}
/*
 * Fetch the NUMA domain for a device by mapping the value returned by
 * _PXM to a NUMA domain.
 *
 * Returns the mapped domain on success, -2 if the device has no ACPI
 * handle or no _PXM method, and -1 on any other error.  Kernels built
 * without NUMA, or for architectures other than i386/amd64, always
 * get -1.
 */
static int
acpi_parse_pxm(device_t dev)
{
#if defined(NUMA) && (defined(__i386__) || defined(__amd64__))
    ACPI_HANDLE h;
    ACPI_STATUS rv;
    int pxm;

    h = acpi_get_handle(dev);
    if (h == NULL)
        return (-2);

    rv = acpi_GetInteger(h, "_PXM", &pxm);
    if (ACPI_SUCCESS(rv))
        return (acpi_map_pxm_to_vm_domainid(pxm));
    if (rv == AE_NOT_FOUND)
        return (-2);
#endif
    return (-1);
}
/*
 * Report the CPU set for 'child', narrowed to its NUMA domain when one
 * is known from _PXM.
 *
 * LOCAL_CPUS yields the domain's CPU set; INTR_CPUS yields the generic
 * answer intersected with the domain's CPU set.  Every other request,
 * and any device without a usable domain, is handled by
 * bus_generic_get_cpus().  EINVAL is returned if 'setsize' is not
 * sizeof(cpuset_t) for the two domain-aware operations.
 */
int
acpi_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
    cpuset_t *cpuset)
{
    int domain, rv;

    domain = acpi_parse_pxm(child);
    if (domain < 0 || (op != LOCAL_CPUS && op != INTR_CPUS))
        return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));

    if (op == LOCAL_CPUS) {
        if (setsize != sizeof(cpuset_t))
            return (EINVAL);
        *cpuset = cpuset_domain[domain];
        return (0);
    }

    /* INTR_CPUS: start from the generic answer, then mask by domain. */
    rv = bus_generic_get_cpus(dev, child, op, setsize, cpuset);
    if (rv != 0)
        return (rv);
    if (setsize != sizeof(cpuset_t))
        return (EINVAL);
    CPU_AND(cpuset, &cpuset_domain[domain]);
    return (0);
}
/*
 * Fetch the NUMA domain for the given device 'child'.
 *
 * A device with a _PXM method gets the domain that _PXM maps to.  A
 * device without one has the request passed up to the parent.  If the
 * domain cannot be determined, ENOENT is returned.
 */
int
acpi_get_domain(device_t dev, device_t child, int *domain)
{
    int pxm_domain;

    pxm_domain = acpi_parse_pxm(child);
    if (pxm_domain == -1)
        return (ENOENT);

    /* No _PXM node; go up a level. */
    if (pxm_domain < 0)
        return (bus_generic_get_domain(dev, child, domain));

    *domain = pxm_domain;
    return (0);
}
/*
 * Pre-allocate/manage all memory and IO resources.  Since rman can't
 * handle duplicates, we merge any in the sysresource attach routine.
 *
 * Returns ENXIO if the child list cannot be obtained, 0 otherwise;
 * individual reservation failures are only logged (when bootverbose).
 */
static int
acpi_sysres_alloc(device_t dev)
{
    struct resource_list_entry *entry;
    struct resource_list *list;
    struct resource *r;
    struct rman *pool;
    device_t *kids;
    int idx, nkids;

    /*
     * Probe/attach any sysresource devices.  This would be unnecessary
     * if we had multi-pass probe/attach.
     */
    if (device_get_children(dev, &kids, &nkids) != 0)
        return (ENXIO);
    for (idx = 0; idx < nkids; idx++) {
        if (ACPI_ID_PROBE(dev, kids[idx], sysres_ids, NULL) <= 0)
            device_probe_and_attach(kids[idx]);
    }
    free(kids, M_TEMP);

    list = BUS_GET_RESOURCE_LIST(device_get_parent(dev), dev);
    STAILQ_FOREACH(entry, list, link) {
        if (entry->res != NULL) {
            device_printf(dev, "duplicate resource for %jx\n", entry->start);
            continue;
        }

        /* Only memory and IO resources are valid here. */
        if (entry->type == SYS_RES_IOPORT)
            pool = &acpi_rman_io;
        else if (entry->type == SYS_RES_MEMORY)
            pool = &acpi_rman_mem;
        else
            continue;

        /* Pre-allocate resource and add to our rman pool. */
        r = BUS_ALLOC_RESOURCE(device_get_parent(dev), dev, entry->type,
            &entry->rid, entry->start, entry->start + entry->count - 1,
            entry->count, 0);
        if (r == NULL) {
            if (bootverbose)
                device_printf(dev, "reservation of %jx, %jx (%d) failed\n",
                    entry->start, entry->count, entry->type);
            continue;
        }
        rman_manage_region(pool, rman_get_start(r), rman_get_end(r));
        entry->res = r;
    }
    return (0);
}
/*
 * Reserve declared resources for devices found during attach once
 * system resources have been allocated.  Marks the softc so that later
 * acpi_set_resource() calls know reservations are in effect.
 */
static void
acpi_reserve_resources(device_t dev)
{
    struct acpi_softc *sc;
    struct acpi_device *ivars;
    struct resource_list *list;
    struct resource_list_entry *entry;
    device_t *kids;
    int idx, nkids;

    sc = device_get_softc(dev);
    if (device_get_children(dev, &kids, &nkids) != 0)
        return;

    for (idx = 0; idx < nkids; idx++) {
        /* Don't reserve system resources. */
        if (ACPI_ID_PROBE(dev, kids[idx], sysres_ids, NULL) <= 0)
            continue;

        ivars = device_get_ivars(kids[idx]);
        list = &ivars->ad_rl;
        STAILQ_FOREACH(entry, list, link) {
            /*
             * Skip IRQ resources: there are many sticky things to get
             * right otherwise (e.g. IRQs for psm, atkbd, and HPET when
             * using legacy routing).  Also skip anything already
             * allocated; the acpi_ec(4) driver can allocate its
             * resources early if ECDT is present.
             */
            if (entry->type == SYS_RES_IRQ || entry->res != NULL)
                continue;

            /*
             * Try to reserve the resource from our parent.  If this
             * fails because the resource is a system resource, just
             * let it be.  The resource range is already reserved so
             * that other devices will not use it.  If the driver needs
             * to allocate the resource, then acpi_alloc_resource()
             * will sub-alloc from the system resource.
             */
            resource_list_reserve(list, dev, kids[idx], entry->type,
                &entry->rid, entry->start, entry->end, entry->count, 0);
        }
    }
    free(kids, M_TEMP);
    sc->acpi_resources_reserved = 1;
}
/*
 * Record a resource (type, rid, start, count) in the child's resource
 * list and, once system resources have been allocated, try to reserve
 * it.
 *
 * Some requests are silently dropped (return 0 without recording
 * anything): IRQs for PCI link devices and most memory/I/O ranges on
 * PCI root bridges.  Returns EBUSY if the rid is already allocated and
 * 0 in every other case; a failed reservation is not reported.
 */
static int
acpi_set_resource(device_t dev, device_t child, int type, int rid,
rman_res_t start, rman_res_t count)
{
struct acpi_softc *sc = device_get_softc(dev);
struct acpi_device *ad = device_get_ivars(child);
struct resource_list *rl = &ad->ad_rl;
ACPI_DEVICE_INFO *devinfo;
rman_res_t end;
int allow;
/* Ignore IRQ resources for PCI link devices. */
if (type == SYS_RES_IRQ &&
ACPI_ID_PROBE(dev, child, pcilink_ids, NULL) <= 0)
return (0);
/*
* Ignore most resources for PCI root bridges. Some BIOSes
* incorrectly enumerate the memory ranges they decode as plain
* memory resources instead of as ResourceProducer ranges. Other
* BIOSes incorrectly list system resource entries for I/O ranges
* under the PCI bridge. Do allow the one known-correct case on
* x86 of a PCI bridge claiming the I/O ports used for PCI config
* access.
*/
if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
if (ACPI_SUCCESS(AcpiGetObjectInfo(ad->ad_handle, &devinfo))) {
if ((devinfo->Flags & ACPI_PCI_ROOT_BRIDGE) != 0) {
#if defined(__i386__) || defined(__amd64__)
allow = (type == SYS_RES_IOPORT && start == CONF1_ADDR_PORT);
#else
allow = 0;
#endif
if (!allow) {
/* devinfo must be released on this early-return path too. */
AcpiOsFree(devinfo);
return (0);
}
}
AcpiOsFree(devinfo);
}
}
#ifdef INTRNG
/* map with default for now */
if (type == SYS_RES_IRQ)
start = (rman_res_t)acpi_map_intr(child, (u_int)start,
acpi_get_handle(child));
#endif
/* If the resource is already allocated, fail. */
if (resource_list_busy(rl, type, rid))
return (EBUSY);
/* If the resource is already reserved, release it. */
if (resource_list_reserved(rl, type, rid))
resource_list_unreserve(rl, dev, child, type, rid);
/* Add the resource. */
end = (start + count - 1);
resource_list_add(rl, type, rid, start, end, count);
/* Don't reserve resources until the system resources are allocated. */
if (!sc->acpi_resources_reserved)
return (0);
/* Don't reserve system resources. */
if (ACPI_ID_PROBE(dev, child, sysres_ids, NULL) <= 0)
return (0);
/*
* Don't reserve IRQ resources. There are many sticky things to
* get right otherwise (e.g. IRQs for psm, atkbd, and HPET when
* using legacy routing).
*/
if (type == SYS_RES_IRQ)
return (0);
/*
* Don't reserve resources for CPU devices. Some of these
* resources need to be allocated as shareable, but reservations
* are always non-shareable.
*/
if (device_get_devclass(child) == devclass_find("cpu"))
return (0);
/*
* Reserve the resource.
*
* XXX: Ignores failure for now. Failure here is probably a
* BIOS/firmware bug?
*/
resource_list_reserve(rl, dev, child, type, &rid, start, end, count, 0);
return (0);
}
/*
 * Allocate a resource on behalf of 'child'.
 *
 * Direct children are served from their resource list (adding an entry
 * first when a specific range is requested for an unknown rid); other
 * devices have the request passed to our parent.  If that first attempt
 * fails and the request describes one exact range
 * (start + count - 1 == end), a sub-allocation from the system resource
 * regions is tried via acpi_alloc_sysres().  Returns the resource, or
 * NULL on failure.
 */
static struct resource *
acpi_alloc_resource(device_t bus, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
#ifndef INTRNG
ACPI_RESOURCE ares;
#endif
struct acpi_device *ad;
struct resource_list_entry *rle;
struct resource_list *rl;
struct resource *res;
int isdefault = RMAN_IS_DEFAULT_RANGE(start, end);
/*
* First attempt at allocating the resource. For direct children,
* use resource_list_alloc() to handle reserved resources. For
* other devices, pass the request up to our parent.
*/
if (bus == device_get_parent(child)) {
ad = device_get_ivars(child);
rl = &ad->ad_rl;
/*
* Simulate the behavior of the ISA bus for direct children
* devices. That is, if a non-default range is specified for
* a resource that doesn't exist, use bus_set_resource() to
* add the resource before allocating it. Note that these
* resources will not be reserved.
*/
if (!isdefault && resource_list_find(rl, type, *rid) == NULL)
resource_list_add(rl, type, *rid, start, end, count);
res = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
flags);
#ifndef INTRNG
if (res != NULL && type == SYS_RES_IRQ) {
/*
* Since bus_config_intr() takes immediate effect, we cannot
* configure the interrupt associated with a device when we
* parse the resources but have to defer it until a driver
* actually allocates the interrupt via bus_alloc_resource().
*
* XXX: Should we handle the lookup failing?
*/
if (ACPI_SUCCESS(acpi_lookup_irq_resource(child, *rid, res, &ares)))
acpi_config_intr(child, &ares);
}
#endif
/*
* If this is an allocation of the "default" range for a given
* RID, fetch the exact bounds for this resource from the
* resource list entry to try to allocate the range from the
* system resource regions.
*/
if (res == NULL && isdefault) {
rle = resource_list_find(rl, type, *rid);
if (rle != NULL) {
start = rle->start;
end = rle->end;
count = rle->count;
}
}
} else
res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid,
start, end, count, flags);
/*
* If the first attempt failed and this is an allocation of a
* specific range, try to satisfy the request via a suballocation
* from our system resource regions.
*/
if (res == NULL && start + count - 1 == end)
res = acpi_alloc_sysres(child, type, rid, start, end, count, flags);
return (res);
}
/*
 * Attempt to allocate a specific resource range from the system
 * resource ranges.  Only memory and I/O port system resources are
 * handled; any other type yields NULL.
 *
 * The range must be exact (start + count - 1 == end).  The region is
 * reserved inactive and, when RF_ACTIVE was requested, activated
 * afterwards; a failed activation releases the reservation and
 * returns NULL.
 */
struct resource *
acpi_alloc_sysres(device_t child, int type, int *rid, rman_res_t start,
    rman_res_t end, rman_res_t count, u_int flags)
{
    struct resource *r;
    struct rman *pool;

    if (type == SYS_RES_IOPORT)
        pool = &acpi_rman_io;
    else if (type == SYS_RES_MEMORY)
        pool = &acpi_rman_mem;
    else
        return (NULL);

    KASSERT(start + count - 1 == end, ("wildcard resource range"));
    r = rman_reserve_resource(pool, start, end, count, flags & ~RF_ACTIVE,
        child);
    if (r == NULL)
        return (NULL);
    rman_set_rid(r, *rid);

    /* If requested, activate the resource using the parent's method. */
    if ((flags & RF_ACTIVE) != 0 &&
        bus_activate_resource(child, type, *rid, r) != 0) {
        rman_release_resource(r);
        return (NULL);
    }
    return (r);
}
/*
 * Tell whether resource 'r' belongs to one of our own rman pools.
 * Only memory and I/O port resources are handled through rman; every
 * other type reports 0.
 */
static int
acpi_is_resource_managed(int type, struct resource *r)
{
    struct rman *pool;

    if (type == SYS_RES_IOPORT)
        pool = &acpi_rman_io;
    else if (type == SYS_RES_MEMORY)
        pool = &acpi_rman_mem;
    else
        return (0);
    return (rman_is_region_manager(r, pool));
}
/*
 * Adjust the bounds of resource 'r'.  Resources from our own rman
 * pools are adjusted directly; everything else goes through the
 * generic bus method.
 */
static int
acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r,
    rman_res_t start, rman_res_t end)
{
    if (!acpi_is_resource_managed(type, r))
        return (bus_generic_adjust_resource(bus, child, type, r, start,
            end));
    return (rman_adjust_resource(r, start, end));
}
/*
 * Release resource 'r' held by 'child'.
 *
 * A resource that belongs to one of our internal managers is
 * deactivated (if active) and released to the local pool; a failed
 * deactivation is returned and the resource is kept.  Any other
 * resource is released through the generic resource-list method.
 */
static int
acpi_release_resource(device_t bus, device_t child, int type, int rid,
    struct resource *r)
{
    int error;

    if (!acpi_is_resource_managed(type, r))
        return (bus_generic_rl_release_resource(bus, child, type, rid, r));

    if ((rman_get_flags(r) & RF_ACTIVE) != 0) {
        error = bus_deactivate_resource(child, type, rid, r);
        if (error != 0)
            return (error);
    }
    return (rman_release_resource(r));
}
/*
 * Remove the (type, rid) entry from the child's resource list,
 * dropping any reservation first.  If the child still has the resource
 * allocated, a message is logged and nothing is removed.
 */
static void
acpi_delete_resource(device_t bus, device_t child, int type, int rid)
{
    struct resource_list *list;

    list = acpi_get_rlist(bus, child);
    if (!resource_list_busy(list, type, rid)) {
        resource_list_unreserve(list, bus, child, type, rid);
        resource_list_delete(list, type, rid);
        return;
    }
    device_printf(bus, "delete_resource: Resource still owned by child"
        " (type=%d, rid=%d)\n", type, rid);
}
/*
 * Allocate an IO port or memory resource, given its GAS.
 *
 * On success *res holds the active resource, *type its resource type,
 * and 0 is returned.  Errors: EINVAL for a NULL argument or a zero
 * address/width, EOPNOTSUPP for any address space other than system
 * memory or system I/O, ENOMEM if the allocation fails (the resource
 * entry added for it is deleted again).
 *
 * Note that a BitWidth between 1 and 7 is rewritten to 8 in the
 * caller's GAS.
 */
int
acpi_bus_alloc_gas(device_t dev, int *type, int *rid, ACPI_GENERIC_ADDRESS *gas,
    struct resource **res, u_int flags)
{
    int res_type;

    if (type == NULL || rid == NULL || gas == NULL || res == NULL)
        return (EINVAL);

    /* We only support memory and IO spaces. */
    if (gas->SpaceId == ACPI_ADR_SPACE_SYSTEM_MEMORY)
        res_type = SYS_RES_MEMORY;
    else if (gas->SpaceId == ACPI_ADR_SPACE_SYSTEM_IO)
        res_type = SYS_RES_IOPORT;
    else
        return (EOPNOTSUPP);

    /*
     * If the register width is less than 8, assume the BIOS author
     * means it is a bit field and just allocate a byte.
     */
    if (gas->BitWidth != 0 && gas->BitWidth < 8)
        gas->BitWidth = 8;

    /* Validate the address after we're sure we support the space. */
    if (gas->Address == 0 || gas->BitWidth == 0)
        return (EINVAL);

    bus_set_resource(dev, res_type, *rid, gas->Address, gas->BitWidth / 8);
    *res = bus_alloc_resource_any(dev, res_type, rid, RF_ACTIVE | flags);
    if (*res == NULL) {
        bus_delete_resource(dev, res_type, *rid);
        return (ENOMEM);
    }
    *type = res_type;
    return (0);
}
/*
 * Return the ISA PnP logical id derived from the device's _HID.
 * Yields 0 when the device has no ACPI handle, its object info cannot
 * be fetched, or the _HID is absent or shorter than an EISA id string.
 */
static uint32_t
acpi_isa_get_logicalid(device_t dev)
{
    ACPI_DEVICE_INFO *info;
    ACPI_HANDLE handle;
    uint32_t id;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    /* Fetch and validate the HID. */
    handle = acpi_get_handle(dev);
    if (handle == NULL)
        return_VALUE (0);
    if (ACPI_FAILURE(AcpiGetObjectInfo(handle, &info)))
        return_VALUE (0);

    id = 0;
    if ((info->Valid & ACPI_VALID_HID) != 0 &&
        info->HardwareId.Length >= ACPI_EISAID_STRING_SIZE)
        id = PNP_EISAID(info->HardwareId.String);

    AcpiOsFree(info);
    return_VALUE (id);
}
/*
 * Collect up to 'count' ISA PnP ids from the device's _CID list into
 * 'cids'.  Only entries that are at least EISA-id sized and start with
 * "PNP" are converted.  Returns the number of ids stored (0 if the
 * device has no handle, no object info, or no valid _CID).
 */
static int
acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count)
{
    ACPI_DEVICE_INFO *info;
    ACPI_PNP_DEVICE_ID *cid;
    ACPI_HANDLE handle;
    int idx, found;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    /* Fetch and validate the CID */
    handle = acpi_get_handle(dev);
    if (handle == NULL)
        return_VALUE (0);
    if (ACPI_FAILURE(AcpiGetObjectInfo(handle, &info)))
        return_VALUE (0);
    if ((info->Valid & ACPI_VALID_CID) == 0) {
        AcpiOsFree(info);
        return_VALUE (0);
    }

    /* Never look past the end of the list the firmware gave us. */
    if (info->CompatibleIdList.Count < count)
        count = info->CompatibleIdList.Count;

    found = 0;
    for (idx = 0; idx < count; idx++) {
        cid = &info->CompatibleIdList.Ids[idx];
        if (cid->Length < ACPI_EISAID_STRING_SIZE)
            continue;
        if (strncmp(cid->String, "PNP", 3) != 0)
            continue;
        cids[found++] = PNP_EISAID(cid->String);
    }

    AcpiOsFree(info);
    return_VALUE (found);
}
/*
 * Match a device against a NULL-terminated array of ID strings.
 *
 * Only device and processor objects with an ACPI handle are eligible.
 * On the first id that matches, *match (if non-NULL) is set to that id
 * and BUS_PROBE_DEFAULT is returned for a _HID match, or
 * BUS_PROBE_LOW_PRIORITY otherwise.  ENXIO means no match.
 */
static int
acpi_device_id_probe(device_t bus, device_t dev, char **ids, char **match)
{
    ACPI_OBJECT_TYPE objtype;
    ACPI_HANDLE handle;
    char **idp;
    int how;

    handle = acpi_get_handle(dev);
    if (ids == NULL || handle == NULL)
        return (ENXIO);
    objtype = acpi_get_type(dev);
    if (objtype != ACPI_TYPE_DEVICE && objtype != ACPI_TYPE_PROCESSOR)
        return (ENXIO);

    /* Try to match one of the array of IDs with a HID or CID. */
    for (idp = ids; *idp != NULL; idp++) {
        how = acpi_MatchHid(handle, *idp);
        if (how == ACPI_MATCHHID_NOMATCH)
            continue;
        if (match != NULL)
            *match = *idp;
        if (how == ACPI_MATCHHID_HID)
            return (BUS_PROBE_DEFAULT);
        return (BUS_PROBE_LOW_PRIORITY);
    }
    return (ENXIO);
}
/*
 * Evaluate the ACPI object 'pathname' relative to 'dev', or relative
 * to the namespace root when 'dev' is NULL.  Returns AE_BAD_PARAMETER
 * if 'dev' has no ACPI handle, otherwise the AcpiEvaluateObject()
 * status.
 */
static ACPI_STATUS
acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname,
    ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret)
{
    ACPI_HANDLE scope;

    scope = ACPI_ROOT_OBJECT;
    if (dev != NULL) {
        scope = acpi_get_handle(dev);
        if (scope == NULL)
            return (AE_BAD_PARAMETER);
    }
    return (AcpiEvaluateObject(scope, pathname, parameters, ret));
}
/*
 * Look up the device power state to use for the pending sleep state.
 *
 * *dstate is overridden with the value of the device's _SxD object
 * (x being the softc's current sleep state) when that object exists;
 * a missing _SxD leaves *dstate alone and still returns 0.  ENXIO is
 * returned for a NULL 'dstate', a device without a handle, the legacy
 * devices listed below, or an evaluation error.
 */
int
acpi_device_pwr_for_sleep(device_t bus, device_t dev, int *dstate)
{
    struct acpi_softc *sc;
    ACPI_HANDLE h;
    ACPI_STATUS rv;
    char method[8];

    h = acpi_get_handle(dev);
    if (dstate == NULL || h == NULL)
        return (ENXIO);

    /*
     * XXX If we find these devices, don't try to power them down.
     * The serial and IRDA ports on my T23 hang the system when
     * set to D3 and it appears that such legacy devices may
     * need special handling in their drivers.
     */
    if (acpi_MatchHid(h, "PNP0500") ||
        acpi_MatchHid(h, "PNP0501") ||
        acpi_MatchHid(h, "PNP0502") ||
        acpi_MatchHid(h, "PNP0510") ||
        acpi_MatchHid(h, "PNP0511"))
        return (ENXIO);

    /*
     * Override next state with the value from _SxD, if present.
     * Note illegal _S0D is evaluated because some systems expect this.
     */
    sc = device_get_softc(bus);
    snprintf(method, sizeof(method), "_S%dD", sc->acpi_sstate);
    rv = acpi_GetInteger(h, method, dstate);
    if (ACPI_SUCCESS(rv) || rv == AE_NOT_FOUND)
        return (0);

    device_printf(dev, "failed to get %s on %s: %s\n", method,
        acpi_name(h), AcpiFormatException(rv));
    return (ENXIO);
}
/* Callback arg for our implementation of walking the namespace. */
struct acpi_device_scan_ctx {
acpi_scan_cb_t user_fn; /* caller's callback, invoked by acpi_device_scan_cb() */
void *arg; /* opaque argument passed through to user_fn */
ACPI_HANDLE parent; /* handle where the scan began; skipped by the callback */
};
/*
 * Namespace-walk callback used by acpi_device_scan_children().
 *
 * Filters out handles we avoid, the scan's starting handle, and nodes
 * that are not device/processor/thermal/power objects, then hands the
 * node to the user's callback.  If the callback swaps the device_t
 * associated with the handle, the old child is deleted and the
 * handle <-> device association is rebuilt.
 */
static ACPI_STATUS
acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *arg, void **retval)
{
    struct acpi_device_scan_ctx *ctx = arg;
    ACPI_OBJECT_TYPE objtype;
    ACPI_STATUS rv;
    device_t before, after;

    /*
     * Skip this device if we think we'll have trouble with it or it is
     * the parent where the scan began.
     */
    if (acpi_avoid(h) || h == ctx->parent)
        return (AE_OK);

    /* If this is not a valid device type (e.g., a method), skip it. */
    if (ACPI_FAILURE(AcpiGetType(h, &objtype)))
        return (AE_OK);
    switch (objtype) {
    case ACPI_TYPE_DEVICE:
    case ACPI_TYPE_PROCESSOR:
    case ACPI_TYPE_THERMAL:
    case ACPI_TYPE_POWER:
        break;
    default:
        return (AE_OK);
    }

    /*
     * Call the user function with the current device.  If it is
     * unchanged afterwards (or the call failed), we are done.
     */
    before = acpi_get_device(h);
    after = before;
    rv = ctx->user_fn(h, &after, level, ctx->arg);
    if (ACPI_FAILURE(rv) || after == before)
        return (rv);

    /* Remove the old child and its connection to the handle. */
    if (before != NULL) {
        device_delete_child(device_get_parent(before), before);
        AcpiDetachData(h, acpi_fake_objhandler);
    }

    /* Recreate the handle association if the user created a device. */
    if (after != NULL)
        AcpiAttachData(h, acpi_fake_objhandler, after);

    return (AE_OK);
}
/*
 * Walk the ACPI namespace below 'dev' (or below the root when 'dev' is
 * NULL) to at most 'max_depth' levels, calling 'user_fn' for each
 * eligible node.  Does nothing and returns AE_OK when "children" is
 * disabled; returns AE_BAD_PARAMETER if 'dev' has no ACPI handle.
 */
static ACPI_STATUS
acpi_device_scan_children(device_t bus, device_t dev, int max_depth,
    acpi_scan_cb_t user_fn, void *arg)
{
    struct acpi_device_scan_ctx ctx;
    ACPI_HANDLE start;

    if (acpi_disabled("children"))
        return (AE_OK);

    start = ACPI_ROOT_OBJECT;
    if (dev != NULL) {
        start = acpi_get_handle(dev);
        if (start == NULL)
            return (AE_BAD_PARAMETER);
    }

    ctx.parent = start;
    ctx.user_fn = user_fn;
    ctx.arg = arg;
    return (AcpiWalkNamespace(ACPI_TYPE_ANY, start, max_depth,
        acpi_device_scan_cb, NULL, &ctx, NULL));
}
/*
 * Even though ACPI devices are not PCI, we use the PCI approach for
 * setting device power states since it's close enough to ACPI.
 *
 * Returns EINVAL for a state outside D0..ACPI_D_STATES_MAX and 0
 * otherwise; a failure to switch the state is only logged.
 */
static int
acpi_set_powerstate(device_t child, int state)
{
    ACPI_HANDLE handle;
    ACPI_STATUS rv;

    handle = acpi_get_handle(child);
    if (state < ACPI_STATE_D0 || state > ACPI_D_STATES_MAX)
        return (EINVAL);
    if (handle == NULL)
        return (0);

    /* Ignore errors if the power methods aren't present. */
    rv = acpi_pwr_switch_consumer(handle, state);
    if (ACPI_FAILURE(rv)) {
        if (rv != AE_NOT_FOUND)
            device_printf(child,
                "failed to set ACPI power state D%d on %s: %s\n", state,
                acpi_name(handle), AcpiFormatException(rv));
    } else if (bootverbose) {
        device_printf(child, "set ACPI power state D%d on %s\n",
            state, acpi_name(handle));
    }
    return (0);
}
/*
 * Match 'child' against a zero-terminated table of ISA PnP ids, using
 * both its logical id (_HID) and up to eight compatible ids (_CID).
 * On a match the table entry's description, if any, becomes the device
 * description and 0 is returned; otherwise ENXIO.
 */
static int
acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids)
{
    uint32_t lid, cids[8];
    int i, ncids, matched;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    /* Scan the supplied IDs for a match */
    lid = acpi_isa_get_logicalid(child);
    ncids = acpi_isa_get_compatid(child, cids, 8);
    for (; ids != NULL && ids->ip_id != 0; ids++) {
        matched = (lid == ids->ip_id);
        for (i = 0; !matched && i < ncids; i++)
            matched = (cids[i] == ids->ip_id);
        if (!matched)
            continue;
        if (ids->ip_desc)
            device_set_desc(child, ids->ip_desc);
        return_VALUE (0);
    }

    /*
     * ISA-style drivers attached to ACPI may persist and
     * probe manually if we return ENOENT.  We never want
     * that to happen, so don't ever return it.
     */
    return_VALUE (ENXIO);
}
/*
 * Look for a MCFG table.  If it is present, use the settings for
 * domain (segment) 0 to setup PCI config space access via the memory
 * map.
 *
 * On non-x86 architectures (arm64 for now), this will be done from the
 * PCI host bridge driver, so this function is empty there.
 */
static void
acpi_enable_pcie(void)
{
#if defined(__i386__) || defined(__amd64__)
    ACPI_TABLE_HEADER *hdr;
    ACPI_MCFG_ALLOCATION *entry, *limit;

    if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_MCFG, 1, &hdr)))
        return;

    /* Allocation entries follow the fixed MCFG header up to hdr->Length. */
    limit = (ACPI_MCFG_ALLOCATION *)((char *)hdr + hdr->Length);
    for (entry = (ACPI_MCFG_ALLOCATION *)((ACPI_TABLE_MCFG *)hdr + 1);
        entry < limit; entry++) {
        if (entry->PciSegment != 0)
            continue;
        pcie_cfgregopen(entry->Address, entry->StartBusNumber,
            entry->EndBusNumber);
        return;
    }
#endif
}
/*
* Scan all of the ACPI namespace and attach child devices.
*
* We should only expect to find devices in the \_PR, \_TZ, \_SI, and
* \_SB scopes, and \_PR and \_TZ became obsolete in the ACPI 2.0 spec.
* However, in violation of the spec, some systems place their PCI link
* devices in \, so we have to walk the whole namespace. We check the
* type of namespace nodes, so this should be ok.
*
* The steps below run in a fixed order: namespace scan, system resource
* pre-allocation, reservation of the children's resources, identify
* routines, generic attach, and finally the wake sysctls.
*/
static void
acpi_probe_children(device_t bus)
{
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
/*
* Scan the namespace and insert placeholders for all the devices that
* we find. We also probe/attach any early devices.
*
* Note that we use AcpiWalkNamespace rather than AcpiGetDevices because
* we want to create nodes for all devices, not just those that are
* currently present. (This assumes that we don't want to create/remove
* devices as they appear, which might be smarter.)
*/
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "namespace scan\n"));
/* The walk is limited to 100 levels; acpi_probe_child() receives 'bus'. */
AcpiWalkNamespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, 100, acpi_probe_child,
NULL, bus, NULL);
/* Pre-allocate resources for our rman from any sysresource devices. */
acpi_sysres_alloc(bus);
/* Reserve resources already allocated to children. */
acpi_reserve_resources(bus);
/* Create any static children by calling device identify methods. */
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "device identify routines\n"));
bus_generic_probe(bus);
/* Probe/attach all children, created statically and from the namespace. */
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "acpi bus_generic_attach\n"));
bus_generic_attach(bus);
/* Attach wake sysctls. */
acpi_wake_sysctl_walk(bus);
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "done attaching children\n"));
return_VOID;
}
/*
 * Determine the probe order for a given device.
 *
 * '*order' is overwritten only for the special classes listed below;
 * for every other object the caller's default is left untouched.
 *
 *   0. CPUs
 *   1. I/O port and memory system resource holders
 *   2. Clocks and timers (to handle early accesses)
 *   3. Embedded controllers (to handle early accesses)
 *   4. PCI Link Devices
 */
static void
acpi_probe_order(ACPI_HANDLE handle, int *order)
{
    ACPI_OBJECT_TYPE type;

    /*
     * Only trust 'type' when AcpiGetType() succeeded; on failure it is
     * left uninitialized, so fall through to the HID checks instead of
     * comparing an indeterminate value.
     */
    if (ACPI_SUCCESS(AcpiGetType(handle, &type)) &&
        type == ACPI_TYPE_PROCESSOR)
        *order = 0;
    else if (acpi_MatchHid(handle, "PNP0C01") ||
        acpi_MatchHid(handle, "PNP0C02"))
        *order = 1;
    else if (acpi_MatchHid(handle, "PNP0100") ||
        acpi_MatchHid(handle, "PNP0103") ||
        acpi_MatchHid(handle, "PNP0B00"))
        *order = 2;
    else if (acpi_MatchHid(handle, "PNP0C09"))
        *order = 3;
    else if (acpi_MatchHid(handle, "PNP0C0F"))
        *order = 4;
}
/*
* Evaluate a child device and determine whether we might attach a device to
* it.
*
* Namespace-walk callback: 'context' is the ACPI bus device_t and 'level'
* is the node's depth, which feeds into the child's probe order. Always
* returns AE_OK.
*/
static ACPI_STATUS
acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status)
{
ACPI_DEVICE_INFO *devinfo;
struct acpi_device *ad;
struct acpi_prw_data prw;
ACPI_OBJECT_TYPE type;
ACPI_HANDLE h;
device_t bus, child;
char *handle_str;
int order;
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
if (acpi_disabled("children"))
return_ACPI_STATUS (AE_OK);
/* Skip this device if we think we'll have trouble with it. */
if (acpi_avoid(handle))
return_ACPI_STATUS (AE_OK);
bus = (device_t)context;
if (ACPI_SUCCESS(AcpiGetType(handle, &type))) {
handle_str = acpi_name(handle);
switch (type) {
case ACPI_TYPE_DEVICE:
/*
* Since we scan from \, be sure to skip system scope objects.
* \_SB_ and \_TZ_ are defined in ACPICA as devices to work around
* BIOS bugs. For example, \_SB_ is to allow \_SB_._INI to be run
* during the initialization and \_TZ_ is to support Notify() on it.
*/
if (strcmp(handle_str, "\\_SB_") == 0 ||
strcmp(handle_str, "\\_TZ_") == 0)
break;
/* Wake GPE setup happens even for devices without a _HID/_CID. */
if (acpi_parse_prw(handle, &prw) == 0)
AcpiSetupGpeForWake(handle, prw.gpe_handle, prw.gpe_bit);
/*
* Ignore devices that do not have a _HID or _CID. They should
* be discovered by other buses (e.g. the PCI bus driver).
*/
if (!acpi_has_hid(handle))
break;
/* FALLTHROUGH */
case ACPI_TYPE_PROCESSOR:
case ACPI_TYPE_THERMAL:
case ACPI_TYPE_POWER:
/*
* Create a placeholder device for this node. Sort the
* placeholder so that the probe/attach passes will run
* breadth-first. Orders less than ACPI_DEV_BASE_ORDER
* are reserved for special objects (i.e., system
* resources).
*/
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "scanning '%s'\n", handle_str));
order = level * 10 + ACPI_DEV_BASE_ORDER;
acpi_probe_order(handle, &order);
child = BUS_ADD_CHILD(bus, order, NULL, -1);
if (child == NULL)
break;
/* Associate the handle with the device_t and vice versa. */
acpi_set_handle(child, handle);
AcpiAttachData(handle, acpi_fake_objhandler, child);
/*
* Check that the device is present. If it's not present,
* leave it disabled (so that we have a device_t attached to
* the handle, but we don't probe it).
*
* XXX PCI link devices sometimes report "present" but not
* "functional" (i.e. if disabled). Go ahead and probe them
* anyway since we may enable them later.
*/
if (type == ACPI_TYPE_DEVICE && !acpi_DeviceIsPresent(child)) {
/* Never disable PCI link devices. */
if (acpi_MatchHid(handle, "PNP0C0F"))
break;
/*
* Docking stations should remain enabled since the system
* may be undocked at boot.
*/
if (ACPI_SUCCESS(AcpiGetHandle(handle, "_DCK", &h)))
break;
device_disable(child);
break;
}
/*
* Get the device's resource settings and attach them.
* Note that if the device has _PRS but no _CRS, we need
* to decide when it's appropriate to try to configure the
* device. Ignore the return value here; it's OK for the
* device not to have any resources.
*/
acpi_parse_resources(child, handle, &acpi_res_parse_set, NULL);
ad = device_get_ivars(child);
/* Default class code; replaced below if a valid _CLS is reported. */
ad->ad_cls_class = 0xffffff;
if (ACPI_SUCCESS(AcpiGetObjectInfo(handle, &devinfo))) {
if ((devinfo->Valid & ACPI_VALID_CLS) != 0 &&
devinfo->ClassCode.Length >= ACPI_PCICLS_STRING_SIZE) {
ad->ad_cls_class = strtoul(devinfo->ClassCode.String,
NULL, 16);
}
AcpiOsFree(devinfo);
}
break;
}
}
return_ACPI_STATUS (AE_OK);
}
/*
 * AcpiAttachData() requires an object handler but never uses it.  This is a
 * placeholder object handler so we can store a device_t in an ACPI_HANDLE.
 * It intentionally does nothing with either argument.
 */
void
acpi_fake_objhandler(ACPI_HANDLE h, void *data)
{
}
/*
 * Final shutdown handler; 'arg' is the ACPI softc and 'howto' the
 * reboot flags.
 *
 * RB_POWEROFF: enter sleep state S5 with interrupts disabled.
 * Otherwise, unless RB_HALT is set, and if the softc allows it: reset
 * the machine through AcpiReset().
 * Otherwise, if the user asked for it and the kernel has not panicked:
 * terminate ACPI.
 * Every failure is reported with device_printf() and the function returns.
 */
static void
acpi_shutdown_final(void *arg, int howto)
{
struct acpi_softc *sc = (struct acpi_softc *)arg;
register_t intr;
ACPI_STATUS status;
/*
* XXX Shutdown code should only run on the BSP (cpuid 0).
* Some chipsets do not power off the system correctly if called from
* an AP.
*/
if ((howto & RB_POWEROFF) != 0) {
status = AcpiEnterSleepStatePrep(ACPI_STATE_S5);
if (ACPI_FAILURE(status)) {
device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n",
AcpiFormatException(status));
return;
}
device_printf(sc->acpi_dev, "Powering system off\n");
intr = intr_disable();
status = AcpiEnterSleepState(ACPI_STATE_S5);
if (ACPI_FAILURE(status)) {
intr_restore(intr);
device_printf(sc->acpi_dev, "power-off failed - %s\n",
AcpiFormatException(status));
} else {
/* Wait for the power-off to take effect before declaring a timeout. */
DELAY(1000000);
intr_restore(intr);
device_printf(sc->acpi_dev, "power-off failed - timeout\n");
}
} else if ((howto & RB_HALT) == 0 && sc->acpi_handle_reboot) {
/* Reboot using the reset register. */
status = AcpiReset();
if (ACPI_SUCCESS(status)) {
DELAY(1000000);
device_printf(sc->acpi_dev, "reset failed - timeout\n");
} else if (status != AE_NOT_EXIST)
device_printf(sc->acpi_dev, "reset failed - %s\n",
AcpiFormatException(status));
- } else if (sc->acpi_do_disable && panicstr == NULL) {
+ } else if (sc->acpi_do_disable && !KERNEL_PANICKED()) {
/*
* Only disable ACPI if the user requested. On some systems, writing
* the disable value to SMI_CMD hangs the system.
*/
device_printf(sc->acpi_dev, "Shutting down\n");
AcpiTerminate();
}
}
/*
 * Enable and clear the fixed power/sleep button events and install
 * their handlers.  A button is handled here only when its FADT flag is
 * clear.  The "(fixed)" banner for each button is printed on the first
 * call only.
 */
static void
acpi_enable_fixed_events(struct acpi_softc *sc)
{
    static int first_time = 1;

    if (!(AcpiGbl_FADT.Flags & ACPI_FADT_POWER_BUTTON)) {
        AcpiClearEvent(ACPI_EVENT_POWER_BUTTON);
        AcpiInstallFixedEventHandler(ACPI_EVENT_POWER_BUTTON,
            acpi_event_power_button_sleep, sc);
        if (first_time)
            device_printf(sc->acpi_dev, "Power Button (fixed)\n");
    }

    if (!(AcpiGbl_FADT.Flags & ACPI_FADT_SLEEP_BUTTON)) {
        AcpiClearEvent(ACPI_EVENT_SLEEP_BUTTON);
        AcpiInstallFixedEventHandler(ACPI_EVENT_SLEEP_BUTTON,
            acpi_event_sleep_button_sleep, sc);
        if (first_time)
            device_printf(sc->acpi_dev, "Sleep Button (fixed)\n");
    }

    first_time = 0;
}
/*
 * Report whether a device is actually present and should be attached to.
 * This requires the present, enabled, UI-visible and diagnostics-passed
 * bits of _STA to be set.
 *
 * Returns FALSE when the device has no ACPI handle, TRUE when _STA is
 * missing or fails to evaluate, and otherwise the result of
 * ACPI_DEVICE_PRESENT() on the _STA value.
 */
BOOLEAN
acpi_DeviceIsPresent(device_t dev)
{
    ACPI_HANDLE handle;
    UINT32 sta;

    handle = acpi_get_handle(dev);
    if (handle == NULL)
        return (FALSE);

    /*
     * Certain Threadripper boards only report a non-zero _STA for the OS
     * string "Windows 2015" and return zero otherwise, so FreeBSD always
     * sees 0.  Beta versions were worse: they always returned 0.  Force
     * these devices to be treated as present.
     */
    if (acpi_MatchHid(handle, "AMDI0020") || acpi_MatchHid(handle, "AMDI0010"))
        return (TRUE);

    /* No _STA method, or a failed evaluation, means "assume present". */
    if (ACPI_FAILURE(acpi_GetInteger(handle, "_STA", &sta)))
        return (TRUE);

    if (ACPI_DEVICE_PRESENT(sta))
        return (TRUE);
    return (FALSE);
}
/*
 * Report whether a battery is actually present and inserted.
 *
 * Returns FALSE when the device has no ACPI handle, TRUE when _STA is
 * missing or fails to evaluate, and otherwise the result of
 * ACPI_BATTERY_PRESENT() on the _STA value.
 */
BOOLEAN
acpi_BatteryIsPresent(device_t dev)
{
    ACPI_HANDLE handle;
    UINT32 sta;

    handle = acpi_get_handle(dev);
    if (handle == NULL)
        return (FALSE);

    /* No _STA method, or a failed evaluation, means "assume present". */
    if (ACPI_FAILURE(acpi_GetInteger(handle, "_STA", &sta)))
        return (TRUE);

    if (ACPI_BATTERY_PRESENT(sta))
        return (TRUE);
    return (FALSE);
}
/*
 * Tell whether the object behind handle h carries at least one usable
 * device ID: either a valid _HID, or a valid _CID list with at least one
 * entry.  A NULL handle or a failed AcpiGetObjectInfo() yields FALSE.
 */
static BOOLEAN
acpi_has_hid(ACPI_HANDLE h)
{
    ACPI_DEVICE_INFO *info;
    BOOLEAN found;

    if (h == NULL)
        return (FALSE);
    if (ACPI_FAILURE(AcpiGetObjectInfo(h, &info)))
        return (FALSE);

    if ((info->Valid & ACPI_VALID_HID) != 0) {
        found = TRUE;
    } else if ((info->Valid & ACPI_VALID_CID) != 0 &&
        info->CompatibleIdList.Count > 0) {
        found = TRUE;
    } else {
        found = FALSE;
    }

    /* The info block was handed to us by AcpiGetObjectInfo(); release it. */
    AcpiOsFree(info);
    return (found);
}
/*
 * Match a HID string against a handle.
 *
 * Returns:
 *   ACPI_MATCHHID_HID      if hid equals the object's _HID,
 *   ACPI_MATCHHID_CID      if hid equals one of the object's _CID entries
 *                          (and the _HID did not match),
 *   ACPI_MATCHHID_NOMATCH  (0) if nothing matched, if h or hid is NULL, or
 *                          if the object information could not be fetched.
 *
 * Note that when the _HID is valid but different from hid, the _CID list
 * is only consulted if ACPI_VALID_HID is not the reason the first test
 * failed; this mirrors the if/else-if structure below.
 */
int
acpi_MatchHid(ACPI_HANDLE h, const char *hid)
{
    ACPI_DEVICE_INFO *devinfo;
    UINT32 i;
    int ret;	/* holds an ACPI_MATCHHID_* code, so it must be an int */

    if (hid == NULL || h == NULL ||
        ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo)))
        return (ACPI_MATCHHID_NOMATCH);

    ret = ACPI_MATCHHID_NOMATCH;
    if ((devinfo->Valid & ACPI_VALID_HID) != 0 &&
        strcmp(hid, devinfo->HardwareId.String) == 0) {
        ret = ACPI_MATCHHID_HID;
    } else if ((devinfo->Valid & ACPI_VALID_CID) != 0) {
        for (i = 0; i < devinfo->CompatibleIdList.Count; i++) {
            if (strcmp(hid, devinfo->CompatibleIdList.Ids[i].String) == 0) {
                ret = ACPI_MATCHHID_CID;
                break;
            }
        }
    }

    /* The info block was handed to us by AcpiGetObjectInfo(); release it. */
    AcpiOsFree(devinfo);
    return (ret);
}
/*
 * Return the handle of a named object within our scope, i.e. that of
 * (parent) or one of its parents.
 *
 * The lookup starts at parent and moves one level up each time the name is
 * reported as AE_NOT_FOUND.  On a hit *result is set and AE_OK returned.
 * AE_NOT_FOUND is returned once there is no further parent to try.
 *
 * NOTE: any lookup error other than AE_NOT_FOUND also yields AE_OK, in
 * which case *result is left untouched.
 */
ACPI_STATUS
acpi_GetHandleInScope(ACPI_HANDLE parent, char *path, ACPI_HANDLE *result)
{
    ACPI_HANDLE found, upper;
    ACPI_STATUS rv;

    /* Climb towards the root until the name resolves. */
    for (;;) {
        rv = AcpiGetHandle(parent, path, &found);
        if (!ACPI_FAILURE(rv)) {
            *result = found;
            return (AE_OK);
        }

        /* XXX Return error here? */
        if (rv != AE_NOT_FOUND)
            return (AE_OK);

        if (ACPI_FAILURE(AcpiGetParent(parent, &upper)))
            return (AE_NOT_FOUND);
        parent = upper;
    }
}
/*
 * Allocate an ACPI_BUFFER header together with size bytes of payload in a
 * single M_ACPIDEV allocation.  Length is preset to size and Pointer is
 * aimed at the bytes that directly follow the header.
 *
 * Returns NULL if the (M_NOWAIT) allocation fails.
 */
ACPI_BUFFER *
acpi_AllocBuffer(int size)
{
    ACPI_BUFFER *hdr;

    hdr = malloc(size + sizeof(*hdr), M_ACPIDEV, M_NOWAIT);
    if (hdr != NULL) {
        hdr->Length = size;
        hdr->Pointer = (void *)(hdr + 1);
    }
    return (hdr);
}
/*
 * Evaluate the object at path (relative to handle), passing number as its
 * single Integer argument.  Any return value of the object is discarded;
 * the status of AcpiEvaluateObject() is returned.
 */
ACPI_STATUS
acpi_SetInteger(ACPI_HANDLE handle, char *path, UINT32 number)
{
    ACPI_OBJECT value;
    ACPI_OBJECT_LIST arglist;
    ACPI_STATUS rv;

    value.Type = ACPI_TYPE_INTEGER;
    value.Integer.Value = number;

    arglist.Pointer = &value;
    arglist.Count = 1;

    rv = AcpiEvaluateObject(handle, path, &arglist, NULL);
    return (rv);
}
/*
 * Evaluate a path that should return an integer.
 *
 * handle:  scope to evaluate in; NULL means ACPI_ROOT_OBJECT.
 * path:    object or method name to evaluate.
 * number:  receives the value on success.
 *
 * Returns AE_OK on success, AE_TYPE if the result is not an Integer (and
 * could not be converted), AE_NO_MEMORY if the fallback buffer could not
 * be allocated, or whatever error AcpiEvaluateObject() reported.
 */
ACPI_STATUS
acpi_GetInteger(ACPI_HANDLE handle, char *path, UINT32 *number)
{
    ACPI_STATUS status;
    ACPI_BUFFER buf;
    ACPI_OBJECT param;

    if (handle == NULL)
        handle = ACPI_ROOT_OBJECT;

    /*
     * Assume that what we've been pointed at is an Integer object, or
     * a method that will return an Integer.  The result is written
     * straight into the on-stack object.
     */
    buf.Pointer = &param;
    buf.Length = sizeof(param);
    status = AcpiEvaluateObject(handle, path, NULL, &buf);
    if (ACPI_SUCCESS(status)) {
        if (param.Type == ACPI_TYPE_INTEGER)
            *number = param.Integer.Value;
        else
            status = AE_TYPE;
    }

    /*
     * In some applications, a method that's expected to return an Integer
     * may instead return a Buffer (probably to simplify some internal
     * arithmetic).  We'll try to fetch whatever it is, and if it's a Buffer,
     * convert it into an Integer as best we can.
     *
     * This is a hack.
     */
    if (status == AE_BUFFER_OVERFLOW) {
        /* buf.Length now holds the size the first evaluation asked for. */
        if ((buf.Pointer = AcpiOsAllocate(buf.Length)) == NULL) {
            status = AE_NO_MEMORY;
        } else {
            status = AcpiEvaluateObject(handle, path, NULL, &buf);
            if (ACPI_SUCCESS(status))
                status = acpi_ConvertBufferToInteger(&buf, number);
            AcpiOsFree(buf.Pointer);
        }
    }
    return (status);
}
/*
 * Convert the ACPI_OBJECT held in bufp into a 32-bit integer.
 *
 * An Integer object is copied as-is.  A Buffer object of at most
 * sizeof(*number) bytes is assembled with byte 0 as the least significant
 * byte.
 *
 * Returns AE_OK on success, AE_TYPE if the object is neither an Integer
 * nor a Buffer, and AE_BAD_DATA if the Buffer is too long to fit.
 */
ACPI_STATUS
acpi_ConvertBufferToInteger(ACPI_BUFFER *bufp, UINT32 *number)
{
    ACPI_OBJECT *p;
    UINT8 *val;
    UINT32 i;

    p = (ACPI_OBJECT *)bufp->Pointer;
    if (p->Type == ACPI_TYPE_INTEGER) {
        *number = p->Integer.Value;
        return (AE_OK);
    }
    if (p->Type != ACPI_TYPE_BUFFER)
        return (AE_TYPE);
    if (p->Buffer.Length > sizeof(*number))
        return (AE_BAD_DATA);

    *number = 0;
    val = p->Buffer.Pointer;
    for (i = 0; i < p->Buffer.Length; i++) {
        /*
         * Widen before shifting: a UINT8 promotes to (signed) int, and
         * shifting a value >= 0x80 left by 24 would overflow it.
         */
        *number += (UINT32)val[i] << (i * 8);
    }
    return (AE_OK);
}
/*
 * Iterate over the elements of a package object, calling the supplied
 * function once for each element, in order, with arg passed through.
 *
 * Returns AE_BAD_PARAMETER if pkg is NULL or not a Package, AE_OK
 * otherwise.
 *
 * XXX possible enhancement might be to abort traversal on error.
 */
ACPI_STATUS
acpi_ForeachPackageObject(ACPI_OBJECT *pkg,
    void (*func)(ACPI_OBJECT *comp, void *arg), void *arg)
{
    ACPI_OBJECT *elem;
    int idx;

    if (pkg == NULL)
        return (AE_BAD_PARAMETER);
    if (pkg->Type != ACPI_TYPE_PACKAGE)
        return (AE_BAD_PARAMETER);

    /* Visit every component of the package. */
    elem = pkg->Package.Elements;
    idx = 0;
    while (idx < pkg->Package.Count) {
        func(elem, arg);
        idx++;
        elem++;
    }
    return (AE_OK);
}
/*
 * Find the (index)th resource object in a set.
 *
 * buf:    buffer holding a list of ACPI_RESOURCE entries.
 * index:  zero-based position of the wanted entry.
 * resp:   if non-NULL, receives a pointer to the entry.
 *
 * Returns AE_OK on success, AE_NOT_FOUND if the end tag (or a zero-length
 * entry) is reached first, and AE_BAD_PARAMETER if the walk would leave
 * the buffer.
 */
ACPI_STATUS
acpi_FindIndexedResource(ACPI_BUFFER *buf, int index, ACPI_RESOURCE **resp)
{
    ACPI_RESOURCE *rp;
    u_int8_t *end;
    int i;

    rp = (ACPI_RESOURCE *)buf->Pointer;
    end = (u_int8_t *)buf->Pointer + buf->Length;
    for (i = index; i > 0; i--) {
        /*
         * Range check.  "end" is one past the last valid byte, so an
         * entry starting exactly there must not be dereferenced.
         */
        if ((u_int8_t *)rp >= end)
            return (AE_BAD_PARAMETER);

        /* Check for terminator */
        if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0)
            return (AE_NOT_FOUND);
        rp = ACPI_NEXT_RESOURCE(rp);
    }
    if (resp != NULL)
        *resp = rp;

    return (AE_OK);
}
/*
* Append an ACPI_RESOURCE to an ACPI_BUFFER.
*
* Given a pointer to an ACPI_RESOURCE structure, expand the ACPI_BUFFER
* provided to contain it. If the ACPI_BUFFER is empty, allocate a sensible
* backing block. If the ACPI_RESOURCE is NULL, return an empty set of
* resources.
*
* Returns AE_OK on success, AE_NO_MEMORY if an allocation fails, and
* AE_BAD_PARAMETER if no terminator is found inside the buffer.
*/
#define ACPI_INITIAL_RESOURCE_BUFFER_SIZE 512
ACPI_STATUS
acpi_AppendBufferResource(ACPI_BUFFER *buf, ACPI_RESOURCE *res)
{
ACPI_RESOURCE *rp;
void *newp;
/* Initialise the buffer if necessary. */
if (buf->Pointer == NULL) {
buf->Length = ACPI_INITIAL_RESOURCE_BUFFER_SIZE;
if ((buf->Pointer = AcpiOsAllocate(buf->Length)) == NULL)
return (AE_NO_MEMORY);
/* A fresh buffer holds nothing but the end tag. */
rp = (ACPI_RESOURCE *)buf->Pointer;
rp->Type = ACPI_RESOURCE_TYPE_END_TAG;
rp->Length = ACPI_RS_SIZE_MIN;
}
if (res == NULL)
return (AE_OK);
/*
* Scan the current buffer looking for the terminator.
* This will either find the terminator or hit the end
* of the buffer and return an error.
*/
rp = (ACPI_RESOURCE *)buf->Pointer;
for (;;) {
/* Range check, don't go outside the buffer */
if (rp >= (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length))
return (AE_BAD_PARAMETER);
if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0)
break;
rp = ACPI_NEXT_RESOURCE(rp);
}
/*
* Check the size of the buffer and expand if required.
*
* Required size is:
* size of existing resources before terminator +
* size of new resource and header +
* size of terminator.
*
* Note that this loop should really only run once, unless
* for some reason we are stuffing a *really* huge resource.
*/
while ((((u_int8_t *)rp - (u_int8_t *)buf->Pointer) +
res->Length + ACPI_RS_SIZE_NO_DATA +
ACPI_RS_SIZE_MIN) >= buf->Length) {
/* Grow by doubling; the old block stays valid if this fails. */
if ((newp = AcpiOsAllocate(buf->Length * 2)) == NULL)
return (AE_NO_MEMORY);
bcopy(buf->Pointer, newp, buf->Length);
/* Re-aim rp at the same offset inside the new block. */
rp = (ACPI_RESOURCE *)((u_int8_t *)newp +
((u_int8_t *)rp - (u_int8_t *)buf->Pointer));
AcpiOsFree(buf->Pointer);
buf->Pointer = newp;
buf->Length += buf->Length;
}
/* Insert the new resource. */
bcopy(res, rp, res->Length + ACPI_RS_SIZE_NO_DATA);
/* And add the terminator. */
rp = ACPI_NEXT_RESOURCE(rp);
rp->Type = ACPI_RESOURCE_TYPE_END_TAG;
rp->Length = ACPI_RS_SIZE_MIN;
return (AE_OK);
}
/*
 * Ask a _DSM method which functions it implements.
 *
 * ACPI spec 9.1.1 defines this:
 *
 * "Arg2: Function Index Represents a specific function whose meaning is
 * specific to the UUID and Revision ID. Function indices should start
 * with 1. Function number zero is a query function (see the special
 * return code defined below)."
 *
 * Function 0 is evaluated for the given UUID and revision, and the first
 * byte of the answer is returned.  0 is returned when the evaluation fails
 * or the answer has an unexpected type.
 */
UINT8
acpi_DSMQuery(ACPI_HANDLE h, uint8_t *uuid, int revision)
{
    ACPI_BUFFER reply;
    ACPI_OBJECT *obj;
    UINT8 funcs;

    if (ACPI_FAILURE(acpi_EvaluateDSM(h, uuid, revision, 0, NULL, &reply))) {
        ACPI_INFO(("Failed to enumerate DSM functions\n"));
        return (0);
    }

    obj = (ACPI_OBJECT *)reply.Pointer;
    KASSERT(obj, ("Object not allowed to be NULL\n"));

    /*
     * From ACPI 6.2 spec 9.1.1:
     * If Function Index = 0, a Buffer containing a function index bitfield.
     * Otherwise, the return value and type depends on the UUID and revision
     * ID (see below).
     */
    funcs = 0;
    if (obj->Type == ACPI_TYPE_BUFFER) {
        funcs = *(uint8_t *)obj->Buffer.Pointer;
    } else if (obj->Type == ACPI_TYPE_INTEGER) {
        /* Tolerated, but not what the specification asks for. */
        ACPI_BIOS_WARNING((AE_INFO,
            "Possibly buggy BIOS with ACPI_TYPE_INTEGER for function enumeration\n"));
        funcs = obj->Integer.Value & 0xFF;
    } else {
        ACPI_WARNING((AE_INFO, "Unexpected return type %u\n", obj->Type));
    }

    AcpiOsFree(obj);
    return (funcs);
}
/*
 * Evaluate _DSM on handle with the four standard arguments: UUID buffer,
 * revision, function index and a package (an empty one when package is
 * NULL).
 *
 * DSM may return multiple types depending on the function, so the untyped
 * evaluation is used; callers are strongly advised to check the type of
 * the returned object.  On success *out_buf describes a buffer allocated
 * by ACPICA (ACPI_ALLOCATE_BUFFER).
 *
 * Returns AE_NO_MEMORY if out_buf is NULL, otherwise the status of
 * AcpiEvaluateObject().
 */
ACPI_STATUS
acpi_EvaluateDSM(ACPI_HANDLE handle, uint8_t *uuid, int revision,
    uint64_t function, union acpi_object *package, ACPI_BUFFER *out_buf)
{
    ACPI_OBJECT arg[4];
    ACPI_OBJECT_LIST arglist;
    ACPI_BUFFER reply;
    ACPI_STATUS rv;

    if (out_buf == NULL)
        return (AE_NO_MEMORY);

    /* Arg0: UUID. */
    arg[0].Type = ACPI_TYPE_BUFFER;
    arg[0].Buffer.Length = ACPI_UUID_LENGTH;
    arg[0].Buffer.Pointer = uuid;

    /* Arg1: revision. */
    arg[1].Type = ACPI_TYPE_INTEGER;
    arg[1].Integer.Value = revision;

    /* Arg2: function index. */
    arg[2].Type = ACPI_TYPE_INTEGER;
    arg[2].Integer.Value = function;

    /* Arg3: caller's package, or an empty one. */
    if (package != NULL) {
        arg[3] = *package;
    } else {
        arg[3].Type = ACPI_TYPE_PACKAGE;
        arg[3].Package.Count = 0;
        arg[3].Package.Elements = NULL;
    }

    arglist.Count = 4;
    arglist.Pointer = arg;

    reply.Length = ACPI_ALLOCATE_BUFFER;
    reply.Pointer = NULL;
    rv = AcpiEvaluateObject(handle, "_DSM", &arglist, &reply);
    if (ACPI_FAILURE(rv))
        return (rv);

    KASSERT(ACPI_SUCCESS(rv), ("Unexpected status"));
    *out_buf = reply;
    return (rv);
}
/*
 * Evaluate _OSC on handle.
 *
 * uuid/revision identify the capability set; caps_in is an array of count
 * 32-bit words passed to the method, and caps_in[0] is overwritten with 1
 * for a query or 0 otherwise.  If caps_out is non-NULL the returned buffer
 * must be exactly count words long and is copied into it.
 *
 * Returns the evaluation status, or AE_BUFFER_OVERFLOW when the returned
 * buffer has an unexpected length.
 */
ACPI_STATUS
acpi_EvaluateOSC(ACPI_HANDLE handle, uint8_t *uuid, int revision, int count,
    uint32_t *caps_in, uint32_t *caps_out, bool query)
{
    ACPI_OBJECT arg[4], *ret;
    ACPI_OBJECT_LIST arglist;
    ACPI_BUFFER reply;
    ACPI_STATUS rv;

    /* First word of the capabilities carries the query flag. */
    caps_in[0] = query ? 1 : 0;

    arg[0].Type = ACPI_TYPE_BUFFER;
    arg[0].Buffer.Length = ACPI_UUID_LENGTH;
    arg[0].Buffer.Pointer = uuid;

    arg[1].Type = ACPI_TYPE_INTEGER;
    arg[1].Integer.Value = revision;

    arg[2].Type = ACPI_TYPE_INTEGER;
    arg[2].Integer.Value = count;

    arg[3].Type = ACPI_TYPE_BUFFER;
    arg[3].Buffer.Length = count * sizeof(*caps_in);
    arg[3].Buffer.Pointer = (uint8_t *)caps_in;

    arglist.Count = 4;
    arglist.Pointer = arg;

    reply.Length = ACPI_ALLOCATE_BUFFER;
    reply.Pointer = NULL;
    rv = AcpiEvaluateObjectTyped(handle, "_OSC", &arglist, &reply,
        ACPI_TYPE_BUFFER);
    if (ACPI_FAILURE(rv))
        return (rv);

    if (caps_out != NULL) {
        ret = reply.Pointer;
        if (ret->Buffer.Length == count * sizeof(*caps_out))
            bcopy(ret->Buffer.Pointer, caps_out, ret->Buffer.Length);
        else
            rv = AE_BUFFER_OVERFLOW;
    }

    /* The reply buffer is released on both the copy and mismatch paths. */
    AcpiOsFree(reply.Pointer);
    return (rv);
}
/*
 * Set the interrupt model by evaluating \_PIC with the given model value.
 * Returns the status of that evaluation.
 */
ACPI_STATUS
acpi_SetIntrModel(int model)
{
    ACPI_STATUS rv;

    rv = acpi_SetInteger(ACPI_ROOT_OBJECT, "_PIC", model);
    return (rv);
}
/*
 * Walk the subtables of a table, invoking handler(entry, arg) for each.
 * The caller supplies the first subtable and a pointer to the end of the
 * table.  Useful for tables such as MADT and SRAT that are built from
 * subtable entries.
 *
 * The walk stops at end, or early when an entry claims a length smaller
 * than a subtable header.
 */
void
acpi_walk_subtables(void *first, void *end, acpi_subtable_handler *handler,
    void *arg)
{
    ACPI_SUBTABLE_HEADER *sub;

    sub = first;
    while ((void *)sub < end) {
        /* A bogus length would keep us from ever advancing; bail out. */
        if (sub->Length < sizeof(ACPI_SUBTABLE_HEADER))
            break;
        handler(sub, arg);
        sub = ACPI_ADD_PTR(ACPI_SUBTABLE_HEADER, sub, sub->Length);
    }
}
/*
 * DEPRECATED.  This interface has serious deficiencies and will be
 * removed.
 *
 * Immediately enter the sleep state.  In the old model, acpiconf(8) ran
 * rc.suspend and rc.resume so we don't have to notify devd(8) to do this.
 *
 * A deprecation warning is printed the first time this is called; the
 * request is then handed to acpi_EnterSleepState().
 */
ACPI_STATUS
acpi_SetSleepState(struct acpi_softc *sc, int state)
{
    static int warned;

    if (warned == 0) {
        device_printf(sc->acpi_dev,
            "warning: acpi_SetSleepState() deprecated, need to update your software\n");
        warned = 1;
    }

    return (acpi_EnterSleepState(sc, state));
}
#if defined(__amd64__) || defined(__i386__)
static void
acpi_sleep_force_task(void *context)
{
struct acpi_softc *sc = (struct acpi_softc *)context;
if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate)))
device_printf(sc->acpi_dev, "force sleep state S%d failed\n",
sc->acpi_next_sstate);
}
static void
acpi_sleep_force(void *arg)
{
struct acpi_softc *sc = (struct acpi_softc *)arg;
device_printf(sc->acpi_dev,
"suspend request timed out, forcing sleep now\n");
/*
* XXX Suspending from callout causes freezes in DEVICE_SUSPEND().
* Suspend from acpi_task thread instead.
*/
if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
acpi_sleep_force_task, sc)))
device_printf(sc->acpi_dev, "AcpiOsExecute() for sleeping failed\n");
}
#endif
/*
* Request that the system enter the given suspend state. All /dev/apm
* devices and devd(8) will be notified. Userland then has a chance to
* save state and acknowledge the request. The system sleeps once all
* acks are in.
*
* Returns 0 when the request was accepted (or silently ignored because
* another request is already pending), EINVAL for a state outside
* S1..ACPI_S_STATES_MAX, EOPNOTSUPP for an unsupported state or platform,
* and ENXIO when an immediate sleep attempt fails.
*/
int
acpi_ReqSleepState(struct acpi_softc *sc, int state)
{
#if defined(__amd64__) || defined(__i386__)
struct apm_clone_data *clone;
ACPI_STATUS status;
if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX)
return (EINVAL);
if (!acpi_sleep_states[state])
return (EOPNOTSUPP);
/*
* If a reboot/shutdown/suspend request is already in progress or
* suspend is blocked due to an upcoming shutdown, just return.
*/
if (rebooting || sc->acpi_next_sstate != 0 || suspend_blocked) {
return (0);
}
/* Wait until sleep is enabled. */
while (sc->acpi_sleep_disabled) {
AcpiOsSleep(1000);
}
ACPI_LOCK(acpi);
/* A non-zero acpi_next_sstate marks a request as pending. */
sc->acpi_next_sstate = state;
/* S5 (soft-off) should be entered directly with no waiting. */
if (state == ACPI_STATE_S5) {
ACPI_UNLOCK(acpi);
status = acpi_EnterSleepState(sc, state);
return (ACPI_SUCCESS(status) ? 0 : ENXIO);
}
/* Record the pending state and notify all apm devices. */
STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) {
clone->notify_status = APM_EV_NONE;
/* Listeners flagged ACPI_EVF_DEVD are not woken up here. */
if ((clone->flags & ACPI_EVF_DEVD) == 0) {
selwakeuppri(&clone->sel_read, PZERO);
KNOTE_LOCKED(&clone->sel_read.si_note, 0);
}
}
/* If devd(8) is not running, immediately enter the sleep state. */
if (!devctl_process_running()) {
ACPI_UNLOCK(acpi);
status = acpi_EnterSleepState(sc, state);
return (ACPI_SUCCESS(status) ? 0 : ENXIO);
}
/*
* Set a timeout to fire if userland doesn't ack the suspend request
* in time. This way we still eventually go to sleep if we were
* overheating or running low on battery, even if userland is hung.
* We cancel this timeout once all userland acks are in or the
* suspend request is aborted.
*/
callout_reset(&sc->susp_force_to, 10 * hz, acpi_sleep_force, sc);
ACPI_UNLOCK(acpi);
/* Now notify devd(8) also. */
acpi_UserNotify("Suspend", ACPI_ROOT_OBJECT, state);
return (0);
#else
/* This platform does not support acpi suspend/resume. */
return (EOPNOTSUPP);
#endif
}
/*
* Acknowledge (or reject) a pending sleep state. The caller has
* prepared for suspend and is now ready for it to proceed. If the
* error argument is non-zero, it indicates suspend should be cancelled
* and gives an errno value describing why. Once all votes are in,
* we suspend the system.
*
* Returns ENXIO when no sleep request is pending, ENODEV when entering the
* sleep state fails, EOPNOTSUPP on unsupported platforms, and 0 otherwise
* (including when the request is cancelled).
*/
int
acpi_AckSleepState(struct apm_clone_data *clone, int error)
{
#if defined(__amd64__) || defined(__i386__)
struct acpi_softc *sc;
int ret, sleeping;
/* If no pending sleep state, return an error. */
ACPI_LOCK(acpi);
sc = clone->acpi_sc;
if (sc->acpi_next_sstate == 0) {
ACPI_UNLOCK(acpi);
return (ENXIO);
}
/* Caller wants to abort suspend process. */
if (error) {
sc->acpi_next_sstate = 0;
callout_stop(&sc->susp_force_to);
device_printf(sc->acpi_dev,
"listener on %s cancelled the pending suspend\n",
devtoname(clone->cdev));
ACPI_UNLOCK(acpi);
return (0);
}
/*
* Mark this device as acking the suspend request. Then, walk through
* all devices, seeing if they agree yet. We only count devices that
* are writable since read-only devices couldn't ack the request.
*/
sleeping = TRUE;
clone->notify_status = APM_EV_ACKED;
/* Note: the loop below reuses the clone parameter as its iterator. */
STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) {
if ((clone->flags & ACPI_EVF_WRITE) != 0 &&
clone->notify_status != APM_EV_ACKED) {
sleeping = FALSE;
break;
}
}
/* If all devices have voted "yes", we will suspend now. */
if (sleeping)
callout_stop(&sc->susp_force_to);
ACPI_UNLOCK(acpi);
ret = 0;
/* The sleep itself is entered after the ACPI lock has been dropped. */
if (sleeping) {
if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate)))
ret = ENODEV;
}
return (ret);
#else
/* This platform does not support acpi suspend/resume. */
return (EOPNOTSUPP);
#endif
}
/*
 * Allow sleep requests again by clearing sc->acpi_sleep_disabled.  If the
 * system is not yet fully up and running, the sleep timer is re-armed for
 * another ACPI_MINIMUM_AWAKETIME seconds instead.  arg is the struct
 * acpi_softc; the ACPI lock must be held.
 */
static void
acpi_sleep_enable(void *arg)
{
    struct acpi_softc *sc = arg;

    ACPI_LOCK_ASSERT(acpi);

    if (AcpiGbl_SystemAwakeAndRunning) {
        sc->acpi_sleep_disabled = FALSE;
        return;
    }

    /* Not ready yet: try again later. */
    callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME);
}
/*
 * Mark sleeping as disabled.  Returns AE_OK if this call performed the
 * transition, and AE_ERROR if sleeping was already disabled or the system
 * is not fully up and running (in which case the flag is left alone).
 */
static ACPI_STATUS
acpi_sleep_disable(struct acpi_softc *sc)
{
    ACPI_STATUS rv;

    if (!AcpiGbl_SystemAwakeAndRunning)
        return (AE_ERROR);

    ACPI_LOCK(acpi);
    if (sc->acpi_sleep_disabled)
        rv = AE_ERROR;
    else
        rv = AE_OK;
    sc->acpi_sleep_disabled = TRUE;
    ACPI_UNLOCK(acpi);

    return (rv);
}
/*
 * Progress markers used by acpi_EnterSleepState() to remember how far the
 * suspend sequence got.  The backout code compares with ">=", so the
 * declaration order is significant.
 */
enum acpi_sleep_state {
ACPI_SS_NONE,		/* nothing done yet */
ACPI_SS_GPE_SET,	/* acpi_wake_prep_walk() has run */
ACPI_SS_DEV_SUSPEND,	/* DEVICE_SUSPEND(root_bus) succeeded */
ACPI_SS_SLP_PREP,	/* AcpiEnterSleepStatePrep() succeeded */
ACPI_SS_SLEPT,		/* the sleep itself completed */
};
/*
* Enter the desired system sleep state.
*
* Currently we support S1-S5 but S4 is only S4BIOS
*
* Returns AE_BAD_PARAMETER for a state outside S1..ACPI_S_STATES_MAX,
* AE_SUPPORT for a state not listed in acpi_sleep_states[], the error from
* acpi_sleep_disable() if a suspend is already under way, and otherwise
* the last ACPI status recorded during the sequence.  S5 is delegated to
* shutdown_nice(RB_POWEROFF).
*/
static ACPI_STATUS
acpi_EnterSleepState(struct acpi_softc *sc, int state)
{
register_t intr;
ACPI_STATUS status;
ACPI_EVENT_STATUS power_button_status;
enum acpi_sleep_state slp_state;
int sleep_result;
ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);
if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX)
return_ACPI_STATUS (AE_BAD_PARAMETER);
if (!acpi_sleep_states[state]) {
device_printf(sc->acpi_dev, "Sleep state S%d not supported by BIOS\n",
state);
return (AE_SUPPORT);
}
/* Re-entry once we're suspending is not allowed. */
status = acpi_sleep_disable(sc);
if (ACPI_FAILURE(status)) {
device_printf(sc->acpi_dev,
"suspend request ignored (not ready yet)\n");
return (status);
}
if (state == ACPI_STATE_S5) {
/*
* Shut down cleanly and power off. This will call us back through the
* shutdown handlers.
*/
shutdown_nice(RB_POWEROFF);
return_ACPI_STATUS (AE_OK);
}
EVENTHANDLER_INVOKE(power_suspend_early);
stop_all_proc();
EVENTHANDLER_INVOKE(power_suspend);
/* Bind the current thread to CPU 0 for the duration of the suspend. */
#ifdef EARLY_AP_STARTUP
MPASS(mp_ncpus == 1 || smp_started);
thread_lock(curthread);
sched_bind(curthread, 0);
thread_unlock(curthread);
#else
if (smp_started) {
thread_lock(curthread);
sched_bind(curthread, 0);
thread_unlock(curthread);
}
#endif
/*
* Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE
* drivers need this.
*/
mtx_lock(&Giant);
/* slp_state tracks progress so that "backout" undoes only what was done. */
slp_state = ACPI_SS_NONE;
sc->acpi_sstate = state;
/* Enable any GPEs as appropriate and requested by the user. */
acpi_wake_prep_walk(state);
slp_state = ACPI_SS_GPE_SET;
/*
* Inform all devices that we are going to sleep. If at least one
* device fails, DEVICE_SUSPEND() automatically resumes the tree.
*
* XXX Note that a better two-pass approach with a 'veto' pass
* followed by a "real thing" pass would be better, but the current
* bus interface does not provide for this.
*/
if (DEVICE_SUSPEND(root_bus) != 0) {
device_printf(sc->acpi_dev, "device_suspend failed\n");
goto backout;
}
slp_state = ACPI_SS_DEV_SUSPEND;
status = AcpiEnterSleepStatePrep(state);
if (ACPI_FAILURE(status)) {
device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n",
AcpiFormatException(status));
goto backout;
}
slp_state = ACPI_SS_SLP_PREP;
if (sc->acpi_sleep_delay > 0)
DELAY(sc->acpi_sleep_delay * 1000000);
suspendclock();
intr = intr_disable();
/* Everything but S1 goes through the machine-dependent sleep code. */
if (state != ACPI_STATE_S1) {
sleep_result = acpi_sleep_machdep(sc, state);
acpi_wakeup_machdep(sc, state, sleep_result, 0);
/*
* XXX According to ACPI specification SCI_EN bit should be restored
* by ACPI platform (BIOS, firmware) to its pre-sleep state.
* Unfortunately some BIOSes fail to do that and that leads to
* unexpected and serious consequences during wake up like a system
* getting stuck in SMI handlers.
* This hack is picked up from Linux, which claims that it follows
* Windows behavior.
*/
if (sleep_result == 1 && state != ACPI_STATE_S4)
AcpiWriteBitRegister(ACPI_BITREG_SCI_ENABLE, ACPI_ENABLE_EVENT);
if (sleep_result == 1 && state == ACPI_STATE_S3) {
/*
* Prevent mis-interpretation of the wakeup by power button
* as a request for power off.
* Ideally we should post an appropriate wakeup event,
* perhaps using acpi_event_power_button_wake or alike.
*
* Clearing of power button status after wakeup is mandated
* by ACPI specification in section "Fixed Power Button".
*
* XXX As of ACPICA 20121114 AcpiGetEventStatus provides
* status as 0/1 corresponding to inactive/active despite
* its type being ACPI_EVENT_STATUS. In other words,
* we should not test for ACPI_EVENT_FLAG_SET for time being.
*/
if (ACPI_SUCCESS(AcpiGetEventStatus(ACPI_EVENT_POWER_BUTTON,
&power_button_status)) && power_button_status != 0) {
AcpiClearEvent(ACPI_EVENT_POWER_BUTTON);
device_printf(sc->acpi_dev,
"cleared fixed power button status\n");
}
}
intr_restore(intr);
/* call acpi_wakeup_machdep() again with interrupt enabled */
acpi_wakeup_machdep(sc, state, sleep_result, 1);
AcpiLeaveSleepStatePrep(state);
/* A sleep_result of -1 is treated as a failed sleep attempt. */
if (sleep_result == -1)
goto backout;
/* Re-enable ACPI hardware on wakeup from sleep state 4. */
if (state == ACPI_STATE_S4)
AcpiEnable();
} else {
/* S1 is entered directly through ACPICA. */
status = AcpiEnterSleepState(state);
intr_restore(intr);
AcpiLeaveSleepStatePrep(state);
if (ACPI_FAILURE(status)) {
device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n",
AcpiFormatException(status));
goto backout;
}
}
slp_state = ACPI_SS_SLEPT;
/*
* Back out state according to how far along we got in the suspend
* process. This handles both the error and success cases.
*/
backout:
if (slp_state >= ACPI_SS_SLP_PREP)
resumeclock();
if (slp_state >= ACPI_SS_GPE_SET) {
acpi_wake_prep_walk(state);
sc->acpi_sstate = ACPI_STATE_S0;
}
if (slp_state >= ACPI_SS_DEV_SUSPEND)
DEVICE_RESUME(root_bus);
if (slp_state >= ACPI_SS_SLP_PREP)
AcpiLeaveSleepState(state);
if (slp_state >= ACPI_SS_SLEPT) {
#if defined(__i386__) || defined(__amd64__)
/* NB: we are still using ACPI timecounter at this point. */
resume_TSC();
#endif
acpi_resync_clock(sc);
acpi_enable_fixed_events(sc);
}
/* No request is pending any more. */
sc->acpi_next_sstate = 0;
mtx_unlock(&Giant);
/* Undo the CPU binding made before the suspend. */
#ifdef EARLY_AP_STARTUP
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
#else
if (smp_started) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
resume_all_proc();
EVENTHANDLER_INVOKE(power_resume);
/* Allow another sleep request after a while. */
callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME);
/* Run /etc/rc.resume after we are back. */
if (devctl_process_running())
acpi_UserNotify("Resume", ACPI_ROOT_OBJECT, state);
return_ACPI_STATUS (status);
}
/*
 * Warm up the timecounter again (by reading it twice) and then reset the
 * system clock via inittodr(), offset by the configured sleep delay.
 */
static void
acpi_resync_clock(struct acpi_softc *sc)
{
    int pass;

    /* Two throw-away reads of the current timecounter. */
    for (pass = 0; pass < 2; pass++)
        (void)timecounter->tc_get_timecount(timecounter);

    inittodr(time_second + sc->acpi_sleep_delay);
}
/*
 * Enable or disable the device's wake GPE and keep the device's
 * ACPI_FLAG_WAKE_ENABLED flag in step.
 *
 * Returns 0 on success and ENXIO when the device has no usable _PRW or the
 * GPE wake mask could not be changed.
 */
int
acpi_wake_set_enable(device_t dev, int enable)
{
    struct acpi_prw_data prw;
    ACPI_STATUS rv;
    int cur_flags;

    /* Make sure the device supports waking the system and get the GPE. */
    if (acpi_parse_prw(acpi_get_handle(dev), &prw) != 0)
        return (ENXIO);

    cur_flags = acpi_get_flags(dev);

    if (!enable) {
        rv = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit,
            ACPI_GPE_DISABLE);
        if (ACPI_FAILURE(rv)) {
            device_printf(dev, "disable wake failed\n");
            return (ENXIO);
        }
        acpi_set_flags(dev, cur_flags & ~ACPI_FLAG_WAKE_ENABLED);
        return (0);
    }

    rv = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE);
    if (ACPI_FAILURE(rv)) {
        device_printf(dev, "enable wake failed\n");
        return (ENXIO);
    }
    acpi_set_flags(dev, cur_flags | ACPI_FLAG_WAKE_ENABLED);
    return (0);
}
/*
 * Prepare one wake-capable object for entering sleep state sstate.
 *
 * Returns ENXIO if the object has no usable _PRW, 0 otherwise.
 */
static int
acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate)
{
    struct acpi_prw_data prw;
    device_t dev;

    /* Check that this is a wake-capable device and get its GPE. */
    if (acpi_parse_prw(handle, &prw) != 0)
        return (ENXIO);
    dev = acpi_get_device(handle);

    /*
     * The destination sleep state must be less than (i.e., higher power)
     * or equal to the value specified by _PRW.  If this GPE cannot be
     * enabled for the next sleep state, then disable it.
     */
    if (sstate > prw.lowest_wake) {
        AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE);
        if (bootverbose) {
            device_printf(dev, "wake_prep disabled wake for %s (S%d)\n",
                acpi_name(handle), sstate);
        }
        return (0);
    }

    /* Nothing more to do unless the user enabled wake for this device. */
    if (dev == NULL)
        return (0);
    if ((acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) == 0)
        return (0);

    /* Turn on any required power resources and set _PSW. */
    acpi_pwr_wake_enable(handle, 1);
    acpi_SetInteger(handle, "_PSW", 1);
    if (bootverbose) {
        device_printf(dev, "wake_prep enabled for %s (S%d)\n",
            acpi_name(handle), sstate);
    }
    return (0);
}
/*
 * Undo, after waking from sleep state sstate, what acpi_wake_sleep_prep()
 * did for one wake-capable object.
 *
 * Returns ENXIO if the object has no usable _PRW, 0 otherwise.
 */
static int
acpi_wake_run_prep(ACPI_HANDLE handle, int sstate)
{
    struct acpi_prw_data prw;
    device_t dev;

    /* Check that this is a wake-capable device and get its GPE. */
    if (acpi_parse_prw(handle, &prw) != 0)
        return (ENXIO);

    /* Return now if the user didn't enable this device for wake. */
    dev = acpi_get_device(handle);
    if (dev == NULL)
        return (0);
    if (!(acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED))
        return (0);

    if (sstate <= prw.lowest_wake) {
        /*
         * The GPE was enabled for the previous sleep state: clear _PSW
         * and turn off any power resources it used.
         */
        acpi_SetInteger(handle, "_PSW", 0);
        acpi_pwr_wake_enable(handle, 0);
        if (bootverbose) {
            device_printf(dev, "run_prep cleaned up for %s\n",
                acpi_name(handle));
        }
    } else {
        /*
         * The GPE couldn't be enabled for the previous sleep state and
         * was disabled before going to sleep, so re-enable it.
         */
        AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE);
        if (bootverbose) {
            device_printf(dev, "run_prep re-enabled %s\n", acpi_name(handle));
        }
    }
    return (0);
}
/*
 * Namespace-walk callback: context points at the int sleep state.  While
 * the system is still awake and running the sleep-side preparation is
 * run; otherwise the wake-side cleanup is.  Always returns AE_OK so the
 * walk continues.
 */
static ACPI_STATUS
acpi_wake_prep(ACPI_HANDLE handle, UINT32 level, void *context, void **status)
{
    const int sstate = *(int *)context;

    /* If suspending, run the sleep prep function, otherwise wake. */
    if (!AcpiGbl_SystemAwakeAndRunning)
        acpi_wake_run_prep(handle, sstate);
    else
        acpi_wake_sleep_prep(handle, sstate);

    return (AE_OK);
}
/* Walk the tree rooted at acpi0 to prep devices for suspend/resume. */
static int
acpi_wake_prep_walk(int sstate)
{
ACPI_HANDLE sb_handle;
if (ACPI_SUCCESS(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle)))
AcpiWalkNamespace(ACPI_TYPE_DEVICE, sb_handle, 100,
acpi_wake_prep, NULL, &sstate, NULL);
return (0);
}
/*
 * Walk the device tree rooted at dev and attach a per-device "wake" sysctl
 * to every attached descendant whose ACPI object has a _PRW.
 *
 * Children are visited depth-first; results of the recursive calls are
 * ignored.  Returns the error from device_get_children(), or 0.
 */
static int
acpi_wake_sysctl_walk(device_t dev)
{
    int error, i, numdevs;
    device_t *devlist;
    device_t child;
    ACPI_STATUS status;

    error = device_get_children(dev, &devlist, &numdevs);
    /*
     * On failure neither devlist nor numdevs can be trusted, so bail out
     * before looking at either of them.
     */
    if (error != 0)
        return (error);
    if (numdevs == 0) {
        free(devlist, M_TEMP);
        return (0);
    }

    for (i = 0; i < numdevs; i++) {
        child = devlist[i];
        acpi_wake_sysctl_walk(child);
        if (!device_is_attached(child))
            continue;
        /* Only devices that actually have a _PRW get the knob. */
        status = AcpiEvaluateObject(acpi_get_handle(child), "_PRW", NULL, NULL);
        if (ACPI_SUCCESS(status)) {
            SYSCTL_ADD_PROC(device_get_sysctl_ctx(child),
                SYSCTL_CHILDREN(device_get_sysctl_tree(child)), OID_AUTO,
                "wake", CTLTYPE_INT | CTLFLAG_RW, child, 0,
                acpi_wake_set_sysctl, "I", "Device set to wake the system");
        }
    }
    free(devlist, M_TEMP);

    return (0);
}
/* Enable or disable wake from userland. */
static int
acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS)
{
int enable, error;
device_t dev;
dev = (device_t)arg1;
enable = (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) ? 1 : 0;
error = sysctl_handle_int(oidp, &enable, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (enable != 0 && enable != 1)
return (EINVAL);
return (acpi_wake_set_enable(dev, enable));
}
/*
 * Parse a device's _PRW into a structure.
 *
 * h:    handle of the object whose _PRW is evaluated.
 * prw:  filled in with the lowest wake state, the GPE handle/bit and the
 *       power resource entries.
 *
 * Returns 0 on success, EINVAL for NULL arguments or a malformed package,
 * and ENOENT if _PRW cannot be evaluated or returns nothing.
 *
 * NOTE(review): the power resource entries are copied by value out of the
 * evaluation buffer, which is freed before returning; any pointers inside
 * those objects presumably do not stay valid -- confirm before using them.
 */
int
acpi_parse_prw(ACPI_HANDLE h, struct acpi_prw_data *prw)
{
    ACPI_STATUS status;
    ACPI_BUFFER prw_buffer;
    ACPI_OBJECT *res, *res2;
    int error, i, power_count;

    if (h == NULL || prw == NULL)
        return (EINVAL);

    /*
     * The _PRW object (7.2.9) is only required for devices that have the
     * ability to wake the system from a sleeping state.
     */
    error = EINVAL;
    prw_buffer.Pointer = NULL;
    prw_buffer.Length = ACPI_ALLOCATE_BUFFER;
    status = AcpiEvaluateObject(h, "_PRW", NULL, &prw_buffer);
    if (ACPI_FAILURE(status))
        return (ENOENT);
    res = (ACPI_OBJECT *)prw_buffer.Pointer;
    if (res == NULL)
        return (ENOENT);
    if (!ACPI_PKG_VALID(res, 2))
        goto out;

    /*
     * Element 1 of the _PRW object:
     * The lowest power system sleeping state that can be entered while still
     * providing wake functionality.  The sleeping state being entered must
     * be less than (i.e., higher power) or equal to this value.
     */
    if (acpi_PkgInt32(res, 1, &prw->lowest_wake) != 0)
        goto out;

    /*
     * Element 0 of the _PRW object:
     */
    switch (res->Package.Elements[0].Type) {
    case ACPI_TYPE_INTEGER:
        /*
         * If the data type of this package element is numeric, then this
         * _PRW package element is the bit index in the GPEx_EN, in the
         * GPE blocks described in the FADT, of the enable bit that is
         * enabled for the wake event.
         */
        prw->gpe_handle = NULL;
        prw->gpe_bit = res->Package.Elements[0].Integer.Value;
        error = 0;
        break;
    case ACPI_TYPE_PACKAGE:
        /*
         * If the data type of this package element is a package, then this
         * _PRW package element is itself a package containing two
         * elements.  The first is an object reference to the GPE Block
         * device that contains the GPE that will be triggered by the wake
         * event.  The second element is numeric and it contains the bit
         * index in the GPEx_EN, in the GPE Block referenced by the
         * first element in the package, of the enable bit that is enabled for
         * the wake event.
         *
         * For example, if this field is a package then it is of the form:
         * Package() {\_SB.PCI0.ISA.GPE, 2}
         */
        res2 = &res->Package.Elements[0];
        if (!ACPI_PKG_VALID(res2, 2))
            goto out;
        prw->gpe_handle = acpi_GetReference(NULL, &res2->Package.Elements[0]);
        if (prw->gpe_handle == NULL)
            goto out;
        if (acpi_PkgInt32(res2, 1, &prw->gpe_bit) != 0)
            goto out;
        error = 0;
        break;
    default:
        goto out;
    }

    /* Elements 2 to N of the _PRW object are power resources. */
    power_count = res->Package.Count - 2;
    if (power_count > ACPI_PRW_MAX_POWERRES) {
        printf("ACPI device %s has too many power resources\n", acpi_name(h));
        power_count = 0;
    }
    prw->power_res_count = power_count;
    /* Skip the two leading elements (GPE info and lowest wake state). */
    for (i = 0; i < power_count; i++)
        prw->power_res[i] = res->Package.Elements[i + 2];

out:
    if (prw_buffer.Pointer != NULL)
        AcpiOsFree(prw_buffer.Pointer);
    return (error);
}
/*
* ACPI Event Handlers
*/
/* System Event Handlers (registered by EVENTHANDLER_REGISTER) */
/*
 * System sleep event handler.
 *
 * arg is the acpi_softc to act on and state is the sleep state being
 * requested.  ACPI_STATE_UNKNOWN (button action disabled or unknown) is
 * ignored; any other state is forwarded to acpi_ReqSleepState() and a
 * failure is reported through device_printf().
 */
static void
acpi_system_eventhandler_sleep(void *arg, int state)
{
    struct acpi_softc *softc;
    int status;

    ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);

    /* Nothing to do when the button action is disabled or unknown. */
    if (state == ACPI_STATE_UNKNOWN)
        return;

    /* Ask the system to prepare for entering the given suspend state. */
    softc = arg;
    status = acpi_ReqSleepState(softc, state);
    if (status != 0) {
        device_printf(softc->acpi_dev,
            "request to enter state S%d failed (err %d)\n", state, status);
    }
    return_VOID;
}
/*
 * System wakeup event handler.
 *
 * Only emits the ACPICA function-trace entry (with state) and exit; arg is
 * not used and no other work is performed.
 */
static void
acpi_system_eventhandler_wakeup(void *arg, int state)
{
ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);
/* Currently, nothing to do for wakeup. */
return_VOID;
}
/*
* ACPICA Event Handlers (FixedEvent, also called from button notify handler)
*/
/*
 * Callback handed to AcpiOsExecute() by the button "sleep" handlers below.
 *
 * context must point to an int holding the sleep state; that value is
 * passed to every handler registered on the acpi_sleep_event list.
 */
static void
acpi_invoke_sleep_eventhandler(void *context)
{
EVENTHANDLER_INVOKE(acpi_sleep_event, *(int *)context);
}
/*
 * Callback handed to AcpiOsExecute() by the button "wake" handlers below.
 *
 * context must point to an int holding the sleep state; that value is
 * passed to every handler registered on the acpi_wakeup_event list.
 */
static void
acpi_invoke_wake_eventhandler(void *context)
{
EVENTHANDLER_INVOKE(acpi_wakeup_event, *(int *)context);
}
/*
 * Power button "sleep" event.
 *
 * context is the acpi_softc.  The sleep event handlers are run through
 * AcpiOsExecute() with a pointer to the softc's acpi_power_button_sx.
 * Returns ACPI_INTERRUPT_HANDLED when AcpiOsExecute() accepted the request
 * and ACPI_INTERRUPT_NOT_HANDLED when it failed.
 */
UINT32
acpi_event_power_button_sleep(void *context)
{
    struct acpi_softc *softc = context;
    UINT32 result;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    result = ACPI_INTERRUPT_HANDLED;
    if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
        acpi_invoke_sleep_eventhandler, &softc->acpi_power_button_sx)))
        result = ACPI_INTERRUPT_NOT_HANDLED;

    return_VALUE (result);
}
/*
 * Power button "wake" event.
 *
 * context is the acpi_softc.  The wakeup event handlers are run through
 * AcpiOsExecute() with a pointer to the softc's acpi_power_button_sx.
 * Returns ACPI_INTERRUPT_HANDLED when AcpiOsExecute() accepted the request
 * and ACPI_INTERRUPT_NOT_HANDLED when it failed.
 */
UINT32
acpi_event_power_button_wake(void *context)
{
    struct acpi_softc *softc = context;
    UINT32 result;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    result = ACPI_INTERRUPT_HANDLED;
    if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
        acpi_invoke_wake_eventhandler, &softc->acpi_power_button_sx)))
        result = ACPI_INTERRUPT_NOT_HANDLED;

    return_VALUE (result);
}
/*
 * Sleep button "sleep" event.
 *
 * context is the acpi_softc.  The sleep event handlers are run through
 * AcpiOsExecute() with a pointer to the softc's acpi_sleep_button_sx.
 * Returns ACPI_INTERRUPT_HANDLED when AcpiOsExecute() accepted the request
 * and ACPI_INTERRUPT_NOT_HANDLED when it failed.
 */
UINT32
acpi_event_sleep_button_sleep(void *context)
{
    struct acpi_softc *softc = context;
    UINT32 result;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    result = ACPI_INTERRUPT_HANDLED;
    if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
        acpi_invoke_sleep_eventhandler, &softc->acpi_sleep_button_sx)))
        result = ACPI_INTERRUPT_NOT_HANDLED;

    return_VALUE (result);
}
/*
 * Sleep button "wake" event.
 *
 * context is the acpi_softc.  The wakeup event handlers are run through
 * AcpiOsExecute() with a pointer to the softc's acpi_sleep_button_sx.
 * Returns ACPI_INTERRUPT_HANDLED when AcpiOsExecute() accepted the request
 * and ACPI_INTERRUPT_NOT_HANDLED when it failed.
 */
UINT32
acpi_event_sleep_button_wake(void *context)
{
    struct acpi_softc *softc = context;
    UINT32 result;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

    result = ACPI_INTERRUPT_HANDLED;
    if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
        acpi_invoke_wake_eventhandler, &softc->acpi_sleep_button_sx)))
        result = ACPI_INTERRUPT_NOT_HANDLED;

    return_VALUE (result);
}
/*
 * Return the full ACPI pathname of handle, or the literal "(unknown)" when
 * handle is NULL or AcpiGetName() fails.
 *
 * XXX The result lives in a single static 256-byte buffer that is
 * overwritten by the next successful call.  There is no locking, so only
 * use this from single-threaded callers.
 */
char *
acpi_name(ACPI_HANDLE handle)
{
    static char path[256];
    ACPI_BUFFER out;

    if (!handle)
        return ("(unknown)");

    out.Pointer = path;
    out.Length = sizeof(path);
    if (!ACPI_SUCCESS(AcpiGetName(handle, ACPI_FULL_PATHNAME, &out)))
        return ("(unknown)");
    return (path);
}
/*
 * Debugging/bug-avoidance.  Avoid trying to fetch info on various
 * parts of the namespace.
 *
 * The kernel environment variable debug.acpi.avoid holds a
 * whitespace-separated list of pathnames.  Returns 1 when the pathname of
 * handle (with one leading '\' dropped) begins with any entry of that
 * list, and 0 when there is no match or the variable is unset.
 */
int
acpi_avoid(ACPI_HANDLE handle)
{
    char *entry, *list, *path;
    int entry_len, matched;

    path = acpi_name(handle);
    if (*path == '\\')
        path++;

    list = kern_getenv("debug.acpi.avoid");
    if (list == NULL)
        return (0);

    /* Walk the list one whitespace-delimited entry at a time. */
    matched = 0;
    entry = list;
    while (!matched) {
        while (*entry != 0 && isspace(*entry))
            entry++;
        if (*entry == 0)
            break;

        for (entry_len = 0;
            entry[entry_len] != 0 && !isspace(entry[entry_len]);
            entry_len++)
            continue;

        if (strncmp(entry, path, entry_len) == 0)
            matched = 1;
        else
            entry += entry_len;
    }

    freeenv(list);
    return (matched);
}
/*
 * Debugging/bug-avoidance.  Disable ACPI subsystem components.
 *
 * The kernel environment variable debug.acpi.disabled holds either the
 * single word "all" or a whitespace-separated list of names.  Returns 1
 * when the variable is exactly "all" or when subsys begins with any entry
 * of the list; returns 0 otherwise or when the variable is unset.
 */
int
acpi_disabled(char *subsys)
{
    char *entry, *list;
    int disabled, entry_len;

    list = kern_getenv("debug.acpi.disabled");
    if (list == NULL)
        return (0);

    disabled = (strcmp(list, "all") == 0);

    /* Walk the list one whitespace-delimited entry at a time. */
    entry = list;
    while (!disabled) {
        while (*entry != '\0' && isspace(*entry))
            entry++;
        if (*entry == '\0')
            break;

        for (entry_len = 0;
            entry[entry_len] != '\0' && !isspace(entry[entry_len]);
            entry_len++)
            continue;

        if (strncmp(entry, subsys, entry_len) == 0)
            disabled = 1;
        else
            entry += entry_len;
    }

    freeenv(list);
    return (disabled);
}
/*
 * Resolve an absolute ACPI pathname to a device.
 *
 * Does nothing when *dev is already set, when name does not start with
 * '\', or when AcpiGetHandle() cannot resolve the name.  Otherwise *dev
 * receives the result of acpi_get_device() for the resolved handle.  arg
 * is unused.
 */
static void
acpi_lookup(void *arg, const char *name, device_t *dev)
{
    ACPI_HANDLE node;

    /*
     * Any handle name given as an absolute path (leading '\') is allowed.
     * This could be limited to \_SB and friends, but see
     * acpi_probe_children() for notes on why the entire namespace is
     * scanned for devices.
     */
    if (*dev != NULL || name[0] != '\\')
        return;

    /*
     * XXX: The pathname argument to AcpiGetHandle() should be fixed to
     * be const.
     */
    if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT,
        __DECONST(char *, name), &node)))
        return;

    *dev = acpi_get_device(node);
}
/*
* Control interface.
*
* We multiplex ioctls for all participating ACPI devices here. Individual
* drivers wanting to be accessible via /dev/acpi should use the
* register/deregister interface to make their handlers visible.
*/
/*
 * One registered ioctl handler.  acpiioctl() walks the list looking for an
 * entry whose cmd equals the incoming command and calls fn(cmd, addr, arg).
 */
struct acpi_ioctl_hook
{
TAILQ_ENTRY(acpi_ioctl_hook) link;	/* linkage on acpi_ioctl_hooks */
u_long cmd;				/* ioctl command this hook serves */
acpi_ioctl_fn fn;			/* handler to call */
void *arg;				/* opaque argument passed to fn */
};
/* List of registered hooks; accessed with ACPI_LOCK(acpi) held. */
static TAILQ_HEAD(,acpi_ioctl_hook) acpi_ioctl_hooks;
/* Set to 1 by acpi_register_ioctl() once the list head has been TAILQ_INIT'ed. */
static int acpi_ioctl_hooks_initted;
/*
 * Register fn as the handler for ioctl command cmd; arg is passed back to
 * fn on every call.  The hook list is initialised on first use.
 *
 * Returns 0 on success or ENOMEM when the hook cannot be allocated.
 */
int
acpi_register_ioctl(u_long cmd, acpi_ioctl_fn fn, void *arg)
{
    struct acpi_ioctl_hook *hook;

    hook = malloc(sizeof(*hook), M_ACPIDEV, M_NOWAIT);
    if (hook == NULL)
        return (ENOMEM);
    hook->arg = arg;
    hook->fn = fn;
    hook->cmd = cmd;

    ACPI_LOCK(acpi);
    if (!acpi_ioctl_hooks_initted) {
        TAILQ_INIT(&acpi_ioctl_hooks);
        acpi_ioctl_hooks_initted = 1;
    }
    TAILQ_INSERT_TAIL(&acpi_ioctl_hooks, hook, link);
    ACPI_UNLOCK(acpi);

    return (0);
}
/*
 * Remove and free the first registered hook whose command and handler both
 * match cmd and fn.  Does nothing when no such hook exists.
 */
void
acpi_deregister_ioctl(u_long cmd, acpi_ioctl_fn fn)
{
    struct acpi_ioctl_hook *hook;

    ACPI_LOCK(acpi);
    TAILQ_FOREACH(hook, &acpi_ioctl_hooks, link) {
        if (hook->cmd != cmd || hook->fn != fn)
            continue;
        /* Leave the loop right away: hook is invalid once freed. */
        TAILQ_REMOVE(&acpi_ioctl_hooks, hook, link);
        free(hook, M_ACPIDEV);
        break;
    }
    ACPI_UNLOCK(acpi);
}
/*
 * Character-device open entry point (presumably for /dev/acpi; the cdevsw
 * is not visible here).  Keeps no per-open state and always returns 0.
 */
static int
acpiopen(struct cdev *dev, int flag, int fmt, struct thread *td)
{
return (0);
}
/*
 * Character-device close entry point (presumably for /dev/acpi; the cdevsw
 * is not visible here).  Has nothing to release and always returns 0.
 */
static int
acpiclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
return (0);
}
/*
 * ioctl entry point for the ACPI control device.
 *
 * Registered hooks (see acpi_register_ioctl()) are consulted first: the
 * first hook whose command equals cmd handles the request and its return
 * value is returned.  Otherwise the core commands are handled here; they
 * require the descriptor to be open for writing (EPERM if not):
 *
 *   ACPIIO_REQSLPSTATE  request the sleep state in *(int *)addr; S5 is
 *                       rejected with EOPNOTSUPP.
 *   ACPIIO_ACKSLPSTATE  acknowledge a pending sleep request, passing
 *                       *(int *)addr to acpi_AckSleepState().
 *   ACPIIO_SETSLPSTATE  (deprecated) enter the given state directly;
 *                       EINVAL if out of range, EOPNOTSUPP if the state
 *                       is not supported, ENXIO if entering it fails.
 *
 * Any other command yields ENXIO.
 */
static int
acpiioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
    struct acpi_softc *sc;
    struct acpi_ioctl_hook *hp;
    acpi_ioctl_fn hook_fn;
    void *hook_arg;
    int error, state;

    error = 0;
    hook_fn = NULL;
    hook_arg = NULL;
    sc = dev->si_drv1;

    /*
     * Scan the list of registered ioctls, looking for handlers.  The
     * handler and its argument are copied while the lock is held: the
     * hook itself may be freed by acpi_deregister_ioctl() as soon as the
     * lock is dropped, so it must not be dereferenced afterwards.
     */
    ACPI_LOCK(acpi);
    if (acpi_ioctl_hooks_initted) {
        TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) {
            if (hp->cmd == cmd) {
                hook_fn = hp->fn;
                hook_arg = hp->arg;
                break;
            }
        }
    }
    ACPI_UNLOCK(acpi);
    if (hook_fn != NULL)
        return (hook_fn(cmd, addr, hook_arg));

    /*
     * Core ioctls are not permitted for non-writable user.
     * Currently, other ioctls just fetch information.
     * Not changing system behavior.
     */
    if ((flag & FWRITE) == 0)
        return (EPERM);

    /* Core system ioctls. */
    switch (cmd) {
    case ACPIIO_REQSLPSTATE:
        state = *(int *)addr;
        if (state != ACPI_STATE_S5)
            return (acpi_ReqSleepState(sc, state));
        device_printf(sc->acpi_dev, "power off via acpi ioctl not supported\n");
        error = EOPNOTSUPP;
        break;
    case ACPIIO_ACKSLPSTATE:
        error = *(int *)addr;
        error = acpi_AckSleepState(sc->acpi_clone, error);
        break;
    case ACPIIO_SETSLPSTATE:	/* DEPRECATED */
        state = *(int *)addr;
        if (state < ACPI_STATE_S0 || state > ACPI_S_STATES_MAX)
            return (EINVAL);
        if (!acpi_sleep_states[state])
            return (EOPNOTSUPP);
        if (ACPI_FAILURE(acpi_SetSleepState(sc, state)))
            error = ENXIO;
        break;
    default:
        error = ENXIO;
        break;
    }

    return (error);
}
/*
 * Convert a sleep state name to its numeric value.
 *
 * Accepts "S0" through "S5" (the 'S' may be either case) and "NONE" (any
 * case), which maps to ACPI_STATE_UNKNOWN.  Returns -1 for anything else.
 */
static int
acpi_sname2sstate(const char *sname)
{
    int digit;

    if (toupper(sname[0]) != 'S')
        return (strcasecmp(sname, "NONE") == 0 ? ACPI_STATE_UNKNOWN : -1);

    /* sname[2] is only inspected once sname[1] is known to be a digit. */
    digit = sname[1] - '0';
    if (digit < ACPI_STATE_S0 || digit > ACPI_STATE_S5 || sname[2] != '\0')
        return (-1);
    return (digit);
}
/*
 * Convert a numeric sleep state to its name: "S0".."S5" for
 * ACPI_STATE_S0..ACPI_STATE_S5 and "NONE" for ACPI_STATE_UNKNOWN.
 * Returns NULL for any other value.
 */
static const char *
acpi_sstate2sname(int sstate)
{
    static const char *snames[] = { "S0", "S1", "S2", "S3", "S4", "S5" };

    if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5)
        return (snames[sstate]);
    return (sstate == ACPI_STATE_UNKNOWN ? "NONE" : NULL);
}
static int
acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS)
{
int error;
struct sbuf sb;
UINT8 state;
sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++)
if (acpi_sleep_states[state])
sbuf_printf(&sb, "%s ", acpi_sstate2sname(state));
sbuf_trim(&sb);
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
sbuf_delete(&sb);
return (error);
}
static int
acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS)
{
char sleep_state[10];
int error, new_state, old_state;
old_state = *(int *)oidp->oid_arg1;
strlcpy(sleep_state, acpi_sstate2sname(old_state), sizeof(sleep_state));
error = sysctl_handle_string(oidp, sleep_state, sizeof(sleep_state), req);
if (error == 0 && req->newptr != NULL) {
new_state = acpi_sname2sstate(sleep_state);
if (new_state < ACPI_STATE_S1)
return (EINVAL);
if (new_state < ACPI_S_STATE_COUNT && !acpi_sleep_states[new_state])
return (EOPNOTSUPP);
if (new_state != old_state)
*(int *)oidp->oid_arg1 = new_state;
}
return (error);
}
/*
 * Inform devctl(4) when we receive a Notify.
 *
 * Sends an "ACPI" notification for subsystem whose type is the pathname of
 * h and whose data is "notify=0x<code>".  Nothing is sent when subsystem
 * is NULL or the pathname of h cannot be obtained.
 */
void
acpi_UserNotify(const char *subsystem, ACPI_HANDLE h, uint8_t notify)
{
    ACPI_BUFFER path;
    char event[16];

    if (subsystem == NULL)
        return;

    /* Let ACPICA allocate the pathname buffer; it is released below. */
    path.Length = ACPI_ALLOCATE_BUFFER;
    path.Pointer = NULL;
    if (ACPI_FAILURE(AcpiNsHandleToPathname(h, &path, FALSE)))
        return;

    snprintf(event, sizeof(event), "notify=0x%02x", notify);
    devctl_notify("ACPI", subsystem, path.Pointer, event);
    AcpiOsFree(path.Pointer);
}
#ifdef ACPI_DEBUG
/*
* Support for parsing debug options from the kernel environment.
*
* Bits may be set in the AcpiDbgLayer and AcpiDbgLevel debug registers
* by specifying the names of the bits in the debug.acpi.layer and
* debug.acpi.level environment variables. Bits may be unset by
* prefixing the bit name with !.
*/
/*
 * Maps the textual name of a debug flag to its bit mask.  Tables of these
 * are terminated by an entry whose name is NULL.
 */
struct debugtag
{
char *name;	/* flag name as written in the environment variable */
UINT32 value;	/* bit(s) to set or clear for that name */
};
/*
 * Names accepted in debug.acpi.layer and the AcpiDbgLayer bits they stand
 * for.  Terminated by a {NULL, 0} entry.
 */
static struct debugtag dbg_layer[] = {
{"ACPI_UTILITIES", ACPI_UTILITIES},
{"ACPI_HARDWARE", ACPI_HARDWARE},
{"ACPI_EVENTS", ACPI_EVENTS},
{"ACPI_TABLES", ACPI_TABLES},
{"ACPI_NAMESPACE", ACPI_NAMESPACE},
{"ACPI_PARSER", ACPI_PARSER},
{"ACPI_DISPATCHER", ACPI_DISPATCHER},
{"ACPI_EXECUTER", ACPI_EXECUTER},
{"ACPI_RESOURCES", ACPI_RESOURCES},
{"ACPI_CA_DEBUGGER", ACPI_CA_DEBUGGER},
{"ACPI_OS_SERVICES", ACPI_OS_SERVICES},
{"ACPI_CA_DISASSEMBLER", ACPI_CA_DISASSEMBLER},
{"ACPI_ALL_COMPONENTS", ACPI_ALL_COMPONENTS},
{"ACPI_AC_ADAPTER", ACPI_AC_ADAPTER},
{"ACPI_BATTERY", ACPI_BATTERY},
{"ACPI_BUS", ACPI_BUS},
{"ACPI_BUTTON", ACPI_BUTTON},
{"ACPI_EC", ACPI_EC},
{"ACPI_FAN", ACPI_FAN},
{"ACPI_POWERRES", ACPI_POWERRES},
{"ACPI_PROCESSOR", ACPI_PROCESSOR},
{"ACPI_THERMAL", ACPI_THERMAL},
{"ACPI_TIMER", ACPI_TIMER},
{"ACPI_ALL_DRIVERS", ACPI_ALL_DRIVERS},
{NULL, 0}
};
/*
 * Names accepted in debug.acpi.level and the AcpiDbgLevel bits they stand
 * for.  Terminated by a {NULL, 0} entry.
 */
static struct debugtag dbg_level[] = {
{"ACPI_LV_INIT", ACPI_LV_INIT},
{"ACPI_LV_DEBUG_OBJECT", ACPI_LV_DEBUG_OBJECT},
{"ACPI_LV_INFO", ACPI_LV_INFO},
{"ACPI_LV_REPAIR", ACPI_LV_REPAIR},
{"ACPI_LV_ALL_EXCEPTIONS", ACPI_LV_ALL_EXCEPTIONS},
/* Trace verbosity level 1 [Standard Trace Level] */
{"ACPI_LV_INIT_NAMES", ACPI_LV_INIT_NAMES},
{"ACPI_LV_PARSE", ACPI_LV_PARSE},
{"ACPI_LV_LOAD", ACPI_LV_LOAD},
{"ACPI_LV_DISPATCH", ACPI_LV_DISPATCH},
{"ACPI_LV_EXEC", ACPI_LV_EXEC},
{"ACPI_LV_NAMES", ACPI_LV_NAMES},
{"ACPI_LV_OPREGION", ACPI_LV_OPREGION},
{"ACPI_LV_BFIELD", ACPI_LV_BFIELD},
{"ACPI_LV_TABLES", ACPI_LV_TABLES},
{"ACPI_LV_VALUES", ACPI_LV_VALUES},
{"ACPI_LV_OBJECTS", ACPI_LV_OBJECTS},
{"ACPI_LV_RESOURCES", ACPI_LV_RESOURCES},
{"ACPI_LV_USER_REQUESTS", ACPI_LV_USER_REQUESTS},
{"ACPI_LV_PACKAGE", ACPI_LV_PACKAGE},
{"ACPI_LV_VERBOSITY1", ACPI_LV_VERBOSITY1},
/* Trace verbosity level 2 [Function tracing and memory allocation] */
{"ACPI_LV_ALLOCATIONS", ACPI_LV_ALLOCATIONS},
{"ACPI_LV_FUNCTIONS", ACPI_LV_FUNCTIONS},
{"ACPI_LV_OPTIMIZATIONS", ACPI_LV_OPTIMIZATIONS},
{"ACPI_LV_VERBOSITY2", ACPI_LV_VERBOSITY2},
{"ACPI_LV_ALL", ACPI_LV_ALL},
/* Trace verbosity level 3 [Threading, I/O, and Interrupts] */
{"ACPI_LV_MUTEX", ACPI_LV_MUTEX},
{"ACPI_LV_THREADS", ACPI_LV_THREADS},
{"ACPI_LV_IO", ACPI_LV_IO},
{"ACPI_LV_INTERRUPTS", ACPI_LV_INTERRUPTS},
{"ACPI_LV_VERBOSITY3", ACPI_LV_VERBOSITY3},
/* Exceptionally verbose output -- also used in the global "DebugLevel" */
{"ACPI_LV_AML_DISASSEMBLE", ACPI_LV_AML_DISASSEMBLE},
{"ACPI_LV_VERBOSE_INFO", ACPI_LV_VERBOSE_INFO},
{"ACPI_LV_FULL_TABLES", ACPI_LV_FULL_TABLES},
{"ACPI_LV_EVENTS", ACPI_LV_EVENTS},
{"ACPI_LV_VERBOSE", ACPI_LV_VERBOSE},
{NULL, 0}
};
/*
 * Apply a whitespace-separated list of debug flag names to *flag.
 *
 * cp is the list, tag is a table terminated by a NULL name, and flag is
 * the mask to update.  Each token naming a table entry sets that entry's
 * bits; a token prefixed with '!' clears them instead.  Tokens that name
 * no entry (and a lone "!") are ignored.
 *
 * A token must match an entry's name in full.  Matching only the token's
 * length would make "ACPI_LV_INIT" also select "ACPI_LV_INIT_NAMES", and
 * "!ACPI_LV_INIT" also clear it.
 */
static void
acpi_parse_debug(char *cp, struct debugtag *tag, UINT32 *flag)
{
    char *ep;
    int i, l;
    int set;

    while (*cp) {
        if (isspace(*cp)) {
            cp++;
            continue;
        }

        /* Find the end of the current token. */
        ep = cp;
        while (*ep && !isspace(*ep))
            ep++;

        if (*cp == '!') {
            set = 0;
            cp++;
            if (cp == ep)
                continue;
        } else {
            set = 1;
        }

        l = ep - cp;
        for (i = 0; tag[i].name != NULL; i++) {
            /*
             * The token holds no NUL, so a successful strncmp() proves the
             * name has at least l characters and name[l] is readable.
             */
            if (strncmp(cp, tag[i].name, l) != 0 || tag[i].name[l] != '\0')
                continue;
            if (set)
                *flag |= tag[i].value;
            else
                *flag &= ~tag[i].value;
        }
        cp = ep;
    }
}
/*
 * Load AcpiDbgLayer and AcpiDbgLevel from the debug.acpi.layer and
 * debug.acpi.level kernel environment variables.
 *
 * While the system is still cold both masks are zeroed first.  When
 * neither variable is set nothing else happens; otherwise each variable
 * that is set is echoed to the console (unless it is exactly "NONE") and
 * parsed into its mask.  junk is unused.
 */
static void
acpi_set_debugging(void *junk)
{
    char *layer_env, *level_env;

    if (cold) {
        AcpiDbgLayer = 0;
        AcpiDbgLevel = 0;
    }

    layer_env = kern_getenv("debug.acpi.layer");
    level_env = kern_getenv("debug.acpi.level");
    if (layer_env == NULL && level_env == NULL)
        return;

    printf("ACPI set debug");
    if (layer_env != NULL) {
        if (strcmp("NONE", layer_env) != 0)
            printf(" layer '%s'", layer_env);
        acpi_parse_debug(layer_env, dbg_layer, &AcpiDbgLayer);
        freeenv(layer_env);
    }
    if (level_env != NULL) {
        if (strcmp("NONE", level_env) != 0)
            printf(" level '%s'", level_env);
        acpi_parse_debug(level_env, dbg_level, &AcpiDbgLevel);
        freeenv(level_env);
    }
    printf("\n");
}
SYSINIT(acpi_debugging, SI_SUB_TUNABLES, SI_ORDER_ANY, acpi_set_debugging,
    NULL);
static int
acpi_debug_sysctl(SYSCTL_HANDLER_ARGS)
{
int error, *dbg;
struct debugtag *tag;
struct sbuf sb;
char temp[128];
if (sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND) == NULL)
return (ENOMEM);
if (strcmp(oidp->oid_arg1, "debug.acpi.layer") == 0) {
tag = &dbg_layer[0];
dbg = &AcpiDbgLayer;
} else {
tag = &dbg_level[0];
dbg = &AcpiDbgLevel;
}
/* Get old values if this is a get request. */
ACPI_SERIAL_BEGIN(acpi);
if (*dbg == 0) {
sbuf_cpy(&sb, "NONE");
} else if (req->newptr == NULL) {
for (; tag->name != NULL; tag++) {
if ((*dbg & tag->value) == tag->value)
sbuf_printf(&sb, "%s ", tag->name);
}
}
sbuf_trim(&sb);
sbuf_finish(&sb);
strlcpy(temp, sbuf_data(&sb), sizeof(temp));
sbuf_delete(&sb);
error = sysctl_handle_string(oidp, temp, sizeof(temp), req);
/* Check for error or no change */
if (error == 0 && req->newptr != NULL) {
*dbg = 0;
kern_setenv((char *)oidp->oid_arg1, temp);
acpi_set_debugging(NULL);
}
ACPI_SERIAL_END(acpi);
return (error);
}
SYSCTL_PROC(_debug_acpi, OID_AUTO, layer, CTLFLAG_RW | CTLTYPE_STRING,
"debug.acpi.layer", 0, acpi_debug_sysctl, "A", "");
SYSCTL_PROC(_debug_acpi, OID_AUTO, level, CTLFLAG_RW | CTLTYPE_STRING,
"debug.acpi.level", 0, acpi_debug_sysctl, "A", "");
#endif /* ACPI_DEBUG */
/*
 * Sysctl handler for acpi_debug_objects.
 *
 * After a successful write, AcpiGbl_EnableAmlDebugObject is updated (under
 * the acpi serial lock) only when the value changed between zero and
 * non-zero; a change from one non-zero value to another leaves it alone.
 */
static int
acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS)
{
    int error, previous;

    previous = acpi_debug_objects;
    error = sysctl_handle_int(oidp, &acpi_debug_objects, 0, req);
    if (error != 0 || req->newptr == NULL)
        return (error);

    /* Only an on<->off transition needs to reach ACPICA. */
    if ((previous != 0) != (acpi_debug_objects != 0)) {
        ACPI_SERIAL_BEGIN(acpi);
        AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE;
        ACPI_SERIAL_END(acpi);
    }
    return (0);
}
/*
 * Split a comma-separated list of interface names into iface.
 *
 * Leading whitespace and commas in str are skipped, the remainder is
 * duplicated, and every comma in the copy is turned into a NUL.  Each name
 * starts at the first character that is neither whitespace nor NUL and
 * runs to the next NUL.  On success iface->data is an array of pointers
 * into the copy, iface->num is the number of names, and that count is
 * returned.  When no name is found, 0 is returned and iface is left
 * untouched.
 */
static int
acpi_parse_interfaces(char *str, struct acpi_interface *iface)
{
    char *copy;
    size_t len, pos;
    int count;

    while (isspace(*str) || *str == ',')
        str++;
    len = strlen(str);
    if (len == 0)
        return (0);

    /* Work on a private copy with the separators turned into NULs. */
    copy = strdup(str, M_TEMP);
    for (pos = 0; pos < len; pos++) {
        if (copy[pos] == ',')
            copy[pos] = '\0';
    }

    /* First pass: count the names. */
    count = 0;
    for (pos = 0; pos < len; ) {
        if (copy[pos] == '\0' || isspace(copy[pos])) {
            pos++;
            continue;
        }
        pos += strlen(copy + pos) + 1;
        count++;
    }
    if (count == 0) {
        free(copy, M_TEMP);
        return (0);
    }

    iface->data = malloc(sizeof(*iface->data) * count, M_TEMP, M_WAITOK);
    iface->num = count;

    /* Second pass: record where each name starts. */
    count = 0;
    for (pos = 0; pos < len; ) {
        if (copy[pos] == '\0' || isspace(copy[pos])) {
            pos++;
            continue;
        }
        iface->data[count] = copy + pos;
        pos += strlen(copy + pos) + 1;
        count++;
    }
    return (count);
}
/*
 * Release what acpi_parse_interfaces() allocated.  Only valid after that
 * function returned a count greater than zero, since otherwise iface is
 * not filled in.
 */
static void
acpi_free_interfaces(struct acpi_interface *iface)
{
/*
 * data[0] is the start of the duplicated string that every entry points
 * into, so this frees all the names at once.
 */
free(iface->data[0], M_TEMP);
/* Then free the pointer array itself. */
free(iface->data, M_TEMP);
}
/*
 * Apply the configured _OSI interface changes: install every name listed
 * in acpi_install_interface, then remove every name listed in
 * acpi_remove_interface.  Failures are always reported on dev; successes
 * are reported only when booting verbosely.
 */
static void
acpi_reset_interfaces(device_t dev)
{
    struct acpi_interface ifaces;
    ACPI_STATUS rc;
    int n;

    if (acpi_parse_interfaces(acpi_install_interface, &ifaces) > 0) {
        for (n = 0; n < ifaces.num; n++) {
            rc = AcpiInstallInterface(ifaces.data[n]);
            if (ACPI_FAILURE(rc)) {
                device_printf(dev,
                    "failed to install _OSI(\"%s\"): %s\n",
                    ifaces.data[n], AcpiFormatException(rc));
                continue;
            }
            if (bootverbose)
                device_printf(dev, "installed _OSI(\"%s\")\n",
                    ifaces.data[n]);
        }
        acpi_free_interfaces(&ifaces);
    }

    if (acpi_parse_interfaces(acpi_remove_interface, &ifaces) > 0) {
        for (n = 0; n < ifaces.num; n++) {
            rc = AcpiRemoveInterface(ifaces.data[n]);
            if (ACPI_FAILURE(rc)) {
                device_printf(dev,
                    "failed to remove _OSI(\"%s\"): %s\n",
                    ifaces.data[n], AcpiFormatException(rc));
                continue;
            }
            if (bootverbose)
                device_printf(dev, "removed _OSI(\"%s\")\n",
                    ifaces.data[n]);
        }
        acpi_free_interfaces(&ifaces);
    }
}
/*
 * Power-management callback registered with power_pm_register().
 *
 * Only POWER_CMD_SUSPEND is handled.  arg must be the acpi_softc and the
 * single variadic argument is an int POWER_SLEEP_STATE_* value, mapped to
 * the softc's standby/suspend state or to S4 for hibernate.
 *
 * Returns 0 on success, EINVAL for an unknown command, a NULL softc or an
 * unknown sleep type, and ENXIO when acpi_EnterSleepState() fails.
 */
static int
acpi_pm_func(u_long cmd, void *arg, ...)
{
    struct acpi_softc *softc;
    va_list args;
    int sleep_type, sx;

    if (cmd != POWER_CMD_SUSPEND)
        return (EINVAL);

    softc = (struct acpi_softc *)arg;
    if (softc == NULL)
        return (EINVAL);

    va_start(args, arg);
    sleep_type = va_arg(args, int);
    va_end(args);

    switch (sleep_type) {
    case POWER_SLEEP_STATE_STANDBY:
        sx = softc->acpi_standby_sx;
        break;
    case POWER_SLEEP_STATE_SUSPEND:
        sx = softc->acpi_suspend_sx;
        break;
    case POWER_SLEEP_STATE_HIBERNATE:
        sx = ACPI_STATE_S4;
        break;
    default:
        return (EINVAL);
    }

    if (ACPI_FAILURE(acpi_EnterSleepState(softc, sx)))
        return (ENXIO);
    return (0);
}
/*
 * SYSINIT hook: register acpi_pm_func as the ACPI power-management
 * handler, with a NULL argument.  Skipped when the system is no longer
 * cold or when the "acpi" resource (unit 0) is disabled.  arg is unused.
 */
static void
acpi_pm_register(void *arg)
{
    if (cold && !resource_disabled("acpi", 0))
        power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, NULL);
}
SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, acpi_pm_register, NULL);
Index: head/sys/dev/drm2/drm_fb_helper.c
===================================================================
--- head/sys/dev/drm2/drm_fb_helper.c (revision 356654)
+++ head/sys/dev/drm2/drm_fb_helper.c (revision 356655)
@@ -1,1490 +1,1490 @@
/*
* Copyright (c) 2006-2009 Red Hat Inc.
* Copyright (c) 2006-2008 Intel Corporation
* Copyright (c) 2007 Dave Airlie
*
* DRM framebuffer helper functions
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting documentation, and
* that the name of the copyright holders not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. The copyright holders make no representations
* about the suitability of this software for any purpose. It is provided "as
* is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
* EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THIS SOFTWARE.
*
* Authors:
* Dave Airlie
* Jesse Barnes
*/
#include
__FBSDID("$FreeBSD$");
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include
#include