diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c
index 6e8e12e42f75..55075828a224 100644
--- a/sys/dev/acpica/acpi.c
+++ b/sys/dev/acpica/acpi.c
@@ -1,4369 +1,4372 @@
 /*-
  * Copyright (c) 2000 Takanori Watanabe <takawata@jp.freebsd.org>
  * Copyright (c) 2000 Mitsuru IWASAKI <iwasaki@jp.freebsd.org>
  * Copyright (c) 2000, 2001 Michael Smith
  * Copyright (c) 2000 BSDi
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/reboot.h>
 #include <sys/sysctl.h>
 #include <sys/ctype.h>
 #include <sys/linker.h>
+#include <sys/mount.h>
 #include <sys/power.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/timetc.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/clock.h>
 #include <machine/pci_cfgreg.h>
 #endif
 #include <machine/resource.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <isa/isavar.h>
 #include <isa/pnpvar.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
 #include <contrib/dev/acpica/include/acnamesp.h>
 
 #include <dev/acpica/acpivar.h>
 #include <dev/acpica/acpiio.h>
 
 #include <dev/pci/pcivar.h>
 
 #include <vm/vm_param.h>
 
 static MALLOC_DEFINE(M_ACPIDEV, "acpidev", "ACPI devices");
 
 /* Hooks for the ACPI CA debugging infrastructure */
 #define _COMPONENT	ACPI_BUS
 ACPI_MODULE_NAME("ACPI")
 
 static d_open_t		acpiopen;
 static d_close_t	acpiclose;
 static d_ioctl_t	acpiioctl;
 
 static struct cdevsw acpi_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	acpiopen,
 	.d_close =	acpiclose,
 	.d_ioctl =	acpiioctl,
 	.d_name =	"acpi",
 };
 
 struct acpi_interface {
 	ACPI_STRING	*data;
 	int		num;
 };
 
 static char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL };
 static char *pcilink_ids[] = { "PNP0C0F", NULL };
 
 /* Global mutex for locking access to the ACPI subsystem. */
 struct mtx	acpi_mutex;
 struct callout	acpi_sleep_timer;
 
 /* Bitmap of device quirks. */
 int		acpi_quirks;
 
 /* Supported sleep states. */
 static BOOLEAN	acpi_sleep_states[ACPI_S_STATE_COUNT];
 
 static void	acpi_lookup(void *arg, const char *name, device_t *dev);
 static int	acpi_modevent(struct module *mod, int event, void *junk);
 static int	acpi_probe(device_t dev);
 static int	acpi_attach(device_t dev);
 static int	acpi_suspend(device_t dev);
 static int	acpi_resume(device_t dev);
 static int	acpi_shutdown(device_t dev);
 static device_t	acpi_add_child(device_t bus, u_int order, const char *name,
 			int unit);
 static int	acpi_print_child(device_t bus, device_t child);
 static void	acpi_probe_nomatch(device_t bus, device_t child);
 static void	acpi_driver_added(device_t dev, driver_t *driver);
 static void	acpi_child_deleted(device_t dev, device_t child);
 static int	acpi_read_ivar(device_t dev, device_t child, int index,
 			uintptr_t *result);
 static int	acpi_write_ivar(device_t dev, device_t child, int index,
 			uintptr_t value);
 static struct resource_list *acpi_get_rlist(device_t dev, device_t child);
 static void	acpi_reserve_resources(device_t dev);
 static int	acpi_sysres_alloc(device_t dev);
 static int	acpi_set_resource(device_t dev, device_t child, int type,
 			int rid, rman_res_t start, rman_res_t count);
 static struct resource *acpi_alloc_resource(device_t bus, device_t child,
 			int type, int *rid, rman_res_t start, rman_res_t end,
 			rman_res_t count, u_int flags);
 static int	acpi_adjust_resource(device_t bus, device_t child, int type,
 			struct resource *r, rman_res_t start, rman_res_t end);
 static int	acpi_release_resource(device_t bus, device_t child, int type,
 			int rid, struct resource *r);
 static void	acpi_delete_resource(device_t bus, device_t child, int type,
 		    int rid);
 static uint32_t	acpi_isa_get_logicalid(device_t dev);
 static int	acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count);
 static int	acpi_device_id_probe(device_t bus, device_t dev, char **ids, char **match);
 static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev,
 		    ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters,
 		    ACPI_BUFFER *ret);
 static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level,
 		    void *context, void **retval);
 static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev,
 		    int max_depth, acpi_scan_cb_t user_fn, void *arg);
 static int	acpi_isa_pnp_probe(device_t bus, device_t child,
 		    struct isa_pnp_id *ids);
 static void	acpi_platform_osc(device_t dev);
 static void	acpi_probe_children(device_t bus);
 static void	acpi_probe_order(ACPI_HANDLE handle, int *order);
 static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level,
 		    void *context, void **status);
 static void	acpi_sleep_enable(void *arg);
 static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc);
 static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state);
 static void	acpi_shutdown_final(void *arg, int howto);
 static void	acpi_enable_fixed_events(struct acpi_softc *sc);
 static void	acpi_resync_clock(struct acpi_softc *sc);
 static int	acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate);
 static int	acpi_wake_run_prep(ACPI_HANDLE handle, int sstate);
 static int	acpi_wake_prep_walk(int sstate);
 static int	acpi_wake_sysctl_walk(device_t dev);
 static int	acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS);
 static void	acpi_system_eventhandler_sleep(void *arg, int state);
 static void	acpi_system_eventhandler_wakeup(void *arg, int state);
 static int	acpi_sname2sstate(const char *sname);
 static const char *acpi_sstate2sname(int sstate);
 static int	acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_pm_func(u_long cmd, void *arg, ...);
 static int	acpi_child_location_str_method(device_t acdev, device_t child,
 					       char *buf, size_t buflen);
 static int	acpi_child_pnpinfo_str_method(device_t acdev, device_t child,
 					      char *buf, size_t buflen);
 static void	acpi_enable_pcie(void);
 static void	acpi_hint_device_unit(device_t acdev, device_t child,
 		    const char *name, int *unitp);
 static void	acpi_reset_interfaces(device_t dev);
 
 static device_method_t acpi_methods[] = {
     /* Device interface */
     DEVMETHOD(device_probe,		acpi_probe),
     DEVMETHOD(device_attach,		acpi_attach),
     DEVMETHOD(device_shutdown,		acpi_shutdown),
     DEVMETHOD(device_detach,		bus_generic_detach),
     DEVMETHOD(device_suspend,		acpi_suspend),
     DEVMETHOD(device_resume,		acpi_resume),
 
     /* Bus interface */
     DEVMETHOD(bus_add_child,		acpi_add_child),
     DEVMETHOD(bus_print_child,		acpi_print_child),
     DEVMETHOD(bus_probe_nomatch,	acpi_probe_nomatch),
     DEVMETHOD(bus_driver_added,		acpi_driver_added),
     DEVMETHOD(bus_child_deleted,	acpi_child_deleted),
     DEVMETHOD(bus_read_ivar,		acpi_read_ivar),
     DEVMETHOD(bus_write_ivar,		acpi_write_ivar),
     DEVMETHOD(bus_get_resource_list,	acpi_get_rlist),
     DEVMETHOD(bus_set_resource,		acpi_set_resource),
     DEVMETHOD(bus_get_resource,		bus_generic_rl_get_resource),
     DEVMETHOD(bus_alloc_resource,	acpi_alloc_resource),
     DEVMETHOD(bus_adjust_resource,	acpi_adjust_resource),
     DEVMETHOD(bus_release_resource,	acpi_release_resource),
     DEVMETHOD(bus_delete_resource,	acpi_delete_resource),
     DEVMETHOD(bus_child_pnpinfo_str,	acpi_child_pnpinfo_str_method),
     DEVMETHOD(bus_child_location_str,	acpi_child_location_str_method),
     DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
     DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),
     DEVMETHOD(bus_setup_intr,		bus_generic_setup_intr),
     DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),
     DEVMETHOD(bus_hint_device_unit,	acpi_hint_device_unit),
     DEVMETHOD(bus_get_cpus,		acpi_get_cpus),
     DEVMETHOD(bus_get_domain,		acpi_get_domain),
 
     /* ACPI bus */
     DEVMETHOD(acpi_id_probe,		acpi_device_id_probe),
     DEVMETHOD(acpi_evaluate_object,	acpi_device_eval_obj),
     DEVMETHOD(acpi_pwr_for_sleep,	acpi_device_pwr_for_sleep),
     DEVMETHOD(acpi_scan_children,	acpi_device_scan_children),
 
     /* ISA emulation */
     DEVMETHOD(isa_pnp_probe,		acpi_isa_pnp_probe),
 
     DEVMETHOD_END
 };
 
 static driver_t acpi_driver = {
     "acpi",
     acpi_methods,
     sizeof(struct acpi_softc),
 };
 
 static devclass_t acpi_devclass;
 EARLY_DRIVER_MODULE(acpi, nexus, acpi_driver, acpi_devclass, acpi_modevent, 0,
     BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
 MODULE_VERSION(acpi, 1);
 
 ACPI_SERIAL_DECL(acpi, "ACPI root bus");
 
 /* Local pools for managing system resources for ACPI child devices. */
 static struct rman acpi_rman_io, acpi_rman_mem;
 
 #define ACPI_MINIMUM_AWAKETIME	5
 
 /* Holds the description of the acpi0 device. */
 static char acpi_desc[ACPI_OEM_ID_SIZE + ACPI_OEM_TABLE_ID_SIZE + 2];
 
 SYSCTL_NODE(_debug, OID_AUTO, acpi, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "ACPI debugging");
 static char acpi_ca_version[12];
 SYSCTL_STRING(_debug_acpi, OID_AUTO, acpi_ca_version, CTLFLAG_RD,
 	      acpi_ca_version, 0, "Version of Intel ACPI-CA");
 
 /*
  * Allow overriding _OSI methods.
  */
 static char acpi_install_interface[256];
 TUNABLE_STR("hw.acpi.install_interface", acpi_install_interface,
     sizeof(acpi_install_interface));
 static char acpi_remove_interface[256];
 TUNABLE_STR("hw.acpi.remove_interface", acpi_remove_interface,
     sizeof(acpi_remove_interface));
 
 /* Allow users to dump Debug objects without ACPI debugger. */
 static int acpi_debug_objects;
 TUNABLE_INT("debug.acpi.enable_debug_objects", &acpi_debug_objects);
 SYSCTL_PROC(_debug_acpi, OID_AUTO, enable_debug_objects,
     CTLFLAG_RW | CTLTYPE_INT | CTLFLAG_NEEDGIANT, NULL, 0,
     acpi_debug_objects_sysctl, "I",
     "Enable Debug objects");
 
 /* Allow the interpreter to ignore common mistakes in BIOS. */
 static int acpi_interpreter_slack = 1;
 TUNABLE_INT("debug.acpi.interpreter_slack", &acpi_interpreter_slack);
 SYSCTL_INT(_debug_acpi, OID_AUTO, interpreter_slack, CTLFLAG_RDTUN,
     &acpi_interpreter_slack, 1, "Turn on interpreter slack mode.");
 
 /* Ignore register widths set by FADT and use default widths instead. */
 static int acpi_ignore_reg_width = 1;
 TUNABLE_INT("debug.acpi.default_register_width", &acpi_ignore_reg_width);
 SYSCTL_INT(_debug_acpi, OID_AUTO, default_register_width, CTLFLAG_RDTUN,
     &acpi_ignore_reg_width, 1, "Ignore register widths set by FADT");
 
 /* Allow users to override quirks. */
 TUNABLE_INT("debug.acpi.quirks", &acpi_quirks);
 
 int acpi_susp_bounce;
 SYSCTL_INT(_debug_acpi, OID_AUTO, suspend_bounce, CTLFLAG_RW,
     &acpi_susp_bounce, 0, "Don't actually suspend, just test devices.");
 
 /*
  * ACPI can only be loaded as a module by the loader; activating it after
  * system bootstrap time is not useful, and can be fatal to the system.
  * It also cannot be unloaded, since the entire system bus hierarchy hangs
  * off it.
  */
 static int
 acpi_modevent(struct module *mod, int event, void *junk)
 {
     switch (event) {
     case MOD_LOAD:
 	if (!cold) {
 	    printf("The ACPI driver cannot be loaded after boot.\n");
 	    return (EPERM);
 	}
 	break;
     case MOD_UNLOAD:
 	if (!cold && power_pm_get_type() == POWER_PM_TYPE_ACPI)
 	    return (EBUSY);
 	break;
     default:
 	break;
     }
     return (0);
 }
 
 /*
  * Perform early initialization.
  */
 ACPI_STATUS
 acpi_Startup(void)
 {
     static int started = 0;
     ACPI_STATUS status;
     int val;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     /* Only run the startup code once.  The MADT driver also calls this. */
     if (started)
 	return_VALUE (AE_OK);
     started = 1;
 
     /*
      * Initialize the ACPICA subsystem.
      */
     if (ACPI_FAILURE(status = AcpiInitializeSubsystem())) {
 	printf("ACPI: Could not initialize Subsystem: %s\n",
 	    AcpiFormatException(status));
 	return_VALUE (status);
     }
 
     /*
      * Pre-allocate space for RSDT/XSDT and DSDT tables and allow resizing
      * if more tables exist.
      */
     if (ACPI_FAILURE(status = AcpiInitializeTables(NULL, 2, TRUE))) {
 	printf("ACPI: Table initialisation failed: %s\n",
 	    AcpiFormatException(status));
 	return_VALUE (status);
     }
 
     /* Set up any quirks we have for this system. */
     if (acpi_quirks == ACPI_Q_OK)
 	acpi_table_quirks(&acpi_quirks);
 
     /* If the user manually set the disabled hint to 0, force-enable ACPI. */
     if (resource_int_value("acpi", 0, "disabled", &val) == 0 && val == 0)
 	acpi_quirks &= ~ACPI_Q_BROKEN;
     if (acpi_quirks & ACPI_Q_BROKEN) {
 	printf("ACPI disabled by blacklist.  Contact your BIOS vendor.\n");
 	status = AE_SUPPORT;
     }
 
     return_VALUE (status);
 }
 
 /*
  * Detect ACPI and perform early initialisation.
  */
 int
 acpi_identify(void)
 {
     ACPI_TABLE_RSDP	*rsdp;
     ACPI_TABLE_HEADER	*rsdt;
     ACPI_PHYSICAL_ADDRESS paddr;
     struct sbuf		sb;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (!cold)
 	return (ENXIO);
 
     /* Check that we haven't been disabled with a hint. */
     if (resource_disabled("acpi", 0))
 	return (ENXIO);
 
     /* Check for other PM systems. */
     if (power_pm_get_type() != POWER_PM_TYPE_NONE &&
 	power_pm_get_type() != POWER_PM_TYPE_ACPI) {
 	printf("ACPI identify failed, other PM system enabled.\n");
 	return (ENXIO);
     }
 
     /* Initialize root tables. */
     if (ACPI_FAILURE(acpi_Startup())) {
 	printf("ACPI: Try disabling either ACPI or apic support.\n");
 	return (ENXIO);
     }
 
     if ((paddr = AcpiOsGetRootPointer()) == 0 ||
 	(rsdp = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_RSDP))) == NULL)
 	return (ENXIO);
     if (rsdp->Revision > 1 && rsdp->XsdtPhysicalAddress != 0)
 	paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->XsdtPhysicalAddress;
     else
 	paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->RsdtPhysicalAddress;
     AcpiOsUnmapMemory(rsdp, sizeof(ACPI_TABLE_RSDP));
 
     if ((rsdt = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_HEADER))) == NULL)
 	return (ENXIO);
     sbuf_new(&sb, acpi_desc, sizeof(acpi_desc), SBUF_FIXEDLEN);
     sbuf_bcat(&sb, rsdt->OemId, ACPI_OEM_ID_SIZE);
     sbuf_trim(&sb);
     sbuf_putc(&sb, ' ');
     sbuf_bcat(&sb, rsdt->OemTableId, ACPI_OEM_TABLE_ID_SIZE);
     sbuf_trim(&sb);
     sbuf_finish(&sb);
     sbuf_delete(&sb);
     AcpiOsUnmapMemory(rsdt, sizeof(ACPI_TABLE_HEADER));
 
     snprintf(acpi_ca_version, sizeof(acpi_ca_version), "%x", ACPI_CA_VERSION);
 
     return (0);
 }
 
 /*
  * Fetch some descriptive data from ACPI to put in our attach message.
  */
 static int
 acpi_probe(device_t dev)
 {
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     device_set_desc(dev, acpi_desc);
 
     return_VALUE (BUS_PROBE_NOWILDCARD);
 }
 
 static int
 acpi_attach(device_t dev)
 {
     struct acpi_softc	*sc;
     ACPI_STATUS		status;
     int			error, state;
     UINT32		flags;
     UINT8		TypeA, TypeB;
     char		*env;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     sc = device_get_softc(dev);
     sc->acpi_dev = dev;
     callout_init(&sc->susp_force_to, 1);
 
     error = ENXIO;
 
     /* Initialize resource manager. */
     acpi_rman_io.rm_type = RMAN_ARRAY;
     acpi_rman_io.rm_start = 0;
     acpi_rman_io.rm_end = 0xffff;
     acpi_rman_io.rm_descr = "ACPI I/O ports";
     if (rman_init(&acpi_rman_io) != 0)
 	panic("acpi rman_init IO ports failed");
     acpi_rman_mem.rm_type = RMAN_ARRAY;
     acpi_rman_mem.rm_descr = "ACPI I/O memory addresses";
     if (rman_init(&acpi_rman_mem) != 0)
 	panic("acpi rman_init memory failed");
 
     /* Initialise the ACPI mutex */
     mtx_init(&acpi_mutex, "ACPI global lock", NULL, MTX_DEF);
 
     /*
      * Set the globals from our tunables.  This is needed because ACPI-CA
      * uses UINT8 for some values and we have no tunable_byte.
      */
     AcpiGbl_EnableInterpreterSlack = acpi_interpreter_slack ? TRUE : FALSE;
     AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE;
     AcpiGbl_UseDefaultRegisterWidths = acpi_ignore_reg_width ? TRUE : FALSE;
 
 #ifndef ACPI_DEBUG
     /*
      * Disable all debugging layers and levels.
      */
     AcpiDbgLayer = 0;
     AcpiDbgLevel = 0;
 #endif
 
     /* Override OS interfaces if the user requested. */
     acpi_reset_interfaces(dev);
 
     /* Load ACPI name space. */
     status = AcpiLoadTables();
     if (ACPI_FAILURE(status)) {
 	device_printf(dev, "Could not load Namespace: %s\n",
 		      AcpiFormatException(status));
 	goto out;
     }
 
     /* Handle MCFG table if present. */
     acpi_enable_pcie();
 
     /*
      * Note that some systems (specifically, those with namespace evaluation
      * issues that require the avoidance of parts of the namespace) must
      * avoid running _INI and _STA on everything, as well as dodging the final
      * object init pass.
      *
      * For these devices, we set ACPI_NO_DEVICE_INIT and ACPI_NO_OBJECT_INIT).
      *
      * XXX We should arrange for the object init pass after we have attached
      *     all our child devices, but on many systems it works here.
      */
     flags = 0;
     if (testenv("debug.acpi.avoid"))
 	flags = ACPI_NO_DEVICE_INIT | ACPI_NO_OBJECT_INIT;
 
     /* Bring the hardware and basic handlers online. */
     if (ACPI_FAILURE(status = AcpiEnableSubsystem(flags))) {
 	device_printf(dev, "Could not enable ACPI: %s\n",
 		      AcpiFormatException(status));
 	goto out;
     }
 
     /*
      * Call the ECDT probe function to provide EC functionality before
      * the namespace has been evaluated.
      *
      * XXX This happens before the sysresource devices have been probed and
      * attached so its resources come from nexus0.  In practice, this isn't
      * a problem but should be addressed eventually.
      */
     acpi_ec_ecdt_probe(dev);
 
     /* Bring device objects and regions online. */
     if (ACPI_FAILURE(status = AcpiInitializeObjects(flags))) {
 	device_printf(dev, "Could not initialize ACPI objects: %s\n",
 		      AcpiFormatException(status));
 	goto out;
     }
 
     /*
      * Setup our sysctl tree.
      *
      * XXX: This doesn't check to make sure that none of these fail.
      */
     sysctl_ctx_init(&sc->acpi_sysctl_ctx);
     sc->acpi_sysctl_tree = SYSCTL_ADD_NODE(&sc->acpi_sysctl_ctx,
         SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, device_get_name(dev),
 	CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "supported_sleep_state",
 	CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 	0, 0, acpi_supported_sleep_state_sysctl, "A",
 	"List supported ACPI sleep states.");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "power_button_state",
 	CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	&sc->acpi_power_button_sx, 0, acpi_sleep_state_sysctl, "A",
 	"Power button ACPI sleep state.");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "sleep_button_state",
 	CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	&sc->acpi_sleep_button_sx, 0, acpi_sleep_state_sysctl, "A",
 	"Sleep button ACPI sleep state.");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "lid_switch_state",
 	CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	&sc->acpi_lid_switch_sx, 0, acpi_sleep_state_sysctl, "A",
 	"Lid ACPI sleep state. Set to S3 if you want to suspend your laptop when close the Lid.");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "standby_state",
 	CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	&sc->acpi_standby_sx, 0, acpi_sleep_state_sysctl, "A", "");
     SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "suspend_state",
 	CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	&sc->acpi_suspend_sx, 0, acpi_sleep_state_sysctl, "A", "");
     SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "sleep_delay", CTLFLAG_RW, &sc->acpi_sleep_delay, 0,
 	"sleep delay in seconds");
     SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "s4bios", CTLFLAG_RW, &sc->acpi_s4bios, 0, "S4BIOS mode");
     SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "verbose", CTLFLAG_RW, &sc->acpi_verbose, 0, "verbose mode");
     SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "disable_on_reboot", CTLFLAG_RW,
 	&sc->acpi_do_disable, 0, "Disable ACPI when rebooting/halting system");
     SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree),
 	OID_AUTO, "handle_reboot", CTLFLAG_RW,
 	&sc->acpi_handle_reboot, 0, "Use ACPI Reset Register to reboot");
 
     /*
      * Default to 1 second before sleeping to give some machines time to
      * stabilize.
      */
     sc->acpi_sleep_delay = 1;
     if (bootverbose)
 	sc->acpi_verbose = 1;
     if ((env = kern_getenv("hw.acpi.verbose")) != NULL) {
 	if (strcmp(env, "0") != 0)
 	    sc->acpi_verbose = 1;
 	freeenv(env);
     }
 
     /* Only enable reboot by default if the FADT says it is available. */
     if (AcpiGbl_FADT.Flags & ACPI_FADT_RESET_REGISTER)
 	sc->acpi_handle_reboot = 1;
 
 #if !ACPI_REDUCED_HARDWARE
     /* Only enable S4BIOS by default if the FACS says it is available. */
     if (AcpiGbl_FACS != NULL && AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT)
 	sc->acpi_s4bios = 1;
 #endif
 
     /* Probe all supported sleep states. */
     acpi_sleep_states[ACPI_STATE_S0] = TRUE;
     for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++)
 	if (ACPI_SUCCESS(AcpiEvaluateObject(ACPI_ROOT_OBJECT,
 	    __DECONST(char *, AcpiGbl_SleepStateNames[state]), NULL, NULL)) &&
 	    ACPI_SUCCESS(AcpiGetSleepTypeData(state, &TypeA, &TypeB)))
 	    acpi_sleep_states[state] = TRUE;
 
     /*
      * Dispatch the default sleep state to devices.  The lid switch is set
      * to UNKNOWN by default to avoid surprising users.
      */
     sc->acpi_power_button_sx = acpi_sleep_states[ACPI_STATE_S5] ?
 	ACPI_STATE_S5 : ACPI_STATE_UNKNOWN;
     sc->acpi_lid_switch_sx = ACPI_STATE_UNKNOWN;
     sc->acpi_standby_sx = acpi_sleep_states[ACPI_STATE_S1] ?
 	ACPI_STATE_S1 : ACPI_STATE_UNKNOWN;
     sc->acpi_suspend_sx = acpi_sleep_states[ACPI_STATE_S3] ?
 	ACPI_STATE_S3 : ACPI_STATE_UNKNOWN;
 
     /* Pick the first valid sleep state for the sleep button default. */
     sc->acpi_sleep_button_sx = ACPI_STATE_UNKNOWN;
     for (state = ACPI_STATE_S1; state <= ACPI_STATE_S4; state++)
 	if (acpi_sleep_states[state]) {
 	    sc->acpi_sleep_button_sx = state;
 	    break;
 	}
 
     acpi_enable_fixed_events(sc);
 
     /*
      * Scan the namespace and attach/initialise children.
      */
 
     /* Register our shutdown handler. */
     EVENTHANDLER_REGISTER(shutdown_final, acpi_shutdown_final, sc,
 	SHUTDOWN_PRI_LAST);
 
     /*
      * Register our acpi event handlers.
      * XXX should be configurable eg. via userland policy manager.
      */
     EVENTHANDLER_REGISTER(acpi_sleep_event, acpi_system_eventhandler_sleep,
 	sc, ACPI_EVENT_PRI_LAST);
     EVENTHANDLER_REGISTER(acpi_wakeup_event, acpi_system_eventhandler_wakeup,
 	sc, ACPI_EVENT_PRI_LAST);
 
     /* Flag our initial states. */
     sc->acpi_enabled = TRUE;
     sc->acpi_sstate = ACPI_STATE_S0;
     sc->acpi_sleep_disabled = TRUE;
 
     /* Create the control device */
     sc->acpi_dev_t = make_dev(&acpi_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0664,
 			      "acpi");
     sc->acpi_dev_t->si_drv1 = sc;
 
     if ((error = acpi_machdep_init(dev)))
 	goto out;
 
     /* Register ACPI again to pass the correct argument of pm_func. */
     power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, sc);
 
     acpi_platform_osc(dev);
 
     if (!acpi_disabled("bus")) {
 	EVENTHANDLER_REGISTER(dev_lookup, acpi_lookup, NULL, 1000);
 	acpi_probe_children(dev);
     }
 
     /* Update all GPEs and enable runtime GPEs. */
     status = AcpiUpdateAllGpes();
     if (ACPI_FAILURE(status))
 	device_printf(dev, "Could not update all GPEs: %s\n",
 	    AcpiFormatException(status));
 
     /* Allow sleep request after a while. */
     callout_init_mtx(&acpi_sleep_timer, &acpi_mutex, 0);
     callout_reset(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME,
 	acpi_sleep_enable, sc);
 
     error = 0;
 
  out:
     return_VALUE (error);
 }
 
 static void
 acpi_set_power_children(device_t dev, int state)
 {
 	device_t child;
 	device_t *devlist;
 	int dstate, i, numdevs;
 
 	if (device_get_children(dev, &devlist, &numdevs) != 0)
 		return;
 
 	/*
 	 * Retrieve and set D-state for the sleep state if _SxD is present.
 	 * Skip children who aren't attached since they are handled separately.
 	 */
 	for (i = 0; i < numdevs; i++) {
 		child = devlist[i];
 		dstate = state;
 		if (device_is_attached(child) &&
 		    acpi_device_pwr_for_sleep(dev, child, &dstate) == 0)
 			acpi_set_powerstate(child, dstate);
 	}
 	free(devlist, M_TEMP);
 }
 
 static int
 acpi_suspend(device_t dev)
 {
     int error;
 
     GIANT_REQUIRED;
 
     error = bus_generic_suspend(dev);
     if (error == 0)
 	acpi_set_power_children(dev, ACPI_STATE_D3);
 
     return (error);
 }
 
 static int
 acpi_resume(device_t dev)
 {
 
     GIANT_REQUIRED;
 
     acpi_set_power_children(dev, ACPI_STATE_D0);
 
     return (bus_generic_resume(dev));
 }
 
 static int
 acpi_shutdown(device_t dev)
 {
 
     GIANT_REQUIRED;
 
     /* Allow children to shutdown first. */
     bus_generic_shutdown(dev);
 
     /*
      * Enable any GPEs that are able to power-on the system (i.e., RTC).
      * Also, disable any that are not valid for this state (most).
      */
     acpi_wake_prep_walk(ACPI_STATE_S5);
 
     return (0);
 }
 
 /*
  * Handle a new device being added
  */
 static device_t
 acpi_add_child(device_t bus, u_int order, const char *name, int unit)
 {
     struct acpi_device	*ad;
     device_t		child;
 
     if ((ad = malloc(sizeof(*ad), M_ACPIDEV, M_NOWAIT | M_ZERO)) == NULL)
 	return (NULL);
 
     resource_list_init(&ad->ad_rl);
 
     child = device_add_child_ordered(bus, order, name, unit);
     if (child != NULL)
 	device_set_ivars(child, ad);
     else
 	free(ad, M_ACPIDEV);
     return (child);
 }
 
 static int
 acpi_print_child(device_t bus, device_t child)
 {
     struct acpi_device	 *adev = device_get_ivars(child);
     struct resource_list *rl = &adev->ad_rl;
     int retval = 0;
 
     retval += bus_print_child_header(bus, child);
     retval += resource_list_print_type(rl, "port",  SYS_RES_IOPORT, "%#jx");
     retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx");
     retval += resource_list_print_type(rl, "irq",   SYS_RES_IRQ,    "%jd");
     retval += resource_list_print_type(rl, "drq",   SYS_RES_DRQ,    "%jd");
     if (device_get_flags(child))
 	retval += printf(" flags %#x", device_get_flags(child));
     retval += bus_print_child_domain(bus, child);
     retval += bus_print_child_footer(bus, child);
 
     return (retval);
 }
 
 /*
  * If this device is an ACPI child but no one claimed it, attempt
  * to power it off.  We'll power it back up when a driver is added.
  *
  * XXX Disabled for now since many necessary devices (like fdc and
  * ATA) don't claim the devices we created for them but still expect
  * them to be powered up.
  */
 static void
 acpi_probe_nomatch(device_t bus, device_t child)
 {
 #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER
     acpi_set_powerstate(child, ACPI_STATE_D3);
 #endif
 }
 
 /*
  * If a new driver has a chance to probe a child, first power it up.
  *
  * XXX Disabled for now (see acpi_probe_nomatch for details).
  */
 static void
 acpi_driver_added(device_t dev, driver_t *driver)
 {
     device_t child, *devlist;
     int i, numdevs;
 
     DEVICE_IDENTIFY(driver, dev);
     if (device_get_children(dev, &devlist, &numdevs))
 	    return;
     for (i = 0; i < numdevs; i++) {
 	child = devlist[i];
 	if (device_get_state(child) == DS_NOTPRESENT) {
 #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER
 	    acpi_set_powerstate(child, ACPI_STATE_D0);
 	    if (device_probe_and_attach(child) != 0)
 		acpi_set_powerstate(child, ACPI_STATE_D3);
 #else
 	    device_probe_and_attach(child);
 #endif
 	}
     }
     free(devlist, M_TEMP);
 }
 
 /* Location hint for devctl(8) */
 static int
 acpi_child_location_str_method(device_t cbdev, device_t child, char *buf,
     size_t buflen)
 {
     struct acpi_device *dinfo = device_get_ivars(child);
     char buf2[32];
     int pxm;
 
     if (dinfo->ad_handle) {
         snprintf(buf, buflen, "handle=%s", acpi_name(dinfo->ad_handle));
         if (ACPI_SUCCESS(acpi_GetInteger(dinfo->ad_handle, "_PXM", &pxm))) {
                 snprintf(buf2, 32, " _PXM=%d", pxm);
                 strlcat(buf, buf2, buflen);
         }
     } else {
         snprintf(buf, buflen, "");
     }
     return (0);
 }
 
 /* PnP information for devctl(8) */
 int
 acpi_pnpinfo_str(ACPI_HANDLE handle, char *buf, size_t buflen)
 {
     ACPI_DEVICE_INFO *adinfo;
 
     if (ACPI_FAILURE(AcpiGetObjectInfo(handle, &adinfo))) {
 	snprintf(buf, buflen, "unknown");
 	return (0);
     }
 
     snprintf(buf, buflen, "_HID=%s _UID=%lu _CID=%s",
 	(adinfo->Valid & ACPI_VALID_HID) ?
 	adinfo->HardwareId.String : "none",
 	(adinfo->Valid & ACPI_VALID_UID) ?
 	strtoul(adinfo->UniqueId.String, NULL, 10) : 0UL,
 	((adinfo->Valid & ACPI_VALID_CID) &&
 	 adinfo->CompatibleIdList.Count > 0) ?
 	adinfo->CompatibleIdList.Ids[0].String : "none");
     AcpiOsFree(adinfo);
 
     return (0);
 }
 
 static int
 acpi_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf,
     size_t buflen)
 {
     struct acpi_device *dinfo = device_get_ivars(child);
 
     return (acpi_pnpinfo_str(dinfo->ad_handle, buf, buflen));
 }
 
 /*
  * Handle device deletion.
  */
 static void
 acpi_child_deleted(device_t dev, device_t child)
 {
     struct acpi_device *dinfo = device_get_ivars(child);
 
     if (acpi_get_device(dinfo->ad_handle) == child)
 	AcpiDetachData(dinfo->ad_handle, acpi_fake_objhandler);
 }
 
 /*
  * Handle per-device ivars
  */
 static int
 acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
 {
     struct acpi_device	*ad;
 
     if ((ad = device_get_ivars(child)) == NULL) {
 	device_printf(child, "device has no ivars\n");
 	return (ENOENT);
     }
 
     /* ACPI and ISA compatibility ivars */
     switch(index) {
     case ACPI_IVAR_HANDLE:
 	*(ACPI_HANDLE *)result = ad->ad_handle;
 	break;
     case ACPI_IVAR_PRIVATE:
 	*(void **)result = ad->ad_private;
 	break;
     case ACPI_IVAR_FLAGS:
 	*(int *)result = ad->ad_flags;
 	break;
     case ISA_IVAR_VENDORID:
     case ISA_IVAR_SERIAL:
     case ISA_IVAR_COMPATID:
 	*(int *)result = -1;
 	break;
     case ISA_IVAR_LOGICALID:
 	*(int *)result = acpi_isa_get_logicalid(child);
 	break;
     case PCI_IVAR_CLASS:
 	*(uint8_t*)result = (ad->ad_cls_class >> 16) & 0xff;
 	break;
     case PCI_IVAR_SUBCLASS:
 	*(uint8_t*)result = (ad->ad_cls_class >> 8) & 0xff;
 	break;
     case PCI_IVAR_PROGIF:
 	*(uint8_t*)result = (ad->ad_cls_class >> 0) & 0xff;
 	break;
     default:
 	return (ENOENT);
     }
 
     return (0);
 }
 
 static int
 acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
 {
     struct acpi_device	*ad;
 
     if ((ad = device_get_ivars(child)) == NULL) {
 	device_printf(child, "device has no ivars\n");
 	return (ENOENT);
     }
 
     switch(index) {
     case ACPI_IVAR_HANDLE:
 	ad->ad_handle = (ACPI_HANDLE)value;
 	break;
     case ACPI_IVAR_PRIVATE:
 	ad->ad_private = (void *)value;
 	break;
     case ACPI_IVAR_FLAGS:
 	ad->ad_flags = (int)value;
 	break;
     default:
 	panic("bad ivar write request (%d)", index);
 	return (ENOENT);
     }
 
     return (0);
 }
 
 /*
  * Handle child resource allocation/removal
  */
 static struct resource_list *
 acpi_get_rlist(device_t dev, device_t child)
 {
     struct acpi_device		*ad;
 
     ad = device_get_ivars(child);
     return (&ad->ad_rl);
 }
 
 static int
 acpi_match_resource_hint(device_t dev, int type, long value)
 {
     struct acpi_device *ad = device_get_ivars(dev);
     struct resource_list *rl = &ad->ad_rl;
     struct resource_list_entry *rle;
 
     STAILQ_FOREACH(rle, rl, link) {
 	if (rle->type != type)
 	    continue;
 	if (rle->start <= value && rle->end >= value)
 	    return (1);
     }
     return (0);
 }
 
 /*
  * Wire device unit numbers based on resource matches in hints.
  */
 static void
 acpi_hint_device_unit(device_t acdev, device_t child, const char *name,
     int *unitp)
 {
     const char *s;
     long value;
     int line, matches, unit;
 
     /*
      * Iterate over all the hints for the devices with the specified
      * name to see if one's resources are a subset of this device.
      */
     line = 0;
     while (resource_find_dev(&line, name, &unit, "at", NULL) == 0) {
 	/* Must have an "at" for acpi or isa. */
 	resource_string_value(name, unit, "at", &s);
 	if (!(strcmp(s, "acpi0") == 0 || strcmp(s, "acpi") == 0 ||
 	    strcmp(s, "isa0") == 0 || strcmp(s, "isa") == 0))
 	    continue;
 
 	/*
 	 * Check for matching resources.  We must have at least one match.
 	 * Since I/O and memory resources cannot be shared, if we get a
 	 * match on either of those, ignore any mismatches in IRQs or DRQs.
 	 *
 	 * XXX: We may want to revisit this to be more lenient and wire
 	 * as long as it gets one match.
 	 */
 	matches = 0;
 	if (resource_long_value(name, unit, "port", &value) == 0) {
 	    /*
 	     * Floppy drive controllers are notorious for having a
 	     * wide variety of resources not all of which include the
 	     * first port that is specified by the hint (typically
 	     * 0x3f0) (see the comment above fdc_isa_alloc_resources()
 	     * in fdc_isa.c).  However, they do all seem to include
 	     * port + 2 (e.g. 0x3f2) so for a floppy device, look for
 	     * 'value + 2' in the port resources instead of the hint
 	     * value.
 	     */
 	    if (strcmp(name, "fdc") == 0)
 		value += 2;
 	    if (acpi_match_resource_hint(child, SYS_RES_IOPORT, value))
 		matches++;
 	    else
 		continue;
 	}
 	if (resource_long_value(name, unit, "maddr", &value) == 0) {
 	    if (acpi_match_resource_hint(child, SYS_RES_MEMORY, value))
 		matches++;
 	    else
 		continue;
 	}
 	if (matches > 0)
 	    goto matched;
 	if (resource_long_value(name, unit, "irq", &value) == 0) {
 	    if (acpi_match_resource_hint(child, SYS_RES_IRQ, value))
 		matches++;
 	    else
 		continue;
 	}
 	if (resource_long_value(name, unit, "drq", &value) == 0) {
 	    if (acpi_match_resource_hint(child, SYS_RES_DRQ, value))
 		matches++;
 	    else
 		continue;
 	}
 
     matched:
 	if (matches > 0) {
 	    /* We have a winner! */
 	    *unitp = unit;
 	    break;
 	}
     }
 }
 
 /*
  * Fetch the NUMA domain for a device by mapping the value returned by
  * _PXM to a NUMA domain.  If the device does not have a _PXM method,
  * -2 is returned.  If any other error occurs, -1 is returned.
  */
 static int
 acpi_parse_pxm(device_t dev)
 {
 #ifdef NUMA
 #if defined(__i386__) || defined(__amd64__)
 	ACPI_HANDLE handle;
 	ACPI_STATUS status;
 	int pxm;
 
 	handle = acpi_get_handle(dev);
 	if (handle == NULL)
 		return (-2);
 	status = acpi_GetInteger(handle, "_PXM", &pxm);
 	if (ACPI_SUCCESS(status))
 		return (acpi_map_pxm_to_vm_domainid(pxm));
 	if (status == AE_NOT_FOUND)
 		return (-2);
 #endif
 #endif
 	return (-1);
 }
 
 int
 acpi_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
     cpuset_t *cpuset)
 {
 	int d, error;
 
 	d = acpi_parse_pxm(child);
 	if (d < 0)
 		return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
 
 	switch (op) {
 	case LOCAL_CPUS:
 		if (setsize != sizeof(cpuset_t))
 			return (EINVAL);
 		*cpuset = cpuset_domain[d];
 		return (0);
 	case INTR_CPUS:
 		error = bus_generic_get_cpus(dev, child, op, setsize, cpuset);
 		if (error != 0)
 			return (error);
 		if (setsize != sizeof(cpuset_t))
 			return (EINVAL);
 		CPU_AND(cpuset, &cpuset_domain[d]);
 		return (0);
 	default:
 		return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
 	}
 }
 
 /*
  * Fetch the NUMA domain for the given device 'dev'.
  *
  * If a device has a _PXM method, map that to a NUMA domain.
  * Otherwise, pass the request up to the parent.
  * If there's no matching domain or the domain cannot be
  * determined, return ENOENT.
  */
 int
 acpi_get_domain(device_t dev, device_t child, int *domain)
 {
 	int d;
 
 	d = acpi_parse_pxm(child);
 	if (d >= 0) {
 		*domain = d;
 		return (0);
 	}
 	if (d == -1)
 		return (ENOENT);
 
 	/* No _PXM node; go up a level */
 	return (bus_generic_get_domain(dev, child, domain));
 }
 
 /*
  * Pre-allocate/manage all memory and IO resources.  Since rman can't handle
  * duplicates, we merge any in the sysresource attach routine.
  */
 static int
 acpi_sysres_alloc(device_t dev)
 {
     struct resource *res;
     struct resource_list *rl;
     struct resource_list_entry *rle;
     struct rman *rm;
     device_t *children;
     int child_count, i;
 
     /*
      * Probe/attach any sysresource devices.  This would be unnecessary if we
      * had multi-pass probe/attach.
      */
     if (device_get_children(dev, &children, &child_count) != 0)
 	return (ENXIO);
     for (i = 0; i < child_count; i++) {
 	if (ACPI_ID_PROBE(dev, children[i], sysres_ids, NULL) <= 0)
 	    device_probe_and_attach(children[i]);
     }
     free(children, M_TEMP);
 
     rl = BUS_GET_RESOURCE_LIST(device_get_parent(dev), dev);
     STAILQ_FOREACH(rle, rl, link) {
 	if (rle->res != NULL) {
 	    device_printf(dev, "duplicate resource for %jx\n", rle->start);
 	    continue;
 	}
 
 	/* Only memory and IO resources are valid here. */
 	switch (rle->type) {
 	case SYS_RES_IOPORT:
 	    rm = &acpi_rman_io;
 	    break;
 	case SYS_RES_MEMORY:
 	    rm = &acpi_rman_mem;
 	    break;
 	default:
 	    continue;
 	}
 
 	/* Pre-allocate resource and add to our rman pool. */
 	res = BUS_ALLOC_RESOURCE(device_get_parent(dev), dev, rle->type,
 	    &rle->rid, rle->start, rle->start + rle->count - 1, rle->count, 0);
 	if (res != NULL) {
 	    rman_manage_region(rm, rman_get_start(res), rman_get_end(res));
 	    rle->res = res;
 	} else if (bootverbose)
 	    device_printf(dev, "reservation of %jx, %jx (%d) failed\n",
 		rle->start, rle->count, rle->type);
     }
     return (0);
 }
 
 /*
  * Reserve declared resources for devices found during attach once system
  * resources have been allocated.
  */
 static void
 acpi_reserve_resources(device_t dev)
 {
     struct resource_list_entry *rle;
     struct resource_list *rl;
     struct acpi_device *ad;
     struct acpi_softc *sc;
     device_t *children;
     int child_count, i;
 
     sc = device_get_softc(dev);
     if (device_get_children(dev, &children, &child_count) != 0)
 	return;
     for (i = 0; i < child_count; i++) {
 	ad = device_get_ivars(children[i]);
 	rl = &ad->ad_rl;
 
 	/* Don't reserve system resources. */
 	if (ACPI_ID_PROBE(dev, children[i], sysres_ids, NULL) <= 0)
 	    continue;
 
 	STAILQ_FOREACH(rle, rl, link) {
 	    /*
 	     * Don't reserve IRQ resources.  There are many sticky things
 	     * to get right otherwise (e.g. IRQs for psm, atkbd, and HPET
 	     * when using legacy routing).
 	     */
 	    if (rle->type == SYS_RES_IRQ)
 		continue;
 
 	    /*
 	     * Don't reserve the resource if it is already allocated.
 	     * The acpi_ec(4) driver can allocate its resources early
 	     * if ECDT is present.
 	     */
 	    if (rle->res != NULL)
 		continue;
 
 	    /*
 	     * Try to reserve the resource from our parent.  If this
 	     * fails because the resource is a system resource, just
 	     * let it be.  The resource range is already reserved so
 	     * that other devices will not use it.  If the driver
 	     * needs to allocate the resource, then
 	     * acpi_alloc_resource() will sub-alloc from the system
 	     * resource.
 	     */
 	    resource_list_reserve(rl, dev, children[i], rle->type, &rle->rid,
 		rle->start, rle->end, rle->count, 0);
 	}
     }
     free(children, M_TEMP);
     sc->acpi_resources_reserved = 1;
 }
 
 static int
 acpi_set_resource(device_t dev, device_t child, int type, int rid,
     rman_res_t start, rman_res_t count)
 {
     struct acpi_softc *sc = device_get_softc(dev);
     struct acpi_device *ad = device_get_ivars(child);
     struct resource_list *rl = &ad->ad_rl;
     ACPI_DEVICE_INFO *devinfo;
     rman_res_t end;
     int allow;
 
     /* Ignore IRQ resources for PCI link devices. */
     if (type == SYS_RES_IRQ &&
 	ACPI_ID_PROBE(dev, child, pcilink_ids, NULL) <= 0)
 	return (0);
 
     /*
      * Ignore most resources for PCI root bridges.  Some BIOSes
      * incorrectly enumerate the memory ranges they decode as plain
      * memory resources instead of as ResourceProducer ranges.  Other
      * BIOSes incorrectly list system resource entries for I/O ranges
      * under the PCI bridge.  Do allow the one known-correct case on
      * x86 of a PCI bridge claiming the I/O ports used for PCI config
      * access.
      */
     if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
 	if (ACPI_SUCCESS(AcpiGetObjectInfo(ad->ad_handle, &devinfo))) {
 	    if ((devinfo->Flags & ACPI_PCI_ROOT_BRIDGE) != 0) {
 #if defined(__i386__) || defined(__amd64__)
 		allow = (type == SYS_RES_IOPORT && start == CONF1_ADDR_PORT);
 #else
 		allow = 0;
 #endif
 		if (!allow) {
 		    AcpiOsFree(devinfo);
 		    return (0);
 		}
 	    }
 	    AcpiOsFree(devinfo);
 	}
     }
 
 #ifdef INTRNG
     /* map with default for now */
     if (type == SYS_RES_IRQ)
 	start = (rman_res_t)acpi_map_intr(child, (u_int)start,
 			acpi_get_handle(child));
 #endif
 
     /* If the resource is already allocated, fail. */
     if (resource_list_busy(rl, type, rid))
 	return (EBUSY);
 
     /* If the resource is already reserved, release it. */
     if (resource_list_reserved(rl, type, rid))
 	resource_list_unreserve(rl, dev, child, type, rid);
 
     /* Add the resource. */
     end = (start + count - 1);
     resource_list_add(rl, type, rid, start, end, count);
 
     /* Don't reserve resources until the system resources are allocated. */
     if (!sc->acpi_resources_reserved)
 	return (0);
 
     /* Don't reserve system resources. */
     if (ACPI_ID_PROBE(dev, child, sysres_ids, NULL) <= 0)
 	return (0);
 
     /*
      * Don't reserve IRQ resources.  There are many sticky things to
      * get right otherwise (e.g. IRQs for psm, atkbd, and HPET when
      * using legacy routing).
      */
     if (type == SYS_RES_IRQ)
 	return (0);
 
     /*
      * Don't reserve resources for CPU devices.  Some of these
      * resources need to be allocated as shareable, but reservations
      * are always non-shareable.
      */
     if (device_get_devclass(child) == devclass_find("cpu"))
 	return (0);
 
     /*
      * Reserve the resource.
      *
      * XXX: Ignores failure for now.  Failure here is probably a
      * BIOS/firmware bug?
      */
     resource_list_reserve(rl, dev, child, type, &rid, start, end, count, 0);
     return (0);
 }
 
 static struct resource *
 acpi_alloc_resource(device_t bus, device_t child, int type, int *rid,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 #ifndef INTRNG
     ACPI_RESOURCE ares;
 #endif
     struct acpi_device *ad;
     struct resource_list_entry *rle;
     struct resource_list *rl;
     struct resource *res;
     int isdefault = RMAN_IS_DEFAULT_RANGE(start, end);
 
     /*
      * First attempt at allocating the resource.  For direct children,
      * use resource_list_alloc() to handle reserved resources.  For
      * other devices, pass the request up to our parent.
      */
     if (bus == device_get_parent(child)) {
 	ad = device_get_ivars(child);
 	rl = &ad->ad_rl;
 
 	/*
 	 * Simulate the behavior of the ISA bus for direct children
 	 * devices.  That is, if a non-default range is specified for
 	 * a resource that doesn't exist, use bus_set_resource() to
 	 * add the resource before allocating it.  Note that these
 	 * resources will not be reserved.
 	 */
 	if (!isdefault && resource_list_find(rl, type, *rid) == NULL)
 		resource_list_add(rl, type, *rid, start, end, count);
 	res = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
 	    flags);
 #ifndef INTRNG
 	if (res != NULL && type == SYS_RES_IRQ) {
 	    /*
 	     * Since bus_config_intr() takes immediate effect, we cannot
 	     * configure the interrupt associated with a device when we
 	     * parse the resources but have to defer it until a driver
 	     * actually allocates the interrupt via bus_alloc_resource().
 	     *
 	     * XXX: Should we handle the lookup failing?
 	     */
 	    if (ACPI_SUCCESS(acpi_lookup_irq_resource(child, *rid, res, &ares)))
 		acpi_config_intr(child, &ares);
 	}
 #endif
 
 	/*
 	 * If this is an allocation of the "default" range for a given
 	 * RID, fetch the exact bounds for this resource from the
 	 * resource list entry to try to allocate the range from the
 	 * system resource regions.
 	 */
 	if (res == NULL && isdefault) {
 	    rle = resource_list_find(rl, type, *rid);
 	    if (rle != NULL) {
 		start = rle->start;
 		end = rle->end;
 		count = rle->count;
 	    }
 	}
     } else
 	res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid,
 	    start, end, count, flags);
 
     /*
      * If the first attempt failed and this is an allocation of a
      * specific range, try to satisfy the request via a suballocation
      * from our system resource regions.
      */
     if (res == NULL && start + count - 1 == end)
 	res = acpi_alloc_sysres(child, type, rid, start, end, count, flags);
     return (res);
 }
 
 /*
  * Attempt to allocate a specific resource range from the system
  * resource ranges.  Note that we only handle memory and I/O port
  * system resources.
  */
 struct resource *
 acpi_alloc_sysres(device_t child, int type, int *rid, rman_res_t start,
     rman_res_t end, rman_res_t count, u_int flags)
 {
     struct rman *rm;
     struct resource *res;
 
     switch (type) {
     case SYS_RES_IOPORT:
 	rm = &acpi_rman_io;
 	break;
     case SYS_RES_MEMORY:
 	rm = &acpi_rman_mem;
 	break;
     default:
 	return (NULL);
     }
 
     KASSERT(start + count - 1 == end, ("wildcard resource range"));
     res = rman_reserve_resource(rm, start, end, count, flags & ~RF_ACTIVE,
 	child);
     if (res == NULL)
 	return (NULL);
 
     rman_set_rid(res, *rid);
 
     /* If requested, activate the resource using the parent's method. */
     if (flags & RF_ACTIVE)
 	if (bus_activate_resource(child, type, *rid, res) != 0) {
 	    rman_release_resource(res);
 	    return (NULL);
 	}
 
     return (res);
 }
 
 static int
 acpi_is_resource_managed(int type, struct resource *r)
 {
 
     /* We only handle memory and IO resources through rman. */
     switch (type) {
     case SYS_RES_IOPORT:
 	return (rman_is_region_manager(r, &acpi_rman_io));
     case SYS_RES_MEMORY:
 	return (rman_is_region_manager(r, &acpi_rman_mem));
     }
     return (0);
 }
 
 static int
 acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r,
     rman_res_t start, rman_res_t end)
 {
 
     if (acpi_is_resource_managed(type, r))
 	return (rman_adjust_resource(r, start, end));
     return (bus_generic_adjust_resource(bus, child, type, r, start, end));
 }
 
 static int
 acpi_release_resource(device_t bus, device_t child, int type, int rid,
     struct resource *r)
 {
     int ret;
 
     /*
      * If this resource belongs to one of our internal managers,
      * deactivate it and release it to the local pool.
      */
     if (acpi_is_resource_managed(type, r)) {
 	if (rman_get_flags(r) & RF_ACTIVE) {
 	    ret = bus_deactivate_resource(child, type, rid, r);
 	    if (ret != 0)
 		return (ret);
 	}
 	return (rman_release_resource(r));
     }
 
     return (bus_generic_rl_release_resource(bus, child, type, rid, r));
 }
 
 static void
 acpi_delete_resource(device_t bus, device_t child, int type, int rid)
 {
     struct resource_list *rl;
 
     rl = acpi_get_rlist(bus, child);
     if (resource_list_busy(rl, type, rid)) {
 	device_printf(bus, "delete_resource: Resource still owned by child"
 	    " (type=%d, rid=%d)\n", type, rid);
 	return;
     }
     resource_list_unreserve(rl, bus, child, type, rid);
     resource_list_delete(rl, type, rid);
 }
 
 /* Allocate an IO port or memory resource, given its GAS. */
 int
 acpi_bus_alloc_gas(device_t dev, int *type, int *rid, ACPI_GENERIC_ADDRESS *gas,
     struct resource **res, u_int flags)
 {
     int error, res_type;
 
     error = ENOMEM;
     if (type == NULL || rid == NULL || gas == NULL || res == NULL)
 	return (EINVAL);
 
     /* We only support memory and IO spaces. */
     switch (gas->SpaceId) {
     case ACPI_ADR_SPACE_SYSTEM_MEMORY:
 	res_type = SYS_RES_MEMORY;
 	break;
     case ACPI_ADR_SPACE_SYSTEM_IO:
 	res_type = SYS_RES_IOPORT;
 	break;
     default:
 	return (EOPNOTSUPP);
     }
 
     /*
      * If the register width is less than 8, assume the BIOS author means
      * it is a bit field and just allocate a byte.
      */
     if (gas->BitWidth && gas->BitWidth < 8)
 	gas->BitWidth = 8;
 
     /* Validate the address after we're sure we support the space. */
     if (gas->Address == 0 || gas->BitWidth == 0)
 	return (EINVAL);
 
     bus_set_resource(dev, res_type, *rid, gas->Address,
 	gas->BitWidth / 8);
     *res = bus_alloc_resource_any(dev, res_type, rid, RF_ACTIVE | flags);
     if (*res != NULL) {
 	*type = res_type;
 	error = 0;
     } else
 	bus_delete_resource(dev, res_type, *rid);
 
     return (error);
 }
 
 /* Probe _HID and _CID for compatible ISA PNP ids. */
 static uint32_t
 acpi_isa_get_logicalid(device_t dev)
 {
     ACPI_DEVICE_INFO	*devinfo;
     ACPI_HANDLE		h;
     uint32_t		pnpid;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     /* Fetch and validate the HID. */
     if ((h = acpi_get_handle(dev)) == NULL ||
 	ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo)))
 	return_VALUE (0);
 
     pnpid = (devinfo->Valid & ACPI_VALID_HID) != 0 &&
 	devinfo->HardwareId.Length >= ACPI_EISAID_STRING_SIZE ?
 	PNP_EISAID(devinfo->HardwareId.String) : 0;
     AcpiOsFree(devinfo);
 
     return_VALUE (pnpid);
 }
 
 static int
 acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count)
 {
     ACPI_DEVICE_INFO	*devinfo;
     ACPI_PNP_DEVICE_ID	*ids;
     ACPI_HANDLE		h;
     uint32_t		*pnpid;
     int			i, valid;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     pnpid = cids;
 
     /* Fetch and validate the CID */
     if ((h = acpi_get_handle(dev)) == NULL ||
 	ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo)))
 	return_VALUE (0);
 
     if ((devinfo->Valid & ACPI_VALID_CID) == 0) {
 	AcpiOsFree(devinfo);
 	return_VALUE (0);
     }
 
     if (devinfo->CompatibleIdList.Count < count)
 	count = devinfo->CompatibleIdList.Count;
     ids = devinfo->CompatibleIdList.Ids;
     for (i = 0, valid = 0; i < count; i++)
 	if (ids[i].Length >= ACPI_EISAID_STRING_SIZE &&
 	    strncmp(ids[i].String, "PNP", 3) == 0) {
 	    *pnpid++ = PNP_EISAID(ids[i].String);
 	    valid++;
 	}
     AcpiOsFree(devinfo);
 
     return_VALUE (valid);
 }
 
 static int
 acpi_device_id_probe(device_t bus, device_t dev, char **ids, char **match) 
 {
     ACPI_HANDLE h;
     ACPI_OBJECT_TYPE t;
     int rv;
     int i;
 
     h = acpi_get_handle(dev);
     if (ids == NULL || h == NULL)
 	return (ENXIO);
     t = acpi_get_type(dev);
     if (t != ACPI_TYPE_DEVICE && t != ACPI_TYPE_PROCESSOR)
 	return (ENXIO);
 
     /* Try to match one of the array of IDs with a HID or CID. */
     for (i = 0; ids[i] != NULL; i++) {
 	rv = acpi_MatchHid(h, ids[i]);
 	if (rv == ACPI_MATCHHID_NOMATCH)
 	    continue;
 
 	if (match != NULL) {
 	    *match = ids[i];
 	}
 	return ((rv == ACPI_MATCHHID_HID)?
 		    BUS_PROBE_DEFAULT : BUS_PROBE_LOW_PRIORITY);
     }
     return (ENXIO);
 }
 
 static ACPI_STATUS
 acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname,
     ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret)
 {
     ACPI_HANDLE h;
 
     if (dev == NULL)
 	h = ACPI_ROOT_OBJECT;
     else if ((h = acpi_get_handle(dev)) == NULL)
 	return (AE_BAD_PARAMETER);
     return (AcpiEvaluateObject(h, pathname, parameters, ret));
 }
 
 int
 acpi_device_pwr_for_sleep(device_t bus, device_t dev, int *dstate)
 {
     struct acpi_softc *sc;
     ACPI_HANDLE handle;
     ACPI_STATUS status;
     char sxd[8];
 
     handle = acpi_get_handle(dev);
 
     /*
      * XXX If we find these devices, don't try to power them down.
      * The serial and IRDA ports on my T23 hang the system when
      * set to D3 and it appears that such legacy devices may
      * need special handling in their drivers.
      */
     if (dstate == NULL || handle == NULL ||
 	acpi_MatchHid(handle, "PNP0500") ||
 	acpi_MatchHid(handle, "PNP0501") ||
 	acpi_MatchHid(handle, "PNP0502") ||
 	acpi_MatchHid(handle, "PNP0510") ||
 	acpi_MatchHid(handle, "PNP0511"))
 	return (ENXIO);
 
     /*
      * Override next state with the value from _SxD, if present.
      * Note illegal _S0D is evaluated because some systems expect this.
      */
     sc = device_get_softc(bus);
     snprintf(sxd, sizeof(sxd), "_S%dD", sc->acpi_sstate);
     status = acpi_GetInteger(handle, sxd, dstate);
     if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
 	    device_printf(dev, "failed to get %s on %s: %s\n", sxd,
 		acpi_name(handle), AcpiFormatException(status));
 	    return (ENXIO);
     }
 
     return (0);
 }
 
 /* Callback arg for our implementation of walking the namespace. */
 struct acpi_device_scan_ctx {
     acpi_scan_cb_t	user_fn;
     void		*arg;
     ACPI_HANDLE		parent;
 };
 
 static ACPI_STATUS
 acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *arg, void **retval)
 {
     struct acpi_device_scan_ctx *ctx;
     device_t dev, old_dev;
     ACPI_STATUS status;
     ACPI_OBJECT_TYPE type;
 
     /*
      * Skip this device if we think we'll have trouble with it or it is
      * the parent where the scan began.
      */
     ctx = (struct acpi_device_scan_ctx *)arg;
     if (acpi_avoid(h) || h == ctx->parent)
 	return (AE_OK);
 
     /* If this is not a valid device type (e.g., a method), skip it. */
     if (ACPI_FAILURE(AcpiGetType(h, &type)))
 	return (AE_OK);
     if (type != ACPI_TYPE_DEVICE && type != ACPI_TYPE_PROCESSOR &&
 	type != ACPI_TYPE_THERMAL && type != ACPI_TYPE_POWER)
 	return (AE_OK);
 
     /*
      * Call the user function with the current device.  If it is unchanged
      * afterwards, return.  Otherwise, we update the handle to the new dev.
      */
     old_dev = acpi_get_device(h);
     dev = old_dev;
     status = ctx->user_fn(h, &dev, level, ctx->arg);
     if (ACPI_FAILURE(status) || old_dev == dev)
 	return (status);
 
     /* Remove the old child and its connection to the handle. */
     if (old_dev != NULL)
 	device_delete_child(device_get_parent(old_dev), old_dev);
 
     /* Recreate the handle association if the user created a device. */
     if (dev != NULL)
 	AcpiAttachData(h, acpi_fake_objhandler, dev);
 
     return (AE_OK);
 }
 
 static ACPI_STATUS
 acpi_device_scan_children(device_t bus, device_t dev, int max_depth,
     acpi_scan_cb_t user_fn, void *arg)
 {
     ACPI_HANDLE h;
     struct acpi_device_scan_ctx ctx;
 
     if (acpi_disabled("children"))
 	return (AE_OK);
 
     if (dev == NULL)
 	h = ACPI_ROOT_OBJECT;
     else if ((h = acpi_get_handle(dev)) == NULL)
 	return (AE_BAD_PARAMETER);
     ctx.user_fn = user_fn;
     ctx.arg = arg;
     ctx.parent = h;
     return (AcpiWalkNamespace(ACPI_TYPE_ANY, h, max_depth,
 	acpi_device_scan_cb, NULL, &ctx, NULL));
 }
 
 /*
  * Even though ACPI devices are not PCI, we use the PCI approach for setting
  * device power states since it's close enough to ACPI.
  */
 int
 acpi_set_powerstate(device_t child, int state)
 {
     ACPI_HANDLE h;
     ACPI_STATUS status;
 
     h = acpi_get_handle(child);
     if (state < ACPI_STATE_D0 || state > ACPI_D_STATES_MAX)
 	return (EINVAL);
     if (h == NULL)
 	return (0);
 
     /* Ignore errors if the power methods aren't present. */
     status = acpi_pwr_switch_consumer(h, state);
     if (ACPI_SUCCESS(status)) {
 	if (bootverbose)
 	    device_printf(child, "set ACPI power state D%d on %s\n",
 		state, acpi_name(h));
     } else if (status != AE_NOT_FOUND)
 	device_printf(child,
 	    "failed to set ACPI power state D%d on %s: %s\n", state,
 	    acpi_name(h), AcpiFormatException(status));
 
     return (0);
 }
 
 static int
 acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids)
 {
     int			result, cid_count, i;
     uint32_t		lid, cids[8];
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     /*
      * ISA-style drivers attached to ACPI may persist and
      * probe manually if we return ENOENT.  We never want
      * that to happen, so don't ever return it.
      */
     result = ENXIO;
 
     /* Scan the supplied IDs for a match */
     lid = acpi_isa_get_logicalid(child);
     cid_count = acpi_isa_get_compatid(child, cids, 8);
     while (ids && ids->ip_id) {
 	if (lid == ids->ip_id) {
 	    result = 0;
 	    goto out;
 	}
 	for (i = 0; i < cid_count; i++) {
 	    if (cids[i] == ids->ip_id) {
 		result = 0;
 		goto out;
 	    }
 	}
 	ids++;
     }
 
  out:
     if (result == 0 && ids->ip_desc)
 	device_set_desc(child, ids->ip_desc);
 
     return_VALUE (result);
 }
 
 /*
  * Look for a MCFG table.  If it is present, use the settings for
  * domain (segment) 0 to setup PCI config space access via the memory
  * map.
  *
  * On non-x86 architectures (arm64 for now), this will be done from the
  * PCI host bridge driver.
  */
 static void
 acpi_enable_pcie(void)
 {
 #if defined(__i386__) || defined(__amd64__)
 	ACPI_TABLE_HEADER *hdr;
 	ACPI_MCFG_ALLOCATION *alloc, *end;
 	ACPI_STATUS status;
 
 	status = AcpiGetTable(ACPI_SIG_MCFG, 1, &hdr);
 	if (ACPI_FAILURE(status))
 		return;
 
 	end = (ACPI_MCFG_ALLOCATION *)((char *)hdr + hdr->Length);
 	alloc = (ACPI_MCFG_ALLOCATION *)((ACPI_TABLE_MCFG *)hdr + 1);
 	while (alloc < end) {
 		if (alloc->PciSegment == 0) {
 			pcie_cfgregopen(alloc->Address, alloc->StartBusNumber,
 			    alloc->EndBusNumber);
 			return;
 		}
 		alloc++;
 	}
 #endif
 }
 
 static void
 acpi_platform_osc(device_t dev)
 {
 	ACPI_HANDLE sb_handle;
 	ACPI_STATUS status;
 	uint32_t cap_set[2];
 
 	/* 0811B06E-4A27-44F9-8D60-3CBBC22E7B48 */
 	static uint8_t acpi_platform_uuid[ACPI_UUID_LENGTH] = {
 		0x6e, 0xb0, 0x11, 0x08, 0x27, 0x4a, 0xf9, 0x44,
 		0x8d, 0x60, 0x3c, 0xbb, 0xc2, 0x2e, 0x7b, 0x48
 	};
 
 	if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle)))
 		return;
 
 	cap_set[1] = 0x10;	/* APEI Support */
 	status = acpi_EvaluateOSC(sb_handle, acpi_platform_uuid, 1,
 	    nitems(cap_set), cap_set, cap_set, false);
 	if (ACPI_FAILURE(status)) {
 		if (status == AE_NOT_FOUND)
 			return;
 		device_printf(dev, "_OSC failed: %s\n",
 		    AcpiFormatException(status));
 		return;
 	}
 }
 
 /*
  * Scan all of the ACPI namespace and attach child devices.
  *
  * We should only expect to find devices in the \_PR, \_TZ, \_SI, and
  * \_SB scopes, and \_PR and \_TZ became obsolete in the ACPI 2.0 spec.
  * However, in violation of the spec, some systems place their PCI link
  * devices in \, so we have to walk the whole namespace.  We check the
  * type of namespace nodes, so this should be ok.
  */
 static void
 acpi_probe_children(device_t bus)
 {
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     /*
      * Scan the namespace and insert placeholders for all the devices that
      * we find.  We also probe/attach any early devices.
      *
      * Note that we use AcpiWalkNamespace rather than AcpiGetDevices because
      * we want to create nodes for all devices, not just those that are
      * currently present. (This assumes that we don't want to create/remove
      * devices as they appear, which might be smarter.)
      */
     ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "namespace scan\n"));
     AcpiWalkNamespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, 100, acpi_probe_child,
 	NULL, bus, NULL);
 
     /* Pre-allocate resources for our rman from any sysresource devices. */
     acpi_sysres_alloc(bus);
 
     /* Reserve resources already allocated to children. */
     acpi_reserve_resources(bus);
 
     /* Create any static children by calling device identify methods. */
     ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "device identify routines\n"));
     bus_generic_probe(bus);
 
     /* Probe/attach all children, created statically and from the namespace. */
     ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "acpi bus_generic_attach\n"));
     bus_generic_attach(bus);
 
     /* Attach wake sysctls. */
     acpi_wake_sysctl_walk(bus);
 
     ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "done attaching children\n"));
     return_VOID;
 }
 
 /*
  * Determine the probe order for a given device.
  */
 static void
 acpi_probe_order(ACPI_HANDLE handle, int *order)
 {
 	ACPI_OBJECT_TYPE type;
 
 	/*
 	 * 0. CPUs
 	 * 1. I/O port and memory system resource holders
 	 * 2. Clocks and timers (to handle early accesses)
 	 * 3. Embedded controllers (to handle early accesses)
 	 * 4. PCI Link Devices
 	 */
 	AcpiGetType(handle, &type);
 	if (type == ACPI_TYPE_PROCESSOR)
 		*order = 0;
 	else if (acpi_MatchHid(handle, "PNP0C01") ||
 	    acpi_MatchHid(handle, "PNP0C02"))
 		*order = 1;
 	else if (acpi_MatchHid(handle, "PNP0100") ||
 	    acpi_MatchHid(handle, "PNP0103") ||
 	    acpi_MatchHid(handle, "PNP0B00"))
 		*order = 2;
 	else if (acpi_MatchHid(handle, "PNP0C09"))
 		*order = 3;
 	else if (acpi_MatchHid(handle, "PNP0C0F"))
 		*order = 4;
 }
 
 /*
  * Evaluate a child device and determine whether we might attach a device to
  * it.
  */
 static ACPI_STATUS
 acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status)
 {
     ACPI_DEVICE_INFO *devinfo;
     struct acpi_device	*ad;
     struct acpi_prw_data prw;
     ACPI_OBJECT_TYPE type;
     ACPI_HANDLE h;
     device_t bus, child;
     char *handle_str;
     int order;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (acpi_disabled("children"))
 	return_ACPI_STATUS (AE_OK);
 
     /* Skip this device if we think we'll have trouble with it. */
     if (acpi_avoid(handle))
 	return_ACPI_STATUS (AE_OK);
 
     bus = (device_t)context;
     if (ACPI_SUCCESS(AcpiGetType(handle, &type))) {
 	handle_str = acpi_name(handle);
 	switch (type) {
 	case ACPI_TYPE_DEVICE:
 	    /*
 	     * Since we scan from \, be sure to skip system scope objects.
 	     * \_SB_ and \_TZ_ are defined in ACPICA as devices to work around
 	     * BIOS bugs.  For example, \_SB_ is to allow \_SB_._INI to be run
 	     * during the initialization and \_TZ_ is to support Notify() on it.
 	     */
 	    if (strcmp(handle_str, "\\_SB_") == 0 ||
 		strcmp(handle_str, "\\_TZ_") == 0)
 		break;
 	    if (acpi_parse_prw(handle, &prw) == 0)
 		AcpiSetupGpeForWake(handle, prw.gpe_handle, prw.gpe_bit);
 
 	    /*
 	     * Ignore devices that do not have a _HID or _CID.  They should
 	     * be discovered by other buses (e.g. the PCI bus driver).
 	     */
 	    if (!acpi_has_hid(handle))
 		break;
 	    /* FALLTHROUGH */
 	case ACPI_TYPE_PROCESSOR:
 	case ACPI_TYPE_THERMAL:
 	case ACPI_TYPE_POWER:
 	    /* 
 	     * Create a placeholder device for this node.  Sort the
 	     * placeholder so that the probe/attach passes will run
 	     * breadth-first.  Orders less than ACPI_DEV_BASE_ORDER
 	     * are reserved for special objects (i.e., system
 	     * resources).
 	     */
 	    ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "scanning '%s'\n", handle_str));
 	    order = level * 10 + ACPI_DEV_BASE_ORDER;
 	    acpi_probe_order(handle, &order);
 	    child = BUS_ADD_CHILD(bus, order, NULL, -1);
 	    if (child == NULL)
 		break;
 
 	    /* Associate the handle with the device_t and vice versa. */
 	    acpi_set_handle(child, handle);
 	    AcpiAttachData(handle, acpi_fake_objhandler, child);
 
 	    /*
 	     * Check that the device is present.  If it's not present,
 	     * leave it disabled (so that we have a device_t attached to
 	     * the handle, but we don't probe it).
 	     *
 	     * XXX PCI link devices sometimes report "present" but not
 	     * "functional" (i.e. if disabled).  Go ahead and probe them
 	     * anyway since we may enable them later.
 	     */
 	    if (type == ACPI_TYPE_DEVICE && !acpi_DeviceIsPresent(child)) {
 		/* Never disable PCI link devices. */
 		if (acpi_MatchHid(handle, "PNP0C0F"))
 		    break;
 		/*
 		 * Docking stations should remain enabled since the system
 		 * may be undocked at boot.
 		 */
 		if (ACPI_SUCCESS(AcpiGetHandle(handle, "_DCK", &h)))
 		    break;
 
 		device_disable(child);
 		break;
 	    }
 
 	    /*
 	     * Get the device's resource settings and attach them.
 	     * Note that if the device has _PRS but no _CRS, we need
 	     * to decide when it's appropriate to try to configure the
 	     * device.  Ignore the return value here; it's OK for the
 	     * device not to have any resources.
 	     */
 	    acpi_parse_resources(child, handle, &acpi_res_parse_set, NULL);
 
 	    ad = device_get_ivars(child);
 	    ad->ad_cls_class = 0xffffff;
 	    if (ACPI_SUCCESS(AcpiGetObjectInfo(handle, &devinfo))) {
 		if ((devinfo->Valid & ACPI_VALID_CLS) != 0 &&
 		    devinfo->ClassCode.Length >= ACPI_PCICLS_STRING_SIZE) {
 		    ad->ad_cls_class = strtoul(devinfo->ClassCode.String,
 			NULL, 16);
 		}
 		AcpiOsFree(devinfo);
 	    }
 	    break;
 	}
     }
 
     return_ACPI_STATUS (AE_OK);
 }
 
 /*
  * AcpiAttachData() requires an object handler but never uses it.  This is a
  * placeholder object handler so we can store a device_t in an ACPI_HANDLE.
  */
 void
 acpi_fake_objhandler(ACPI_HANDLE h, void *data)
 {
 }
 
 static void
 acpi_shutdown_final(void *arg, int howto)
 {
     struct acpi_softc *sc = (struct acpi_softc *)arg;
     register_t intr;
     ACPI_STATUS status;
 
     /*
      * XXX Shutdown code should only run on the BSP (cpuid 0).
      * Some chipsets do not power off the system correctly if called from
      * an AP.
      */
     if ((howto & RB_POWEROFF) != 0) {
 	status = AcpiEnterSleepStatePrep(ACPI_STATE_S5);
 	if (ACPI_FAILURE(status)) {
 	    device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n",
 		AcpiFormatException(status));
 	    return;
 	}
 	device_printf(sc->acpi_dev, "Powering system off\n");
 	intr = intr_disable();
 	status = AcpiEnterSleepState(ACPI_STATE_S5);
 	if (ACPI_FAILURE(status)) {
 	    intr_restore(intr);
 	    device_printf(sc->acpi_dev, "power-off failed - %s\n",
 		AcpiFormatException(status));
 	} else {
 	    DELAY(1000000);
 	    intr_restore(intr);
 	    device_printf(sc->acpi_dev, "power-off failed - timeout\n");
 	}
     } else if ((howto & RB_HALT) == 0 && sc->acpi_handle_reboot) {
 	/* Reboot using the reset register. */
 	status = AcpiReset();
 	if (ACPI_SUCCESS(status)) {
 	    DELAY(1000000);
 	    device_printf(sc->acpi_dev, "reset failed - timeout\n");
 	} else if (status != AE_NOT_EXIST)
 	    device_printf(sc->acpi_dev, "reset failed - %s\n",
 		AcpiFormatException(status));
     } else if (sc->acpi_do_disable && !KERNEL_PANICKED()) {
 	/*
 	 * Only disable ACPI if the user requested.  On some systems, writing
 	 * the disable value to SMI_CMD hangs the system.
 	 */
 	device_printf(sc->acpi_dev, "Shutting down\n");
 	AcpiTerminate();
     }
 }
 
 static void
 acpi_enable_fixed_events(struct acpi_softc *sc)
 {
     static int	first_time = 1;
 
     /* Enable and clear fixed events and install handlers. */
     if ((AcpiGbl_FADT.Flags & ACPI_FADT_POWER_BUTTON) == 0) {
 	AcpiClearEvent(ACPI_EVENT_POWER_BUTTON);
 	AcpiInstallFixedEventHandler(ACPI_EVENT_POWER_BUTTON,
 				     acpi_event_power_button_sleep, sc);
 	if (first_time)
 	    device_printf(sc->acpi_dev, "Power Button (fixed)\n");
     }
     if ((AcpiGbl_FADT.Flags & ACPI_FADT_SLEEP_BUTTON) == 0) {
 	AcpiClearEvent(ACPI_EVENT_SLEEP_BUTTON);
 	AcpiInstallFixedEventHandler(ACPI_EVENT_SLEEP_BUTTON,
 				     acpi_event_sleep_button_sleep, sc);
 	if (first_time)
 	    device_printf(sc->acpi_dev, "Sleep Button (fixed)\n");
     }
 
     first_time = 0;
 }
 
 /*
  * Returns true if the device is actually present and should
  * be attached to.  This requires the present, enabled, UI-visible 
  * and diagnostics-passed bits to be set.
  */
 BOOLEAN
 acpi_DeviceIsPresent(device_t dev)
 {
 	ACPI_HANDLE h;
 	UINT32 s;
 	ACPI_STATUS status;
 
 	h = acpi_get_handle(dev);
 	if (h == NULL)
 		return (FALSE);
 	/*
 	 * Certain Treadripper boards always returns 0 for FreeBSD because it
 	 * only returns non-zero for the OS string "Windows 2015". Otherwise it
 	 * will return zero. Force them to always be treated as present.
 	 * Beata versions were worse: they always returned 0.
 	 */
 	if (acpi_MatchHid(h, "AMDI0020") || acpi_MatchHid(h, "AMDI0010"))
 		return (TRUE);
 
 	status = acpi_GetInteger(h, "_STA", &s);
 
 	/*
 	 * If no _STA method or if it failed, then assume that
 	 * the device is present.
 	 */
 	if (ACPI_FAILURE(status))
 		return (TRUE);
 
 	return (ACPI_DEVICE_PRESENT(s) ? TRUE : FALSE);
 }
 
 /*
  * Returns true if the battery is actually present and inserted.
  */
 BOOLEAN
 acpi_BatteryIsPresent(device_t dev)
 {
 	ACPI_HANDLE h;
 	UINT32 s;
 	ACPI_STATUS status;
 
 	h = acpi_get_handle(dev);
 	if (h == NULL)
 		return (FALSE);
 	status = acpi_GetInteger(h, "_STA", &s);
 
 	/*
 	 * If no _STA method or if it failed, then assume that
 	 * the device is present.
 	 */
 	if (ACPI_FAILURE(status))
 		return (TRUE);
 
 	return (ACPI_BATTERY_PRESENT(s) ? TRUE : FALSE);
 }
 
 /*
  * Returns true if a device has at least one valid device ID.
  */
 BOOLEAN
 acpi_has_hid(ACPI_HANDLE h)
 {
     ACPI_DEVICE_INFO	*devinfo;
     BOOLEAN		ret;
 
     if (h == NULL ||
 	ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo)))
 	return (FALSE);
 
     ret = FALSE;
     if ((devinfo->Valid & ACPI_VALID_HID) != 0)
 	ret = TRUE;
     else if ((devinfo->Valid & ACPI_VALID_CID) != 0)
 	if (devinfo->CompatibleIdList.Count > 0)
 	    ret = TRUE;
 
     AcpiOsFree(devinfo);
     return (ret);
 }
 
 /*
  * Match a HID string against a handle
  * returns ACPI_MATCHHID_HID if _HID match
  *         ACPI_MATCHHID_CID if _CID match and not _HID match.
  *         ACPI_MATCHHID_NOMATCH=0 if no match.
  */
 int
 acpi_MatchHid(ACPI_HANDLE h, const char *hid) 
 {
     ACPI_DEVICE_INFO	*devinfo;
     BOOLEAN		ret;
     int			i;
 
     if (hid == NULL || h == NULL ||
 	ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo)))
 	return (ACPI_MATCHHID_NOMATCH);
 
     ret = ACPI_MATCHHID_NOMATCH;
     if ((devinfo->Valid & ACPI_VALID_HID) != 0 &&
 	strcmp(hid, devinfo->HardwareId.String) == 0)
 	    ret = ACPI_MATCHHID_HID;
     else if ((devinfo->Valid & ACPI_VALID_CID) != 0)
 	for (i = 0; i < devinfo->CompatibleIdList.Count; i++) {
 	    if (strcmp(hid, devinfo->CompatibleIdList.Ids[i].String) == 0) {
 		ret = ACPI_MATCHHID_CID;
 		break;
 	    }
 	}
 
     AcpiOsFree(devinfo);
     return (ret);
 }
 
 /*
  * Return the handle of a named object within our scope, ie. that of (parent)
  * or one if its parents.
  */
 ACPI_STATUS
 acpi_GetHandleInScope(ACPI_HANDLE parent, char *path, ACPI_HANDLE *result)
 {
     ACPI_HANDLE		r;
     ACPI_STATUS		status;
 
     /* Walk back up the tree to the root */
     for (;;) {
 	status = AcpiGetHandle(parent, path, &r);
 	if (ACPI_SUCCESS(status)) {
 	    *result = r;
 	    return (AE_OK);
 	}
 	/* XXX Return error here? */
 	if (status != AE_NOT_FOUND)
 	    return (AE_OK);
 	if (ACPI_FAILURE(AcpiGetParent(parent, &r)))
 	    return (AE_NOT_FOUND);
 	parent = r;
     }
 }
 
 /*
  * Allocate a buffer with a preset data size.
  */
 ACPI_BUFFER *
 acpi_AllocBuffer(int size)
 {
     ACPI_BUFFER	*buf;
 
     if ((buf = malloc(size + sizeof(*buf), M_ACPIDEV, M_NOWAIT)) == NULL)
 	return (NULL);
     buf->Length = size;
     buf->Pointer = (void *)(buf + 1);
     return (buf);
 }
 
 ACPI_STATUS
 acpi_SetInteger(ACPI_HANDLE handle, char *path, UINT32 number)
 {
     ACPI_OBJECT arg1;
     ACPI_OBJECT_LIST args;
 
     arg1.Type = ACPI_TYPE_INTEGER;
     arg1.Integer.Value = number;
     args.Count = 1;
     args.Pointer = &arg1;
 
     return (AcpiEvaluateObject(handle, path, &args, NULL));
 }
 
 /*
  * Evaluate a path that should return an integer.
  */
 ACPI_STATUS
 acpi_GetInteger(ACPI_HANDLE handle, char *path, UINT32 *number)
 {
     ACPI_STATUS	status;
     ACPI_BUFFER	buf;
     ACPI_OBJECT	param;
 
     if (handle == NULL)
 	handle = ACPI_ROOT_OBJECT;
 
     /*
      * Assume that what we've been pointed at is an Integer object, or
      * a method that will return an Integer.
      */
     buf.Pointer = &param;
     buf.Length = sizeof(param);
     status = AcpiEvaluateObject(handle, path, NULL, &buf);
     if (ACPI_SUCCESS(status)) {
 	if (param.Type == ACPI_TYPE_INTEGER)
 	    *number = param.Integer.Value;
 	else
 	    status = AE_TYPE;
     }
 
     /* 
      * In some applications, a method that's expected to return an Integer
      * may instead return a Buffer (probably to simplify some internal
      * arithmetic).  We'll try to fetch whatever it is, and if it's a Buffer,
      * convert it into an Integer as best we can.
      *
      * This is a hack.
      */
     if (status == AE_BUFFER_OVERFLOW) {
 	if ((buf.Pointer = AcpiOsAllocate(buf.Length)) == NULL) {
 	    status = AE_NO_MEMORY;
 	} else {
 	    status = AcpiEvaluateObject(handle, path, NULL, &buf);
 	    if (ACPI_SUCCESS(status))
 		status = acpi_ConvertBufferToInteger(&buf, number);
 	    AcpiOsFree(buf.Pointer);
 	}
     }
     return (status);
 }
 
 ACPI_STATUS
 acpi_ConvertBufferToInteger(ACPI_BUFFER *bufp, UINT32 *number)
 {
     ACPI_OBJECT	*p;
     UINT8	*val;
     int		i;
 
     p = (ACPI_OBJECT *)bufp->Pointer;
     if (p->Type == ACPI_TYPE_INTEGER) {
 	*number = p->Integer.Value;
 	return (AE_OK);
     }
     if (p->Type != ACPI_TYPE_BUFFER)
 	return (AE_TYPE);
     if (p->Buffer.Length > sizeof(int))
 	return (AE_BAD_DATA);
 
     *number = 0;
     val = p->Buffer.Pointer;
     for (i = 0; i < p->Buffer.Length; i++)
 	*number += val[i] << (i * 8);
     return (AE_OK);
 }
 
 /*
  * Iterate over the elements of an a package object, calling the supplied
  * function for each element.
  *
  * XXX possible enhancement might be to abort traversal on error.
  */
 ACPI_STATUS
 acpi_ForeachPackageObject(ACPI_OBJECT *pkg,
 	void (*func)(ACPI_OBJECT *comp, void *arg), void *arg)
 {
     ACPI_OBJECT	*comp;
     int		i;
 
     if (pkg == NULL || pkg->Type != ACPI_TYPE_PACKAGE)
 	return (AE_BAD_PARAMETER);
 
     /* Iterate over components */
     i = 0;
     comp = pkg->Package.Elements;
     for (; i < pkg->Package.Count; i++, comp++)
 	func(comp, arg);
 
     return (AE_OK);
 }
 
 /*
  * Find the (index)th resource object in a set.
  */
 ACPI_STATUS
 acpi_FindIndexedResource(ACPI_BUFFER *buf, int index, ACPI_RESOURCE **resp)
 {
     ACPI_RESOURCE	*rp;
     int			i;
 
     rp = (ACPI_RESOURCE *)buf->Pointer;
     i = index;
     while (i-- > 0) {
 	/* Range check */
 	if (rp > (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length))
 	    return (AE_BAD_PARAMETER);
 
 	/* Check for terminator */
 	if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0)
 	    return (AE_NOT_FOUND);
 	rp = ACPI_NEXT_RESOURCE(rp);
     }
     if (resp != NULL)
 	*resp = rp;
 
     return (AE_OK);
 }
 
 /*
  * Append an ACPI_RESOURCE to an ACPI_BUFFER.
  *
  * Given a pointer to an ACPI_RESOURCE structure, expand the ACPI_BUFFER
  * provided to contain it.  If the ACPI_BUFFER is empty, allocate a sensible
  * backing block.  If the ACPI_RESOURCE is NULL, return an empty set of
  * resources.
  */
 #define ACPI_INITIAL_RESOURCE_BUFFER_SIZE	512
 
 ACPI_STATUS
 acpi_AppendBufferResource(ACPI_BUFFER *buf, ACPI_RESOURCE *res)
 {
     ACPI_RESOURCE	*rp;
     void		*newp;
 
     /* Initialise the buffer if necessary. */
     if (buf->Pointer == NULL) {
 	buf->Length = ACPI_INITIAL_RESOURCE_BUFFER_SIZE;
 	if ((buf->Pointer = AcpiOsAllocate(buf->Length)) == NULL)
 	    return (AE_NO_MEMORY);
 	rp = (ACPI_RESOURCE *)buf->Pointer;
 	rp->Type = ACPI_RESOURCE_TYPE_END_TAG;
 	rp->Length = ACPI_RS_SIZE_MIN;
     }
     if (res == NULL)
 	return (AE_OK);
 
     /*
      * Scan the current buffer looking for the terminator.
      * This will either find the terminator or hit the end
      * of the buffer and return an error.
      */
     rp = (ACPI_RESOURCE *)buf->Pointer;
     for (;;) {
 	/* Range check, don't go outside the buffer */
 	if (rp >= (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length))
 	    return (AE_BAD_PARAMETER);
 	if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0)
 	    break;
 	rp = ACPI_NEXT_RESOURCE(rp);
     }
 
     /*
      * Check the size of the buffer and expand if required.
      *
      * Required size is:
      *	size of existing resources before terminator + 
      *	size of new resource and header +
      * 	size of terminator.
      *
      * Note that this loop should really only run once, unless
      * for some reason we are stuffing a *really* huge resource.
      */
     while ((((u_int8_t *)rp - (u_int8_t *)buf->Pointer) + 
 	    res->Length + ACPI_RS_SIZE_NO_DATA +
 	    ACPI_RS_SIZE_MIN) >= buf->Length) {
 	if ((newp = AcpiOsAllocate(buf->Length * 2)) == NULL)
 	    return (AE_NO_MEMORY);
 	bcopy(buf->Pointer, newp, buf->Length);
 	rp = (ACPI_RESOURCE *)((u_int8_t *)newp +
 			       ((u_int8_t *)rp - (u_int8_t *)buf->Pointer));
 	AcpiOsFree(buf->Pointer);
 	buf->Pointer = newp;
 	buf->Length += buf->Length;
     }
 
     /* Insert the new resource. */
     bcopy(res, rp, res->Length + ACPI_RS_SIZE_NO_DATA);
 
     /* And add the terminator. */
     rp = ACPI_NEXT_RESOURCE(rp);
     rp->Type = ACPI_RESOURCE_TYPE_END_TAG;
     rp->Length = ACPI_RS_SIZE_MIN;
 
     return (AE_OK);
 }
 
 UINT64
 acpi_DSMQuery(ACPI_HANDLE h, const uint8_t *uuid, int revision)
 {
     /*
      * ACPI spec 9.1.1 defines this.
      *
      * "Arg2: Function Index Represents a specific function whose meaning is
      * specific to the UUID and Revision ID. Function indices should start
      * with 1. Function number zero is a query function (see the special
      * return code defined below)."
      */
     ACPI_BUFFER buf;
     ACPI_OBJECT *obj;
     UINT64 ret = 0;
     int i;
 
     if (!ACPI_SUCCESS(acpi_EvaluateDSM(h, uuid, revision, 0, NULL, &buf))) {
 	ACPI_INFO(("Failed to enumerate DSM functions\n"));
 	return (0);
     }
 
     obj = (ACPI_OBJECT *)buf.Pointer;
     KASSERT(obj, ("Object not allowed to be NULL\n"));
 
     /*
      * From ACPI 6.2 spec 9.1.1:
      * If Function Index = 0, a Buffer containing a function index bitfield.
      * Otherwise, the return value and type depends on the UUID and revision
      * ID (see below).
      */
     switch (obj->Type) {
     case ACPI_TYPE_BUFFER:
 	for (i = 0; i < MIN(obj->Buffer.Length, sizeof(ret)); i++)
 	    ret |= (((uint64_t)obj->Buffer.Pointer[i]) << (i * 8));
 	break;
     case ACPI_TYPE_INTEGER:
 	ACPI_BIOS_WARNING((AE_INFO,
 	    "Possibly buggy BIOS with ACPI_TYPE_INTEGER for function enumeration\n"));
 	ret = obj->Integer.Value;
 	break;
     default:
 	ACPI_WARNING((AE_INFO, "Unexpected return type %u\n", obj->Type));
     };
 
     AcpiOsFree(obj);
     return ret;
 }
 
 /*
  * DSM may return multiple types depending on the function. It is therefore
  * unsafe to use the typed evaluation. It is highly recommended that the caller
  * check the type of the returned object.
  */
 ACPI_STATUS
 acpi_EvaluateDSM(ACPI_HANDLE handle, const uint8_t *uuid, int revision,
     UINT64 function, ACPI_OBJECT *package, ACPI_BUFFER *out_buf)
 {
 	return (acpi_EvaluateDSMTyped(handle, uuid, revision, function,
 	    package, out_buf, ACPI_TYPE_ANY));
 }
 
 ACPI_STATUS
 acpi_EvaluateDSMTyped(ACPI_HANDLE handle, const uint8_t *uuid, int revision,
     UINT64 function, ACPI_OBJECT *package, ACPI_BUFFER *out_buf,
     ACPI_OBJECT_TYPE type)
 {
     ACPI_OBJECT arg[4];
     ACPI_OBJECT_LIST arglist;
     ACPI_BUFFER buf;
     ACPI_STATUS status;
 
     if (out_buf == NULL)
 	return (AE_NO_MEMORY);
 
     arg[0].Type = ACPI_TYPE_BUFFER;
     arg[0].Buffer.Length = ACPI_UUID_LENGTH;
     arg[0].Buffer.Pointer = __DECONST(uint8_t *, uuid);
     arg[1].Type = ACPI_TYPE_INTEGER;
     arg[1].Integer.Value = revision;
     arg[2].Type = ACPI_TYPE_INTEGER;
     arg[2].Integer.Value = function;
     if (package) {
 	arg[3] = *package;
     } else {
 	arg[3].Type = ACPI_TYPE_PACKAGE;
 	arg[3].Package.Count = 0;
 	arg[3].Package.Elements = NULL;
     }
 
     arglist.Pointer = arg;
     arglist.Count = 4;
     buf.Pointer = NULL;
     buf.Length = ACPI_ALLOCATE_BUFFER;
     status = AcpiEvaluateObjectTyped(handle, "_DSM", &arglist, &buf, type);
     if (ACPI_FAILURE(status))
 	return (status);
 
     KASSERT(ACPI_SUCCESS(status), ("Unexpected status"));
 
     *out_buf = buf;
     return (status);
 }
 
 ACPI_STATUS
 acpi_EvaluateOSC(ACPI_HANDLE handle, uint8_t *uuid, int revision, int count,
     uint32_t *caps_in, uint32_t *caps_out, bool query)
 {
 	ACPI_OBJECT arg[4], *ret;
 	ACPI_OBJECT_LIST arglist;
 	ACPI_BUFFER buf;
 	ACPI_STATUS status;
 
 	arglist.Pointer = arg;
 	arglist.Count = 4;
 	arg[0].Type = ACPI_TYPE_BUFFER;
 	arg[0].Buffer.Length = ACPI_UUID_LENGTH;
 	arg[0].Buffer.Pointer = uuid;
 	arg[1].Type = ACPI_TYPE_INTEGER;
 	arg[1].Integer.Value = revision;
 	arg[2].Type = ACPI_TYPE_INTEGER;
 	arg[2].Integer.Value = count;
 	arg[3].Type = ACPI_TYPE_BUFFER;
 	arg[3].Buffer.Length = count * sizeof(*caps_in);
 	arg[3].Buffer.Pointer = (uint8_t *)caps_in;
 	caps_in[0] = query ? 1 : 0;
 	buf.Pointer = NULL;
 	buf.Length = ACPI_ALLOCATE_BUFFER;
 	status = AcpiEvaluateObjectTyped(handle, "_OSC", &arglist, &buf,
 	    ACPI_TYPE_BUFFER);
 	if (ACPI_FAILURE(status))
 		return (status);
 	if (caps_out != NULL) {
 		ret = buf.Pointer;
 		if (ret->Buffer.Length != count * sizeof(*caps_out)) {
 			AcpiOsFree(buf.Pointer);
 			return (AE_BUFFER_OVERFLOW);
 		}
 		bcopy(ret->Buffer.Pointer, caps_out, ret->Buffer.Length);
 	}
 	AcpiOsFree(buf.Pointer);
 	return (status);
 }
 
 /*
  * Set interrupt model.
  */
 ACPI_STATUS
 acpi_SetIntrModel(int model)
 {
 
     return (acpi_SetInteger(ACPI_ROOT_OBJECT, "_PIC", model));
 }
 
 /*
  * Walk subtables of a table and call a callback routine for each
  * subtable.  The caller should provide the first subtable and a
  * pointer to the end of the table.  This can be used to walk tables
  * such as MADT and SRAT that use subtable entries.
  */
 void
 acpi_walk_subtables(void *first, void *end, acpi_subtable_handler *handler,
     void *arg)
 {
     ACPI_SUBTABLE_HEADER *entry;
 
     for (entry = first; (void *)entry < end; ) {
 	/* Avoid an infinite loop if we hit a bogus entry. */
 	if (entry->Length < sizeof(ACPI_SUBTABLE_HEADER))
 	    return;
 
 	handler(entry, arg);
 	entry = ACPI_ADD_PTR(ACPI_SUBTABLE_HEADER, entry, entry->Length);
     }
 }
 
 /*
  * DEPRECATED.  This interface has serious deficiencies and will be
  * removed.
  *
  * Immediately enter the sleep state.  In the old model, acpiconf(8) ran
  * rc.suspend and rc.resume so we don't have to notify devd(8) to do this.
  */
 ACPI_STATUS
 acpi_SetSleepState(struct acpi_softc *sc, int state)
 {
     static int once;
 
     if (!once) {
 	device_printf(sc->acpi_dev,
 "warning: acpi_SetSleepState() deprecated, need to update your software\n");
 	once = 1;
     }
     return (acpi_EnterSleepState(sc, state));
 }
 
 #if defined(__amd64__) || defined(__i386__)
 static void
 acpi_sleep_force_task(void *context)
 {
     struct acpi_softc *sc = (struct acpi_softc *)context;
 
     if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate)))
 	device_printf(sc->acpi_dev, "force sleep state S%d failed\n",
 	    sc->acpi_next_sstate);
 }
 
 static void
 acpi_sleep_force(void *arg)
 {
     struct acpi_softc *sc = (struct acpi_softc *)arg;
 
     device_printf(sc->acpi_dev,
 	"suspend request timed out, forcing sleep now\n");
     /*
      * XXX Suspending from callout causes freezes in DEVICE_SUSPEND().
      * Suspend from acpi_task thread instead.
      */
     if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
 	acpi_sleep_force_task, sc)))
 	device_printf(sc->acpi_dev, "AcpiOsExecute() for sleeping failed\n");
 }
 #endif
 
 /*
  * Request that the system enter the given suspend state.  All /dev/apm
  * devices and devd(8) will be notified.  Userland then has a chance to
  * save state and acknowledge the request.  The system sleeps once all
  * acks are in.
  */
 int
 acpi_ReqSleepState(struct acpi_softc *sc, int state)
 {
 #if defined(__amd64__) || defined(__i386__)
     struct apm_clone_data *clone;
     ACPI_STATUS status;
 
     if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX)
 	return (EINVAL);
     if (!acpi_sleep_states[state])
 	return (EOPNOTSUPP);
 
     /*
      * If a reboot/shutdown/suspend request is already in progress or
      * suspend is blocked due to an upcoming shutdown, just return.
      */
     if (rebooting || sc->acpi_next_sstate != 0 || suspend_blocked) {
 	return (0);
     }
 
     /* Wait until sleep is enabled. */
     while (sc->acpi_sleep_disabled) {
 	AcpiOsSleep(1000);
     }
 
     ACPI_LOCK(acpi);
 
     sc->acpi_next_sstate = state;
 
     /* S5 (soft-off) should be entered directly with no waiting. */
     if (state == ACPI_STATE_S5) {
     	ACPI_UNLOCK(acpi);
 	status = acpi_EnterSleepState(sc, state);
 	return (ACPI_SUCCESS(status) ? 0 : ENXIO);
     }
 
     /* Record the pending state and notify all apm devices. */
     STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) {
 	clone->notify_status = APM_EV_NONE;
 	if ((clone->flags & ACPI_EVF_DEVD) == 0) {
 	    selwakeuppri(&clone->sel_read, PZERO);
 	    KNOTE_LOCKED(&clone->sel_read.si_note, 0);
 	}
     }
 
     /* If devd(8) is not running, immediately enter the sleep state. */
     if (!devctl_process_running()) {
 	ACPI_UNLOCK(acpi);
 	status = acpi_EnterSleepState(sc, state);
 	return (ACPI_SUCCESS(status) ? 0 : ENXIO);
     }
 
     /*
      * Set a timeout to fire if userland doesn't ack the suspend request
      * in time.  This way we still eventually go to sleep if we were
      * overheating or running low on battery, even if userland is hung.
      * We cancel this timeout once all userland acks are in or the
      * suspend request is aborted.
      */
     callout_reset(&sc->susp_force_to, 10 * hz, acpi_sleep_force, sc);
     ACPI_UNLOCK(acpi);
 
     /* Now notify devd(8) also. */
     acpi_UserNotify("Suspend", ACPI_ROOT_OBJECT, state);
 
     return (0);
 #else
     /* This platform does not support acpi suspend/resume. */
     return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Acknowledge (or reject) a pending sleep state.  The caller has
  * prepared for suspend and is now ready for it to proceed.  If the
  * error argument is non-zero, it indicates suspend should be cancelled
  * and gives an errno value describing why.  Once all votes are in,
  * we suspend the system.
  */
 int
 acpi_AckSleepState(struct apm_clone_data *clone, int error)
 {
 #if defined(__amd64__) || defined(__i386__)
     struct acpi_softc *sc;
     int ret, sleeping;
 
     /* If no pending sleep state, return an error. */
     ACPI_LOCK(acpi);
     sc = clone->acpi_sc;
     if (sc->acpi_next_sstate == 0) {
     	ACPI_UNLOCK(acpi);
 	return (ENXIO);
     }
 
     /* Caller wants to abort suspend process. */
     if (error) {
 	sc->acpi_next_sstate = 0;
 	callout_stop(&sc->susp_force_to);
 	device_printf(sc->acpi_dev,
 	    "listener on %s cancelled the pending suspend\n",
 	    devtoname(clone->cdev));
     	ACPI_UNLOCK(acpi);
 	return (0);
     }
 
     /*
      * Mark this device as acking the suspend request.  Then, walk through
      * all devices, seeing if they agree yet.  We only count devices that
      * are writable since read-only devices couldn't ack the request.
      */
     sleeping = TRUE;
     clone->notify_status = APM_EV_ACKED;
     STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) {
 	if ((clone->flags & ACPI_EVF_WRITE) != 0 &&
 	    clone->notify_status != APM_EV_ACKED) {
 	    sleeping = FALSE;
 	    break;
 	}
     }
 
     /* If all devices have voted "yes", we will suspend now. */
     if (sleeping)
 	callout_stop(&sc->susp_force_to);
     ACPI_UNLOCK(acpi);
     ret = 0;
     if (sleeping) {
 	if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate)))
 		ret = ENODEV;
     }
     return (ret);
 #else
     /* This platform does not support acpi suspend/resume. */
     return (EOPNOTSUPP);
 #endif
 }
 
 static void
 acpi_sleep_enable(void *arg)
 {
     struct acpi_softc	*sc = (struct acpi_softc *)arg;
 
     ACPI_LOCK_ASSERT(acpi);
 
     /* Reschedule if the system is not fully up and running. */
     if (!AcpiGbl_SystemAwakeAndRunning) {
 	callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME);
 	return;
     }
 
     sc->acpi_sleep_disabled = FALSE;
 }
 
 static ACPI_STATUS
 acpi_sleep_disable(struct acpi_softc *sc)
 {
     ACPI_STATUS		status;
 
     /* Fail if the system is not fully up and running. */
     if (!AcpiGbl_SystemAwakeAndRunning)
 	return (AE_ERROR);
 
     ACPI_LOCK(acpi);
     status = sc->acpi_sleep_disabled ? AE_ERROR : AE_OK;
     sc->acpi_sleep_disabled = TRUE;
     ACPI_UNLOCK(acpi);
 
     return (status);
 }
 
 enum acpi_sleep_state {
     ACPI_SS_NONE,
     ACPI_SS_GPE_SET,
     ACPI_SS_DEV_SUSPEND,
     ACPI_SS_SLP_PREP,
     ACPI_SS_SLEPT,
 };
 
 /*
  * Enter the desired system sleep state.
  *
  * Currently we support S1-S5 but S4 is only S4BIOS
  */
 static ACPI_STATUS
 acpi_EnterSleepState(struct acpi_softc *sc, int state)
 {
     register_t intr;
     ACPI_STATUS status;
     ACPI_EVENT_STATUS power_button_status;
     enum acpi_sleep_state slp_state;
     int sleep_result;
 
     ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);
 
     if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX)
 	return_ACPI_STATUS (AE_BAD_PARAMETER);
     if (!acpi_sleep_states[state]) {
 	device_printf(sc->acpi_dev, "Sleep state S%d not supported by BIOS\n",
 	    state);
 	return (AE_SUPPORT);
     }
 
     /* Re-entry once we're suspending is not allowed. */
     status = acpi_sleep_disable(sc);
     if (ACPI_FAILURE(status)) {
 	device_printf(sc->acpi_dev,
 	    "suspend request ignored (not ready yet)\n");
 	return (status);
     }
 
     if (state == ACPI_STATE_S5) {
 	/*
 	 * Shut down cleanly and power off.  This will call us back through the
 	 * shutdown handlers.
 	 */
 	shutdown_nice(RB_POWEROFF);
 	return_ACPI_STATUS (AE_OK);
     }
 
     EVENTHANDLER_INVOKE(power_suspend_early);
     stop_all_proc();
+    suspend_all_fs();
     EVENTHANDLER_INVOKE(power_suspend);
 
 #ifdef EARLY_AP_STARTUP
     MPASS(mp_ncpus == 1 || smp_started);
     thread_lock(curthread);
     sched_bind(curthread, 0);
     thread_unlock(curthread);
 #else
     if (smp_started) {
 	thread_lock(curthread);
 	sched_bind(curthread, 0);
 	thread_unlock(curthread);
     }
 #endif
 
     /*
      * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE
      * drivers need this.
      */
     mtx_lock(&Giant);
 
     slp_state = ACPI_SS_NONE;
 
     sc->acpi_sstate = state;
 
     /* Enable any GPEs as appropriate and requested by the user. */
     acpi_wake_prep_walk(state);
     slp_state = ACPI_SS_GPE_SET;
 
     /*
      * Inform all devices that we are going to sleep.  If at least one
      * device fails, DEVICE_SUSPEND() automatically resumes the tree.
      *
      * XXX Note that a better two-pass approach with a 'veto' pass
      * followed by a "real thing" pass would be better, but the current
      * bus interface does not provide for this.
      */
     if (DEVICE_SUSPEND(root_bus) != 0) {
 	device_printf(sc->acpi_dev, "device_suspend failed\n");
 	goto backout;
     }
     slp_state = ACPI_SS_DEV_SUSPEND;
 
     status = AcpiEnterSleepStatePrep(state);
     if (ACPI_FAILURE(status)) {
 	device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n",
 		      AcpiFormatException(status));
 	goto backout;
     }
     slp_state = ACPI_SS_SLP_PREP;
 
     if (sc->acpi_sleep_delay > 0)
 	DELAY(sc->acpi_sleep_delay * 1000000);
 
     suspendclock();
     intr = intr_disable();
     if (state != ACPI_STATE_S1) {
 	sleep_result = acpi_sleep_machdep(sc, state);
 	acpi_wakeup_machdep(sc, state, sleep_result, 0);
 
 	/*
 	 * XXX According to ACPI specification SCI_EN bit should be restored
 	 * by ACPI platform (BIOS, firmware) to its pre-sleep state.
 	 * Unfortunately some BIOSes fail to do that and that leads to
 	 * unexpected and serious consequences during wake up like a system
 	 * getting stuck in SMI handlers.
 	 * This hack is picked up from Linux, which claims that it follows
 	 * Windows behavior.
 	 */
 	if (sleep_result == 1 && state != ACPI_STATE_S4)
 	    AcpiWriteBitRegister(ACPI_BITREG_SCI_ENABLE, ACPI_ENABLE_EVENT);
 
 	if (sleep_result == 1 && state == ACPI_STATE_S3) {
 	    /*
 	     * Prevent mis-interpretation of the wakeup by power button
 	     * as a request for power off.
 	     * Ideally we should post an appropriate wakeup event,
 	     * perhaps using acpi_event_power_button_wake or alike.
 	     *
 	     * Clearing of power button status after wakeup is mandated
 	     * by ACPI specification in section "Fixed Power Button".
 	     *
 	     * XXX As of ACPICA 20121114 AcpiGetEventStatus provides
 	     * status as 0/1 corressponding to inactive/active despite
 	     * its type being ACPI_EVENT_STATUS.  In other words,
 	     * we should not test for ACPI_EVENT_FLAG_SET for time being.
 	     */
 	    if (ACPI_SUCCESS(AcpiGetEventStatus(ACPI_EVENT_POWER_BUTTON,
 		&power_button_status)) && power_button_status != 0) {
 		AcpiClearEvent(ACPI_EVENT_POWER_BUTTON);
 		device_printf(sc->acpi_dev,
 		    "cleared fixed power button status\n");
 	    }
 	}
 
 	intr_restore(intr);
 
 	/* call acpi_wakeup_machdep() again with interrupt enabled */
 	acpi_wakeup_machdep(sc, state, sleep_result, 1);
 
 	AcpiLeaveSleepStatePrep(state);
 
 	if (sleep_result == -1)
 		goto backout;
 
 	/* Re-enable ACPI hardware on wakeup from sleep state 4. */
 	if (state == ACPI_STATE_S4)
 	    AcpiEnable();
     } else {
 	status = AcpiEnterSleepState(state);
 	intr_restore(intr);
 	AcpiLeaveSleepStatePrep(state);
 	if (ACPI_FAILURE(status)) {
 	    device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n",
 			  AcpiFormatException(status));
 	    goto backout;
 	}
     }
     slp_state = ACPI_SS_SLEPT;
 
     /*
      * Back out state according to how far along we got in the suspend
      * process.  This handles both the error and success cases.
      */
 backout:
     if (slp_state >= ACPI_SS_SLP_PREP)
 	resumeclock();
     if (slp_state >= ACPI_SS_GPE_SET) {
 	acpi_wake_prep_walk(state);
 	sc->acpi_sstate = ACPI_STATE_S0;
     }
     if (slp_state >= ACPI_SS_DEV_SUSPEND)
 	DEVICE_RESUME(root_bus);
     if (slp_state >= ACPI_SS_SLP_PREP)
 	AcpiLeaveSleepState(state);
     if (slp_state >= ACPI_SS_SLEPT) {
 #if defined(__i386__) || defined(__amd64__)
 	/* NB: we are still using ACPI timecounter at this point. */
 	resume_TSC();
 #endif
 	acpi_resync_clock(sc);
 	acpi_enable_fixed_events(sc);
     }
     sc->acpi_next_sstate = 0;
 
     mtx_unlock(&Giant);
 
 #ifdef EARLY_AP_STARTUP
     thread_lock(curthread);
     sched_unbind(curthread);
     thread_unlock(curthread);
 #else
     if (smp_started) {
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
     }
 #endif
 
+    resume_all_fs();
     resume_all_proc();
 
     EVENTHANDLER_INVOKE(power_resume);
 
     /* Allow another sleep request after a while. */
     callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME);
 
     /* Run /etc/rc.resume after we are back. */
     if (devctl_process_running())
 	acpi_UserNotify("Resume", ACPI_ROOT_OBJECT, state);
 
     return_ACPI_STATUS (status);
 }
 
 static void
 acpi_resync_clock(struct acpi_softc *sc)
 {
 
     /*
      * Warm up timecounter again and reset system clock.
      */
     (void)timecounter->tc_get_timecount(timecounter);
     inittodr(time_second + sc->acpi_sleep_delay);
 }
 
 /* Enable or disable the device's wake GPE. */
 int
 acpi_wake_set_enable(device_t dev, int enable)
 {
     struct acpi_prw_data prw;
     ACPI_STATUS status;
     int flags;
 
     /* Make sure the device supports waking the system and get the GPE. */
     if (acpi_parse_prw(acpi_get_handle(dev), &prw) != 0)
 	return (ENXIO);
 
     flags = acpi_get_flags(dev);
     if (enable) {
 	status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit,
 	    ACPI_GPE_ENABLE);
 	if (ACPI_FAILURE(status)) {
 	    device_printf(dev, "enable wake failed\n");
 	    return (ENXIO);
 	}
 	acpi_set_flags(dev, flags | ACPI_FLAG_WAKE_ENABLED);
     } else {
 	status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit,
 	    ACPI_GPE_DISABLE);
 	if (ACPI_FAILURE(status)) {
 	    device_printf(dev, "disable wake failed\n");
 	    return (ENXIO);
 	}
 	acpi_set_flags(dev, flags & ~ACPI_FLAG_WAKE_ENABLED);
     }
 
     return (0);
 }
 
 static int
 acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate)
 {
     struct acpi_prw_data prw;
     device_t dev;
 
     /* Check that this is a wake-capable device and get its GPE. */
     if (acpi_parse_prw(handle, &prw) != 0)
 	return (ENXIO);
     dev = acpi_get_device(handle);
 
     /*
      * The destination sleep state must be less than (i.e., higher power)
      * or equal to the value specified by _PRW.  If this GPE cannot be
      * enabled for the next sleep state, then disable it.  If it can and
      * the user requested it be enabled, turn on any required power resources
      * and set _PSW.
      */
     if (sstate > prw.lowest_wake) {
 	AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE);
 	if (bootverbose)
 	    device_printf(dev, "wake_prep disabled wake for %s (S%d)\n",
 		acpi_name(handle), sstate);
     } else if (dev && (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) != 0) {
 	acpi_pwr_wake_enable(handle, 1);
 	acpi_SetInteger(handle, "_PSW", 1);
 	if (bootverbose)
 	    device_printf(dev, "wake_prep enabled for %s (S%d)\n",
 		acpi_name(handle), sstate);
     }
 
     return (0);
 }
 
 static int
 acpi_wake_run_prep(ACPI_HANDLE handle, int sstate)
 {
     struct acpi_prw_data prw;
     device_t dev;
 
     /*
      * Check that this is a wake-capable device and get its GPE.  Return
      * now if the user didn't enable this device for wake.
      */
     if (acpi_parse_prw(handle, &prw) != 0)
 	return (ENXIO);
     dev = acpi_get_device(handle);
     if (dev == NULL || (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) == 0)
 	return (0);
 
     /*
      * If this GPE couldn't be enabled for the previous sleep state, it was
      * disabled before going to sleep so re-enable it.  If it was enabled,
      * clear _PSW and turn off any power resources it used.
      */
     if (sstate > prw.lowest_wake) {
 	AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE);
 	if (bootverbose)
 	    device_printf(dev, "run_prep re-enabled %s\n", acpi_name(handle));
     } else {
 	acpi_SetInteger(handle, "_PSW", 0);
 	acpi_pwr_wake_enable(handle, 0);
 	if (bootverbose)
 	    device_printf(dev, "run_prep cleaned up for %s\n",
 		acpi_name(handle));
     }
 
     return (0);
 }
 
 static ACPI_STATUS
 acpi_wake_prep(ACPI_HANDLE handle, UINT32 level, void *context, void **status)
 {
     int sstate;
 
     /* If suspending, run the sleep prep function, otherwise wake. */
     sstate = *(int *)context;
     if (AcpiGbl_SystemAwakeAndRunning)
 	acpi_wake_sleep_prep(handle, sstate);
     else
 	acpi_wake_run_prep(handle, sstate);
     return (AE_OK);
 }
 
 /* Walk the tree rooted at acpi0 to prep devices for suspend/resume. */
 static int
 acpi_wake_prep_walk(int sstate)
 {
     ACPI_HANDLE sb_handle;
 
     if (ACPI_SUCCESS(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle)))
 	AcpiWalkNamespace(ACPI_TYPE_DEVICE, sb_handle, 100,
 	    acpi_wake_prep, NULL, &sstate, NULL);
     return (0);
 }
 
 /* Walk the tree rooted at acpi0 to attach per-device wake sysctls. */
 static int
 acpi_wake_sysctl_walk(device_t dev)
 {
     int error, i, numdevs;
     device_t *devlist;
     device_t child;
     ACPI_STATUS status;
 
     error = device_get_children(dev, &devlist, &numdevs);
     if (error != 0 || numdevs == 0) {
 	if (numdevs == 0)
 	    free(devlist, M_TEMP);
 	return (error);
     }
     for (i = 0; i < numdevs; i++) {
 	child = devlist[i];
 	acpi_wake_sysctl_walk(child);
 	if (!device_is_attached(child))
 	    continue;
 	status = AcpiEvaluateObject(acpi_get_handle(child), "_PRW", NULL, NULL);
 	if (ACPI_SUCCESS(status)) {
 	    SYSCTL_ADD_PROC(device_get_sysctl_ctx(child),
 		SYSCTL_CHILDREN(device_get_sysctl_tree(child)), OID_AUTO,
 		"wake", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, child, 0,
 		acpi_wake_set_sysctl, "I", "Device set to wake the system");
 	}
     }
     free(devlist, M_TEMP);
 
     return (0);
 }
 
 /* Enable or disable wake from userland. */
 static int
 acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS)
 {
     int enable, error;
     device_t dev;
 
     dev = (device_t)arg1;
     enable = (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) ? 1 : 0;
 
     error = sysctl_handle_int(oidp, &enable, 0, req);
     if (error != 0 || req->newptr == NULL)
 	return (error);
     if (enable != 0 && enable != 1)
 	return (EINVAL);
 
     return (acpi_wake_set_enable(dev, enable));
 }
 
 /* Parse a device's _PRW into a structure. */
 int
 acpi_parse_prw(ACPI_HANDLE h, struct acpi_prw_data *prw)
 {
     ACPI_STATUS			status;
     ACPI_BUFFER			prw_buffer;
     ACPI_OBJECT			*res, *res2;
     int				error, i, power_count;
 
     if (h == NULL || prw == NULL)
 	return (EINVAL);
 
     /*
      * The _PRW object (7.2.9) is only required for devices that have the
      * ability to wake the system from a sleeping state.
      */
     error = EINVAL;
     prw_buffer.Pointer = NULL;
     prw_buffer.Length = ACPI_ALLOCATE_BUFFER;
     status = AcpiEvaluateObject(h, "_PRW", NULL, &prw_buffer);
     if (ACPI_FAILURE(status))
 	return (ENOENT);
     res = (ACPI_OBJECT *)prw_buffer.Pointer;
     if (res == NULL)
 	return (ENOENT);
     if (!ACPI_PKG_VALID(res, 2))
 	goto out;
 
     /*
      * Element 1 of the _PRW object:
      * The lowest power system sleeping state that can be entered while still
      * providing wake functionality.  The sleeping state being entered must
      * be less than (i.e., higher power) or equal to this value.
      */
     if (acpi_PkgInt32(res, 1, &prw->lowest_wake) != 0)
 	goto out;
 
     /*
      * Element 0 of the _PRW object:
      */
     switch (res->Package.Elements[0].Type) {
     case ACPI_TYPE_INTEGER:
 	/*
 	 * If the data type of this package element is numeric, then this
 	 * _PRW package element is the bit index in the GPEx_EN, in the
 	 * GPE blocks described in the FADT, of the enable bit that is
 	 * enabled for the wake event.
 	 */
 	prw->gpe_handle = NULL;
 	prw->gpe_bit = res->Package.Elements[0].Integer.Value;
 	error = 0;
 	break;
     case ACPI_TYPE_PACKAGE:
 	/*
 	 * If the data type of this package element is a package, then this
 	 * _PRW package element is itself a package containing two
 	 * elements.  The first is an object reference to the GPE Block
 	 * device that contains the GPE that will be triggered by the wake
 	 * event.  The second element is numeric and it contains the bit
 	 * index in the GPEx_EN, in the GPE Block referenced by the
 	 * first element in the package, of the enable bit that is enabled for
 	 * the wake event.
 	 *
 	 * For example, if this field is a package then it is of the form:
 	 * Package() {\_SB.PCI0.ISA.GPE, 2}
 	 */
 	res2 = &res->Package.Elements[0];
 	if (!ACPI_PKG_VALID(res2, 2))
 	    goto out;
 	prw->gpe_handle = acpi_GetReference(NULL, &res2->Package.Elements[0]);
 	if (prw->gpe_handle == NULL)
 	    goto out;
 	if (acpi_PkgInt32(res2, 1, &prw->gpe_bit) != 0)
 	    goto out;
 	error = 0;
 	break;
     default:
 	goto out;
     }
 
     /* Elements 2 to N of the _PRW object are power resources. */
     power_count = res->Package.Count - 2;
     if (power_count > ACPI_PRW_MAX_POWERRES) {
 	printf("ACPI device %s has too many power resources\n", acpi_name(h));
 	power_count = 0;
     }
     prw->power_res_count = power_count;
     for (i = 0; i < power_count; i++)
 	prw->power_res[i] = res->Package.Elements[i];
 
 out:
     if (prw_buffer.Pointer != NULL)
 	AcpiOsFree(prw_buffer.Pointer);
     return (error);
 }
 
 /*
  * ACPI Event Handlers
  */
 
 /* System Event Handlers (registered by EVENTHANDLER_REGISTER) */
 
 static void
 acpi_system_eventhandler_sleep(void *arg, int state)
 {
     struct acpi_softc *sc = (struct acpi_softc *)arg;
     int ret;
 
     ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);
 
     /* Check if button action is disabled or unknown. */
     if (state == ACPI_STATE_UNKNOWN)
 	return;
 
     /* Request that the system prepare to enter the given suspend state. */
     ret = acpi_ReqSleepState(sc, state);
     if (ret != 0)
 	device_printf(sc->acpi_dev,
 	    "request to enter state S%d failed (err %d)\n", state, ret);
 
     return_VOID;
 }
 
 static void
 acpi_system_eventhandler_wakeup(void *arg, int state)
 {
 
     ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state);
 
     /* Currently, nothing to do for wakeup. */
 
     return_VOID;
 }
 
 /* 
  * ACPICA Event Handlers (FixedEvent, also called from button notify handler)
  */
 static void
 acpi_invoke_sleep_eventhandler(void *context)
 {
 
     EVENTHANDLER_INVOKE(acpi_sleep_event, *(int *)context);
 }
 
 static void
 acpi_invoke_wake_eventhandler(void *context)
 {
 
     EVENTHANDLER_INVOKE(acpi_wakeup_event, *(int *)context);
 }
 
 UINT32
 acpi_event_power_button_sleep(void *context)
 {
     struct acpi_softc	*sc = (struct acpi_softc *)context;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
 	acpi_invoke_sleep_eventhandler, &sc->acpi_power_button_sx)))
 	return_VALUE (ACPI_INTERRUPT_NOT_HANDLED);
     return_VALUE (ACPI_INTERRUPT_HANDLED);
 }
 
 UINT32
 acpi_event_power_button_wake(void *context)
 {
     struct acpi_softc	*sc = (struct acpi_softc *)context;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
 	acpi_invoke_wake_eventhandler, &sc->acpi_power_button_sx)))
 	return_VALUE (ACPI_INTERRUPT_NOT_HANDLED);
     return_VALUE (ACPI_INTERRUPT_HANDLED);
 }
 
 UINT32
 acpi_event_sleep_button_sleep(void *context)
 {
     struct acpi_softc	*sc = (struct acpi_softc *)context;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
 	acpi_invoke_sleep_eventhandler, &sc->acpi_sleep_button_sx)))
 	return_VALUE (ACPI_INTERRUPT_NOT_HANDLED);
     return_VALUE (ACPI_INTERRUPT_HANDLED);
 }
 
 UINT32
 acpi_event_sleep_button_wake(void *context)
 {
     struct acpi_softc	*sc = (struct acpi_softc *)context;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER,
 	acpi_invoke_wake_eventhandler, &sc->acpi_sleep_button_sx)))
 	return_VALUE (ACPI_INTERRUPT_NOT_HANDLED);
     return_VALUE (ACPI_INTERRUPT_HANDLED);
 }
 
 /*
  * XXX This static buffer is suboptimal.  There is no locking so only
  * use this for single-threaded callers.
  */
 char *
 acpi_name(ACPI_HANDLE handle)
 {
     ACPI_BUFFER buf;
     static char data[256];
 
     buf.Length = sizeof(data);
     buf.Pointer = data;
 
     if (handle && ACPI_SUCCESS(AcpiGetName(handle, ACPI_FULL_PATHNAME, &buf)))
 	return (data);
     return ("(unknown)");
 }
 
 /*
  * Debugging/bug-avoidance.  Avoid trying to fetch info on various
  * parts of the namespace.
  */
 int
 acpi_avoid(ACPI_HANDLE handle)
 {
     char	*cp, *env, *np;
     int		len;
 
     np = acpi_name(handle);
     if (*np == '\\')
 	np++;
     if ((env = kern_getenv("debug.acpi.avoid")) == NULL)
 	return (0);
 
     /* Scan the avoid list checking for a match */
     cp = env;
     for (;;) {
 	while (*cp != 0 && isspace(*cp))
 	    cp++;
 	if (*cp == 0)
 	    break;
 	len = 0;
 	while (cp[len] != 0 && !isspace(cp[len]))
 	    len++;
 	if (!strncmp(cp, np, len)) {
 	    freeenv(env);
 	    return(1);
 	}
 	cp += len;
     }
     freeenv(env);
 
     return (0);
 }
 
 /*
  * Debugging/bug-avoidance.  Disable ACPI subsystem components.
  */
 int
 acpi_disabled(char *subsys)
 {
     char	*cp, *env;
     int		len;
 
     if ((env = kern_getenv("debug.acpi.disabled")) == NULL)
 	return (0);
     if (strcmp(env, "all") == 0) {
 	freeenv(env);
 	return (1);
     }
 
     /* Scan the disable list, checking for a match. */
     cp = env;
     for (;;) {
 	while (*cp != '\0' && isspace(*cp))
 	    cp++;
 	if (*cp == '\0')
 	    break;
 	len = 0;
 	while (cp[len] != '\0' && !isspace(cp[len]))
 	    len++;
 	if (strncmp(cp, subsys, len) == 0) {
 	    freeenv(env);
 	    return (1);
 	}
 	cp += len;
     }
     freeenv(env);
 
     return (0);
 }
 
 static void
 acpi_lookup(void *arg, const char *name, device_t *dev)
 {
     ACPI_HANDLE handle;
 
     if (*dev != NULL)
 	return;
 
     /*
      * Allow any handle name that is specified as an absolute path and
      * starts with '\'.  We could restrict this to \_SB and friends,
      * but see acpi_probe_children() for notes on why we scan the entire
      * namespace for devices.
      *
      * XXX: The pathname argument to AcpiGetHandle() should be fixed to
      * be const.
      */
     if (name[0] != '\\')
 	return;
     if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, __DECONST(char *, name),
 	&handle)))
 	return;
     *dev = acpi_get_device(handle);
 }
 
 /*
  * Control interface.
  *
  * We multiplex ioctls for all participating ACPI devices here.  Individual 
  * drivers wanting to be accessible via /dev/acpi should use the
  * register/deregister interface to make their handlers visible.
  */
 struct acpi_ioctl_hook
 {
     TAILQ_ENTRY(acpi_ioctl_hook) link;
     u_long			 cmd;
     acpi_ioctl_fn		 fn;
     void			 *arg;
 };
 
 static TAILQ_HEAD(,acpi_ioctl_hook)	acpi_ioctl_hooks;
 static int				acpi_ioctl_hooks_initted;
 
 int
 acpi_register_ioctl(u_long cmd, acpi_ioctl_fn fn, void *arg)
 {
     struct acpi_ioctl_hook	*hp;
 
     if ((hp = malloc(sizeof(*hp), M_ACPIDEV, M_NOWAIT)) == NULL)
 	return (ENOMEM);
     hp->cmd = cmd;
     hp->fn = fn;
     hp->arg = arg;
 
     ACPI_LOCK(acpi);
     if (acpi_ioctl_hooks_initted == 0) {
 	TAILQ_INIT(&acpi_ioctl_hooks);
 	acpi_ioctl_hooks_initted = 1;
     }
     TAILQ_INSERT_TAIL(&acpi_ioctl_hooks, hp, link);
     ACPI_UNLOCK(acpi);
 
     return (0);
 }
 
 void
 acpi_deregister_ioctl(u_long cmd, acpi_ioctl_fn fn)
 {
     struct acpi_ioctl_hook	*hp;
 
     ACPI_LOCK(acpi);
     TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link)
 	if (hp->cmd == cmd && hp->fn == fn)
 	    break;
 
     if (hp != NULL) {
 	TAILQ_REMOVE(&acpi_ioctl_hooks, hp, link);
 	free(hp, M_ACPIDEV);
     }
     ACPI_UNLOCK(acpi);
 }
 
 static int
 acpiopen(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
     return (0);
 }
 
 static int
 acpiclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
     return (0);
 }
 
 static int
 acpiioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
     struct acpi_softc		*sc;
     struct acpi_ioctl_hook	*hp;
     int				error, state;
 
     error = 0;
     hp = NULL;
     sc = dev->si_drv1;
 
     /*
      * Scan the list of registered ioctls, looking for handlers.
      */
     ACPI_LOCK(acpi);
     if (acpi_ioctl_hooks_initted)
 	TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) {
 	    if (hp->cmd == cmd)
 		break;
 	}
     ACPI_UNLOCK(acpi);
     if (hp)
 	return (hp->fn(cmd, addr, hp->arg));
 
     /*
      * Core ioctls are not permitted for non-writable user.
      * Currently, other ioctls just fetch information.
      * Not changing system behavior.
      */
     if ((flag & FWRITE) == 0)
 	return (EPERM);
 
     /* Core system ioctls. */
     switch (cmd) {
     case ACPIIO_REQSLPSTATE:
 	state = *(int *)addr;
 	if (state != ACPI_STATE_S5)
 	    return (acpi_ReqSleepState(sc, state));
 	device_printf(sc->acpi_dev, "power off via acpi ioctl not supported\n");
 	error = EOPNOTSUPP;
 	break;
     case ACPIIO_ACKSLPSTATE:
 	error = *(int *)addr;
 	error = acpi_AckSleepState(sc->acpi_clone, error);
 	break;
     case ACPIIO_SETSLPSTATE:	/* DEPRECATED */
 	state = *(int *)addr;
 	if (state < ACPI_STATE_S0 || state > ACPI_S_STATES_MAX)
 	    return (EINVAL);
 	if (!acpi_sleep_states[state])
 	    return (EOPNOTSUPP);
 	if (ACPI_FAILURE(acpi_SetSleepState(sc, state)))
 	    error = ENXIO;
 	break;
     default:
 	error = ENXIO;
 	break;
     }
 
     return (error);
 }
 
 static int
 acpi_sname2sstate(const char *sname)
 {
     int sstate;
 
     if (toupper(sname[0]) == 'S') {
 	sstate = sname[1] - '0';
 	if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5 &&
 	    sname[2] == '\0')
 	    return (sstate);
     } else if (strcasecmp(sname, "NONE") == 0)
 	return (ACPI_STATE_UNKNOWN);
     return (-1);
 }
 
 static const char *
 acpi_sstate2sname(int sstate)
 {
     static const char *snames[] = { "S0", "S1", "S2", "S3", "S4", "S5" };
 
     if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5)
 	return (snames[sstate]);
     else if (sstate == ACPI_STATE_UNKNOWN)
 	return ("NONE");
     return (NULL);
 }
 
 static int
 acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS)
 {
     int error;
     struct sbuf sb;
     UINT8 state;
 
     sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
     for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++)
 	if (acpi_sleep_states[state])
 	    sbuf_printf(&sb, "%s ", acpi_sstate2sname(state));
     sbuf_trim(&sb);
     sbuf_finish(&sb);
     error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
     sbuf_delete(&sb);
     return (error);
 }
 
 static int
 acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS)
 {
     char sleep_state[10];
     int error, new_state, old_state;
 
     old_state = *(int *)oidp->oid_arg1;
     strlcpy(sleep_state, acpi_sstate2sname(old_state), sizeof(sleep_state));
     error = sysctl_handle_string(oidp, sleep_state, sizeof(sleep_state), req);
     if (error == 0 && req->newptr != NULL) {
 	new_state = acpi_sname2sstate(sleep_state);
 	if (new_state < ACPI_STATE_S1)
 	    return (EINVAL);
 	if (new_state < ACPI_S_STATE_COUNT && !acpi_sleep_states[new_state])
 	    return (EOPNOTSUPP);
 	if (new_state != old_state)
 	    *(int *)oidp->oid_arg1 = new_state;
     }
     return (error);
 }
 
 /* Inform devctl(4) when we receive a Notify. */
 void
 acpi_UserNotify(const char *subsystem, ACPI_HANDLE h, uint8_t notify)
 {
     char		notify_buf[16];
     ACPI_BUFFER		handle_buf;
     ACPI_STATUS		status;
 
     if (subsystem == NULL)
 	return;
 
     handle_buf.Pointer = NULL;
     handle_buf.Length = ACPI_ALLOCATE_BUFFER;
     status = AcpiNsHandleToPathname(h, &handle_buf, FALSE);
     if (ACPI_FAILURE(status))
 	return;
     snprintf(notify_buf, sizeof(notify_buf), "notify=0x%02x", notify);
     devctl_notify("ACPI", subsystem, handle_buf.Pointer, notify_buf);
     AcpiOsFree(handle_buf.Pointer);
 }
 
 #ifdef ACPI_DEBUG
 /*
  * Support for parsing debug options from the kernel environment.
  *
  * Bits may be set in the AcpiDbgLayer and AcpiDbgLevel debug registers
  * by specifying the names of the bits in the debug.acpi.layer and
  * debug.acpi.level environment variables.  Bits may be unset by 
  * prefixing the bit name with !.
  */
 struct debugtag
 {
     char	*name;
     UINT32	value;
 };
 
 static struct debugtag	dbg_layer[] = {
     {"ACPI_UTILITIES",		ACPI_UTILITIES},
     {"ACPI_HARDWARE",		ACPI_HARDWARE},
     {"ACPI_EVENTS",		ACPI_EVENTS},
     {"ACPI_TABLES",		ACPI_TABLES},
     {"ACPI_NAMESPACE",		ACPI_NAMESPACE},
     {"ACPI_PARSER",		ACPI_PARSER},
     {"ACPI_DISPATCHER",		ACPI_DISPATCHER},
     {"ACPI_EXECUTER",		ACPI_EXECUTER},
     {"ACPI_RESOURCES",		ACPI_RESOURCES},
     {"ACPI_CA_DEBUGGER",	ACPI_CA_DEBUGGER},
     {"ACPI_OS_SERVICES",	ACPI_OS_SERVICES},
     {"ACPI_CA_DISASSEMBLER",	ACPI_CA_DISASSEMBLER},
     {"ACPI_ALL_COMPONENTS",	ACPI_ALL_COMPONENTS},
 
     {"ACPI_AC_ADAPTER",		ACPI_AC_ADAPTER},
     {"ACPI_BATTERY",		ACPI_BATTERY},
     {"ACPI_BUS",		ACPI_BUS},
     {"ACPI_BUTTON",		ACPI_BUTTON},
     {"ACPI_EC", 		ACPI_EC},
     {"ACPI_FAN",		ACPI_FAN},
     {"ACPI_POWERRES",		ACPI_POWERRES},
     {"ACPI_PROCESSOR",		ACPI_PROCESSOR},
     {"ACPI_THERMAL",		ACPI_THERMAL},
     {"ACPI_TIMER",		ACPI_TIMER},
     {"ACPI_ALL_DRIVERS",	ACPI_ALL_DRIVERS},
     {NULL, 0}
 };
 
 static struct debugtag dbg_level[] = {
     {"ACPI_LV_INIT",		ACPI_LV_INIT},
     {"ACPI_LV_DEBUG_OBJECT",	ACPI_LV_DEBUG_OBJECT},
     {"ACPI_LV_INFO",		ACPI_LV_INFO},
     {"ACPI_LV_REPAIR",		ACPI_LV_REPAIR},
     {"ACPI_LV_ALL_EXCEPTIONS",	ACPI_LV_ALL_EXCEPTIONS},
 
     /* Trace verbosity level 1 [Standard Trace Level] */
     {"ACPI_LV_INIT_NAMES",	ACPI_LV_INIT_NAMES},
     {"ACPI_LV_PARSE",		ACPI_LV_PARSE},
     {"ACPI_LV_LOAD",		ACPI_LV_LOAD},
     {"ACPI_LV_DISPATCH",	ACPI_LV_DISPATCH},
     {"ACPI_LV_EXEC",		ACPI_LV_EXEC},
     {"ACPI_LV_NAMES",		ACPI_LV_NAMES},
     {"ACPI_LV_OPREGION",	ACPI_LV_OPREGION},
     {"ACPI_LV_BFIELD",		ACPI_LV_BFIELD},
     {"ACPI_LV_TABLES",		ACPI_LV_TABLES},
     {"ACPI_LV_VALUES",		ACPI_LV_VALUES},
     {"ACPI_LV_OBJECTS",		ACPI_LV_OBJECTS},
     {"ACPI_LV_RESOURCES",	ACPI_LV_RESOURCES},
     {"ACPI_LV_USER_REQUESTS",	ACPI_LV_USER_REQUESTS},
     {"ACPI_LV_PACKAGE",		ACPI_LV_PACKAGE},
     {"ACPI_LV_VERBOSITY1",	ACPI_LV_VERBOSITY1},
 
     /* Trace verbosity level 2 [Function tracing and memory allocation] */
     {"ACPI_LV_ALLOCATIONS",	ACPI_LV_ALLOCATIONS},
     {"ACPI_LV_FUNCTIONS",	ACPI_LV_FUNCTIONS},
     {"ACPI_LV_OPTIMIZATIONS",	ACPI_LV_OPTIMIZATIONS},
     {"ACPI_LV_VERBOSITY2",	ACPI_LV_VERBOSITY2},
     {"ACPI_LV_ALL",		ACPI_LV_ALL},
 
     /* Trace verbosity level 3 [Threading, I/O, and Interrupts] */
     {"ACPI_LV_MUTEX",		ACPI_LV_MUTEX},
     {"ACPI_LV_THREADS",		ACPI_LV_THREADS},
     {"ACPI_LV_IO",		ACPI_LV_IO},
     {"ACPI_LV_INTERRUPTS",	ACPI_LV_INTERRUPTS},
     {"ACPI_LV_VERBOSITY3",	ACPI_LV_VERBOSITY3},
 
     /* Exceptionally verbose output -- also used in the global "DebugLevel"  */
     {"ACPI_LV_AML_DISASSEMBLE",	ACPI_LV_AML_DISASSEMBLE},
     {"ACPI_LV_VERBOSE_INFO",	ACPI_LV_VERBOSE_INFO},
     {"ACPI_LV_FULL_TABLES",	ACPI_LV_FULL_TABLES},
     {"ACPI_LV_EVENTS",		ACPI_LV_EVENTS},
     {"ACPI_LV_VERBOSE",		ACPI_LV_VERBOSE},
     {NULL, 0}
 };    
 
 static void
 acpi_parse_debug(char *cp, struct debugtag *tag, UINT32 *flag)
 {
     char	*ep;
     int		i, l;
     int		set;
 
     while (*cp) {
 	if (isspace(*cp)) {
 	    cp++;
 	    continue;
 	}
 	ep = cp;
 	while (*ep && !isspace(*ep))
 	    ep++;
 	if (*cp == '!') {
 	    set = 0;
 	    cp++;
 	    if (cp == ep)
 		continue;
 	} else {
 	    set = 1;
 	}
 	l = ep - cp;
 	for (i = 0; tag[i].name != NULL; i++) {
 	    if (!strncmp(cp, tag[i].name, l)) {
 		if (set)
 		    *flag |= tag[i].value;
 		else
 		    *flag &= ~tag[i].value;
 	    }
 	}
 	cp = ep;
     }
 }
 
 static void
 acpi_set_debugging(void *junk)
 {
     char	*layer, *level;
 
     if (cold) {
 	AcpiDbgLayer = 0;
 	AcpiDbgLevel = 0;
     }
 
     layer = kern_getenv("debug.acpi.layer");
     level = kern_getenv("debug.acpi.level");
     if (layer == NULL && level == NULL)
 	return;
 
     printf("ACPI set debug");
     if (layer != NULL) {
 	if (strcmp("NONE", layer) != 0)
 	    printf(" layer '%s'", layer);
 	acpi_parse_debug(layer, &dbg_layer[0], &AcpiDbgLayer);
 	freeenv(layer);
     }
     if (level != NULL) {
 	if (strcmp("NONE", level) != 0)
 	    printf(" level '%s'", level);
 	acpi_parse_debug(level, &dbg_level[0], &AcpiDbgLevel);
 	freeenv(level);
     }
     printf("\n");
 }
 
 SYSINIT(acpi_debugging, SI_SUB_TUNABLES, SI_ORDER_ANY, acpi_set_debugging,
 	NULL);
 
 static int
 acpi_debug_sysctl(SYSCTL_HANDLER_ARGS)
 {
     int		 error, *dbg;
     struct	 debugtag *tag;
     struct	 sbuf sb;
     char	 temp[128];
 
     if (sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND) == NULL)
 	return (ENOMEM);
     if (strcmp(oidp->oid_arg1, "debug.acpi.layer") == 0) {
 	tag = &dbg_layer[0];
 	dbg = &AcpiDbgLayer;
     } else {
 	tag = &dbg_level[0];
 	dbg = &AcpiDbgLevel;
     }
 
     /* Get old values if this is a get request. */
     ACPI_SERIAL_BEGIN(acpi);
     if (*dbg == 0) {
 	sbuf_cpy(&sb, "NONE");
     } else if (req->newptr == NULL) {
 	for (; tag->name != NULL; tag++) {
 	    if ((*dbg & tag->value) == tag->value)
 		sbuf_printf(&sb, "%s ", tag->name);
 	}
     }
     sbuf_trim(&sb);
     sbuf_finish(&sb);
     strlcpy(temp, sbuf_data(&sb), sizeof(temp));
     sbuf_delete(&sb);
 
     error = sysctl_handle_string(oidp, temp, sizeof(temp), req);
 
     /* Check for error or no change */
     if (error == 0 && req->newptr != NULL) {
 	*dbg = 0;
 	kern_setenv((char *)oidp->oid_arg1, temp);
 	acpi_set_debugging(NULL);
     }
     ACPI_SERIAL_END(acpi);
 
     return (error);
 }
 
 SYSCTL_PROC(_debug_acpi, OID_AUTO, layer,
     CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, "debug.acpi.layer", 0,
     acpi_debug_sysctl, "A",
     "");
 SYSCTL_PROC(_debug_acpi, OID_AUTO, level,
     CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, "debug.acpi.level", 0,
     acpi_debug_sysctl, "A",
     "");
 #endif /* ACPI_DEBUG */
 
 static int
 acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int	error;
 	int	old;
 
 	old = acpi_debug_objects;
 	error = sysctl_handle_int(oidp, &acpi_debug_objects, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (old == acpi_debug_objects || (old && acpi_debug_objects))
 		return (0);
 
 	ACPI_SERIAL_BEGIN(acpi);
 	AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE;
 	ACPI_SERIAL_END(acpi);
 
 	return (0);
 }
 
 static int
 acpi_parse_interfaces(char *str, struct acpi_interface *iface)
 {
 	char *p;
 	size_t len;
 	int i, j;
 
 	p = str;
 	while (isspace(*p) || *p == ',')
 		p++;
 	len = strlen(p);
 	if (len == 0)
 		return (0);
 	p = strdup(p, M_TEMP);
 	for (i = 0; i < len; i++)
 		if (p[i] == ',')
 			p[i] = '\0';
 	i = j = 0;
 	while (i < len)
 		if (isspace(p[i]) || p[i] == '\0')
 			i++;
 		else {
 			i += strlen(p + i) + 1;
 			j++;
 		}
 	if (j == 0) {
 		free(p, M_TEMP);
 		return (0);
 	}
 	iface->data = malloc(sizeof(*iface->data) * j, M_TEMP, M_WAITOK);
 	iface->num = j;
 	i = j = 0;
 	while (i < len)
 		if (isspace(p[i]) || p[i] == '\0')
 			i++;
 		else {
 			iface->data[j] = p + i;
 			i += strlen(p + i) + 1;
 			j++;
 		}
 
 	return (j);
 }
 
 static void
 acpi_free_interfaces(struct acpi_interface *iface)
 {
 
 	free(iface->data[0], M_TEMP);
 	free(iface->data, M_TEMP);
 }
 
 static void
 acpi_reset_interfaces(device_t dev)
 {
 	struct acpi_interface list;
 	ACPI_STATUS status;
 	int i;
 
 	if (acpi_parse_interfaces(acpi_install_interface, &list) > 0) {
 		for (i = 0; i < list.num; i++) {
 			status = AcpiInstallInterface(list.data[i]);
 			if (ACPI_FAILURE(status))
 				device_printf(dev,
 				    "failed to install _OSI(\"%s\"): %s\n",
 				    list.data[i], AcpiFormatException(status));
 			else if (bootverbose)
 				device_printf(dev, "installed _OSI(\"%s\")\n",
 				    list.data[i]);
 		}
 		acpi_free_interfaces(&list);
 	}
 	if (acpi_parse_interfaces(acpi_remove_interface, &list) > 0) {
 		for (i = 0; i < list.num; i++) {
 			status = AcpiRemoveInterface(list.data[i]);
 			if (ACPI_FAILURE(status))
 				device_printf(dev,
 				    "failed to remove _OSI(\"%s\"): %s\n",
 				    list.data[i], AcpiFormatException(status));
 			else if (bootverbose)
 				device_printf(dev, "removed _OSI(\"%s\")\n",
 				    list.data[i]);
 		}
 		acpi_free_interfaces(&list);
 	}
 }
 
 static int
 acpi_pm_func(u_long cmd, void *arg, ...)
 {
 	int	state, acpi_state;
 	int	error;
 	struct	acpi_softc *sc;
 	va_list	ap;
 
 	error = 0;
 	switch (cmd) {
 	case POWER_CMD_SUSPEND:
 		sc = (struct acpi_softc *)arg;
 		if (sc == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 
 		va_start(ap, arg);
 		state = va_arg(ap, int);
 		va_end(ap);
 
 		switch (state) {
 		case POWER_SLEEP_STATE_STANDBY:
 			acpi_state = sc->acpi_standby_sx;
 			break;
 		case POWER_SLEEP_STATE_SUSPEND:
 			acpi_state = sc->acpi_suspend_sx;
 			break;
 		case POWER_SLEEP_STATE_HIBERNATE:
 			acpi_state = ACPI_STATE_S4;
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		if (ACPI_FAILURE(acpi_EnterSleepState(sc, acpi_state)))
 			error = ENXIO;
 		break;
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 out:
 	return (error);
 }
 
 static void
 acpi_pm_register(void *arg)
 {
     if (!cold || resource_disabled("acpi", 0))
 	return;
 
     power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, NULL);
 }
 
 SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, acpi_pm_register, NULL);
diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c
index 8ad398bf986b..e985d56cda5a 100644
--- a/sys/dev/xen/control/control.c
+++ b/sys/dev/xen/control/control.c
@@ -1,478 +1,481 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-4-Clause
  *
  * Copyright (c) 2010 Justin T. Gibbs, Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  */
 
 /*-
  * PV suspend/resume support:
  *
  * Copyright (c) 2004 Christian Limpach.
  * Copyright (c) 2004-2006,2008 Kip Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Christian Limpach.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * HVM suspend/resume support:
  *
  * Copyright (c) 2008 Citrix Systems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /**
  * \file control.c
  *
  * \brief Device driver to repond to control domain events that impact
  *        this VM.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/kdb.h>
 #include <sys/module.h>
+#include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/rman.h>
 #include <sys/sched.h>
 #include <sys/taskqueue.h>
 #include <sys/types.h>
 #include <sys/vnode.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/eventhandler.h>
 #include <sys/timetc.h>
 
 #include <geom/geom.h>
 
 #include <machine/_inttypes.h>
 #include <machine/intr_machdep.h>
 
 #include <x86/apicvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
 #include <xen/xen-os.h>
 #include <xen/blkif.h>
 #include <xen/evtchn.h>
 #include <xen/gnttab.h>
 #include <xen/xen_intr.h>
 
 #include <xen/hvm.h>
 
 #include <xen/interface/event_channel.h>
 #include <xen/interface/grant_table.h>
 
 #include <xen/xenbus/xenbusvar.h>
 
 bool xen_suspend_cancelled;
 /*--------------------------- Forward Declarations --------------------------*/
 /** Function signature for shutdown event handlers. */
 typedef	void (xctrl_shutdown_handler_t)(void);
 
 static xctrl_shutdown_handler_t xctrl_poweroff;
 static xctrl_shutdown_handler_t xctrl_reboot;
 static xctrl_shutdown_handler_t xctrl_suspend;
 static xctrl_shutdown_handler_t xctrl_crash;
 
 /*-------------------------- Private Data Structures -------------------------*/
 /** Element type for lookup table of event name to handler. */
 struct xctrl_shutdown_reason {
 	const char		 *name;
 	xctrl_shutdown_handler_t *handler;
 };
 
 /** Lookup table for shutdown event name to handler. */
 static const struct xctrl_shutdown_reason xctrl_shutdown_reasons[] = {
 	{ "poweroff", xctrl_poweroff },
 	{ "reboot",   xctrl_reboot   },
 	{ "suspend",  xctrl_suspend  },
 	{ "crash",    xctrl_crash    },
 	{ "halt",     xctrl_poweroff },
 };
 
 struct xctrl_softc {
 	struct xs_watch    xctrl_watch;	
 };
 
 /*------------------------------ Event Handlers ------------------------------*/
 static void
 xctrl_poweroff()
 {
 	shutdown_nice(RB_POWEROFF|RB_HALT);
 }
 
 static void
 xctrl_reboot()
 {
 	shutdown_nice(0);
 }
 
 static void
 xctrl_suspend()
 {
 #ifdef SMP
 	cpuset_t cpu_suspend_map;
 #endif
 
 	EVENTHANDLER_INVOKE(power_suspend_early);
 	xs_lock();
 	stop_all_proc();
 	xs_unlock();
+	suspend_all_fs();
 	EVENTHANDLER_INVOKE(power_suspend);
 
 #ifdef EARLY_AP_STARTUP
 	MPASS(mp_ncpus == 1 || smp_started);
 	thread_lock(curthread);
 	sched_bind(curthread, 0);
 	thread_unlock(curthread);
 #else
 	if (smp_started) {
 		thread_lock(curthread);
 		sched_bind(curthread, 0);
 		thread_unlock(curthread);
 	}
 #endif
 	KASSERT((PCPU_GET(cpuid) == 0), ("Not running on CPU#0"));
 
 	/*
 	 * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE
 	 * drivers need this.
 	 */
 	mtx_lock(&Giant);
 	if (DEVICE_SUSPEND(root_bus) != 0) {
 		mtx_unlock(&Giant);
 		printf("%s: device_suspend failed\n", __func__);
 		return;
 	}
 
 #ifdef SMP
 #ifdef EARLY_AP_STARTUP
 	/*
 	 * Suspend other CPUs. This prevents IPIs while we
 	 * are resuming, and will allow us to reset per-cpu
 	 * vcpu_info on resume.
 	 */
 	cpu_suspend_map = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &cpu_suspend_map);
 	if (!CPU_EMPTY(&cpu_suspend_map))
 		suspend_cpus(cpu_suspend_map);
 #else
 	CPU_ZERO(&cpu_suspend_map);	/* silence gcc */
 	if (smp_started) {
 		/*
 		 * Suspend other CPUs. This prevents IPIs while we
 		 * are resuming, and will allow us to reset per-cpu
 		 * vcpu_info on resume.
 		 */
 		cpu_suspend_map = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &cpu_suspend_map);
 		if (!CPU_EMPTY(&cpu_suspend_map))
 			suspend_cpus(cpu_suspend_map);
 	}
 #endif
 #endif
 
 	/*
 	 * Prevent any races with evtchn_interrupt() handler.
 	 */
 	disable_intr();
 	intr_suspend();
 	xen_hvm_suspend();
 
 	xen_suspend_cancelled = !!HYPERVISOR_suspend(0);
 
 	if (!xen_suspend_cancelled) {
 		xen_hvm_resume(false);
 	}
 	intr_resume(xen_suspend_cancelled != 0);
 	enable_intr();
 
 	/*
 	 * Reset grant table info.
 	 */
 	if (!xen_suspend_cancelled) {
 		gnttab_resume(NULL);
 	}
 
 #ifdef SMP
 	if (!CPU_EMPTY(&cpu_suspend_map)) {
 		/*
 		 * Now that event channels have been initialized,
 		 * resume CPUs.
 		 */
 		resume_cpus(cpu_suspend_map);
 		/* Send an IPI_BITMAP in case there are pending bitmap IPIs. */
 		lapic_ipi_vectored(IPI_BITMAP_VECTOR, APIC_IPI_DEST_ALL);
 	}
 #endif
 
 	/*
 	 * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or
 	 * similar.
 	 */
 	DEVICE_RESUME(root_bus);
 	mtx_unlock(&Giant);
 
 	/*
 	 * Warm up timecounter again and reset system clock.
 	 */
 	timecounter->tc_get_timecount(timecounter);
 	inittodr(time_second);
 
 #ifdef EARLY_AP_STARTUP
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #else
 	if (smp_started) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
+	resume_all_fs();
 	resume_all_proc();
 
 	EVENTHANDLER_INVOKE(power_resume);
 
 	if (bootverbose)
 		printf("System resumed after suspension\n");
 
 }
 
 static void
 xctrl_crash()
 {
 	panic("Xen directed crash");
 }
 
 static void
 xen_pv_shutdown_final(void *arg, int howto)
 {
 	/*
 	 * Inform the hypervisor that shutdown is complete.
 	 * This is not necessary in HVM domains since Xen
 	 * emulates ACPI in that mode and FreeBSD's ACPI
 	 * support will request this transition.
 	 */
 	if (howto & (RB_HALT | RB_POWEROFF))
 		HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 	else
 		HYPERVISOR_shutdown(SHUTDOWN_reboot);
 }
 
 /*------------------------------ Event Reception -----------------------------*/
 static void
 xctrl_on_watch_event(struct xs_watch *watch, const char **vec, unsigned int len)
 {
 	const struct xctrl_shutdown_reason *reason;
 	const struct xctrl_shutdown_reason *last_reason;
 	char *result;
 	int   error;
 	int   result_len;
 
 	error = xs_read(XST_NIL, "control", "shutdown",
 			&result_len, (void **)&result);
 	if (error != 0 || result_len == 0)
 		return;
 
 	/* Acknowledge the request by writing back an empty string. */
 	error = xs_write(XST_NIL, "control", "shutdown", "");
 	if (error != 0)
 		printf("unable to ack shutdown request, proceeding anyway\n");
 
 	reason = xctrl_shutdown_reasons;
 	last_reason = reason + nitems(xctrl_shutdown_reasons);
 	while (reason < last_reason) {
 		if (!strcmp(result, reason->name)) {
 			reason->handler();
 			break;
 		}
 		reason++;
 	}
 
 	free(result, M_XENSTORE);
 }
 
 /*------------------ Private Device Attachment Functions  --------------------*/
 /**
  * \brief Identify instances of this device type in the system.
  *
  * \param driver  The driver performing this identify action.
  * \param parent  The NewBus parent device for any devices this method adds.
  */
 static void
 xctrl_identify(driver_t *driver __unused, device_t parent)
 {
 	/*
 	 * A single device instance for our driver is always present
 	 * in a system operating under Xen.
 	 */
 	BUS_ADD_CHILD(parent, 0, driver->name, 0);
 }
 
 /**
  * \brief Probe for the existence of the Xen Control device
  *
  * \param dev  NewBus device_t for this Xen control instance.
  *
  * \return  Always returns 0 indicating success.
  */
 static int 
 xctrl_probe(device_t dev)
 {
 	device_set_desc(dev, "Xen Control Device");
 
 	return (BUS_PROBE_NOWILDCARD);
 }
 
 /**
  * \brief Attach the Xen control device.
  *
  * \param dev  NewBus device_t for this Xen control instance.
  *
  * \return  On success, 0. Otherwise an errno value indicating the
  *          type of failure.
  */
 static int
 xctrl_attach(device_t dev)
 {
 	struct xctrl_softc *xctrl;
 
 	xctrl = device_get_softc(dev);
 
 	/* Activate watch */
 	xctrl->xctrl_watch.node = "control/shutdown";
 	xctrl->xctrl_watch.callback = xctrl_on_watch_event;
 	xctrl->xctrl_watch.callback_data = (uintptr_t)xctrl;
 	xs_register_watch(&xctrl->xctrl_watch);
 
 	if (xen_pv_domain())
 		EVENTHANDLER_REGISTER(shutdown_final, xen_pv_shutdown_final, NULL,
 		                      SHUTDOWN_PRI_LAST);
 
 	return (0);
 }
 
 /**
  * \brief Detach the Xen control device.
  *
  * \param dev  NewBus device_t for this Xen control device instance.
  *
  * \return  On success, 0. Otherwise an errno value indicating the
  *          type of failure.
  */
 static int
 xctrl_detach(device_t dev)
 {
 	struct xctrl_softc *xctrl;
 
 	xctrl = device_get_softc(dev);
 
 	/* Release watch */
 	xs_unregister_watch(&xctrl->xctrl_watch);
 
 	return (0);
 }
 
 /*-------------------- Private Device Attachment Data  -----------------------*/
 static device_method_t xctrl_methods[] = { 
 	/* Device interface */ 
 	DEVMETHOD(device_identify,	xctrl_identify),
 	DEVMETHOD(device_probe,         xctrl_probe), 
 	DEVMETHOD(device_attach,        xctrl_attach), 
 	DEVMETHOD(device_detach,        xctrl_detach), 
 
 	DEVMETHOD_END
 }; 
 
 DEFINE_CLASS_0(xctrl, xctrl_driver, xctrl_methods, sizeof(struct xctrl_softc));
 devclass_t xctrl_devclass; 
 
 DRIVER_MODULE(xctrl, xenstore, xctrl_driver, xctrl_devclass, NULL, NULL);
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 08190d432a93..7ab3b6274491 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -1,2509 +1,2573 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/smp.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/sbuf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
 
 static int	vfs_domount(struct thread *td, const char *fstype, char *fspath,
 		    uint64_t fsflags, struct vfsoptlist **optlist);
 static void	free_mntarg(struct mntarg *ma);
 
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
 static bool	default_autoro = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
     "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx_padalign __exclusive_cache_line mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 EVENTHANDLER_LIST_DEFINE(vfs_mounted);
 EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
 
 static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
 
 /*
  * Global opts, taken by all filesystems
  */
 static const char *global_opts[] = {
 	"errmsg",
 	"fstype",
 	"fspath",
 	"ro",
 	"rw",
 	"nosuid",
 	"noexec",
 	NULL
 };
 
 static int
 mount_init(void *mem, int size, int flags)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 	mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_4,
 	    M_WAITOK | M_ZERO);
 	mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_4,
 	    M_WAITOK | M_ZERO);
 	mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_4,
 	    M_WAITOK | M_ZERO);
 	mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_4,
 	    M_WAITOK | M_ZERO);
 	mp->mnt_ref = 0;
 	mp->mnt_vfs_ops = 1;
 	mp->mnt_rootvnode = NULL;
 	return (0);
 }
 
 static void
 mount_fini(void *mem, int size)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	uma_zfree_pcpu(pcpu_zone_4, mp->mnt_writeopcount_pcpu);
 	uma_zfree_pcpu(pcpu_zone_4, mp->mnt_lockref_pcpu);
 	uma_zfree_pcpu(pcpu_zone_4, mp->mnt_ref_pcpu);
 	uma_zfree_pcpu(pcpu_zone_4, mp->mnt_thread_in_ops_pcpu);
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 static void
 vfs_mount_init(void *dummy __unused)
 {
 
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 }
 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
  */
 
 /* Remove one mount option. */
 static void
 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 {
 
 	TAILQ_REMOVE(opts, opt, link);
 	free(opt->name, M_MOUNT);
 	if (opt->value != NULL)
 		free(opt->value, M_MOUNT);
 	free(opt, M_MOUNT);
 }
 
 /* Release all resources related to the mount options. */
 void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	while (!TAILQ_EMPTY(opts)) {
 		opt = TAILQ_FIRST(opts);
 		vfs_freeopt(opts, opt);
 	}
 	free(opts, M_MOUNT);
 }
 
 void
 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt, *temp;
 
 	if (opts == NULL)
 		return;
 	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 		if (strcmp(opt->name, name) == 0)
 			vfs_freeopt(opts, opt);
 	}
 }
 
 static int
 vfs_isopt_ro(const char *opt)
 {
 
 	if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
 	    strcmp(opt, "norw") == 0)
 		return (1);
 	return (0);
 }
 
 static int
 vfs_isopt_rw(const char *opt)
 {
 
 	if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Check if options are equal (with or without the "no" prefix).
  */
 static int
 vfs_equalopts(const char *opt1, const char *opt2)
 {
 	char *p;
 
 	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
 	if (strcmp(opt1, opt2) == 0)
 		return (1);
 	/* "noopt" vs. "opt" */
 	if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 		return (1);
 	/* "opt" vs. "noopt" */
 	if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 		return (1);
 	while ((p = strchr(opt1, '.')) != NULL &&
 	    !strncmp(opt1, opt2, ++p - opt1)) {
 		opt2 += p - opt1;
 		opt1 = p;
 		/* "foo.noopt" vs. "foo.opt" */
 		if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 			return (1);
 		/* "foo.opt" vs. "foo.noopt" */
 		if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 			return (1);
 	}
 	/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
 	if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
 	    (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
 		return (1);
 	return (0);
 }
 
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
  * the last occurrence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt, *opt2, *tmp;
 
 	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
 		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
 		while (opt2 != NULL) {
 			if (vfs_equalopts(opt->name, opt2->name)) {
 				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
 				vfs_freeopt(opts, opt2);
 				opt2 = tmp;
 			} else {
 				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
 			}
 		}
 	}
 }
 
 /*
  * Build a linked list of mount options from a struct uio.
  */
 int
 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	size_t memused, namelen, optlen;
 	unsigned int i, iovcnt;
 	int error;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	memused = 0;
 	iovcnt = auio->uio_iovcnt;
 	for (i = 0; i < iovcnt; i += 2) {
 		namelen = auio->uio_iov[i].iov_len;
 		optlen = auio->uio_iov[i + 1].iov_len;
 		memused += sizeof(struct vfsopt) + optlen + namelen;
 		/*
 		 * Avoid consuming too much memory, and attempts to overflow
 		 * memused.
 		 */
 		if (memused > VFS_MOUNTARG_SIZE_MAX ||
 		    optlen > VFS_MOUNTARG_SIZE_MAX ||
 		    namelen > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 		opt->value = NULL;
 		opt->len = 0;
 		opt->pos = i / 2;
 		opt->seen = 0;
 
 		/*
 		 * Do this early, so jumps to "bad" will free the current
 		 * option.
 		 */
 		TAILQ_INSERT_TAIL(opts, opt, link);
 
 		if (auio->uio_segflg == UIO_SYSSPACE) {
 			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 		} else {
 			error = copyin(auio->uio_iov[i].iov_base, opt->name,
 			    namelen);
 			if (error)
 				goto bad;
 		}
 		/* Ensure names are null-terminated strings. */
 		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
 			error = EINVAL;
 			goto bad;
 		}
 		if (optlen != 0) {
 			opt->len = optlen;
 			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 			if (auio->uio_segflg == UIO_SYSSPACE) {
 				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 				    optlen);
 			} else {
 				error = copyin(auio->uio_iov[i + 1].iov_base,
 				    opt->value, optlen);
 				if (error)
 					goto bad;
 			}
 		}
 	}
 	vfs_sanitizeopts(opts);
 	*options = opts;
 	return (0);
 bad:
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Merge the old mount options with the new ones passed
  * in the MNT_UPDATE case.
  *
  * XXX: This function will keep a "nofoo" option in the new
  * options.  E.g, if the option's canonical name is "foo",
  * "nofoo" ends up in the mount point's active options.
  */
 static void
 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
 {
 	struct vfsopt *opt, *new;
 
 	TAILQ_FOREACH(opt, oldopts, link) {
 		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		new->name = strdup(opt->name, M_MOUNT);
 		if (opt->len != 0) {
 			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 			bcopy(opt->value, new->value, opt->len);
 		} else
 			new->value = NULL;
 		new->len = opt->len;
 		new->seen = opt->seen;
 		TAILQ_INSERT_HEAD(toopts, new, link);
 	}
 	vfs_sanitizeopts(toopts);
 }
 
 /*
  * Mount a filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nmount_args {
 	struct iovec *iovp;
 	unsigned int iovcnt;
 	int flags;
 };
 #endif
 int
 sys_nmount(struct thread *td, struct nmount_args *uap)
 {
 	struct uio *auio;
 	int error;
 	u_int iovcnt;
 	uint64_t flags;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit archtectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
 	    uap->iovp, uap->iovcnt, flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	iovcnt = uap->iovcnt;
 	/*
 	 * Check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((iovcnt & 1) || (iovcnt < 4)) {
 		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
 		    uap->iovcnt);
 		return (EINVAL);
 	}
 
 	error = copyinuio(uap->iovp, iovcnt, &auio);
 	if (error) {
 		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
 		    __func__, error);
 		return (error);
 	}
 	error = vfs_donmount(td, flags, auio);
 
 	free(auio, M_IOV);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Various utility functions
  */
 
 void
 vfs_ref(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	if (vfs_op_thread_enter(mp)) {
 		vfs_mp_count_add_pcpu(mp, ref, 1);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 }
 
 void
 vfs_rel(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	if (vfs_op_thread_enter(mp)) {
 		vfs_mp_count_sub_pcpu(mp, ref, 1);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Allocate and initialize the mount point struct.
  */
 struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
     struct ucred *cred)
 {
 	struct mount *mp;
 
 	mp = uma_zalloc(mount_zone, M_WAITOK);
 	bzero(&mp->mnt_startzero,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_lazyvnodelist);
 	mp->mnt_lazyvnodelistsize = 0;
 	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
 	    mp->mnt_writeopcount != 0)
 		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
 	if (mp->mnt_vfs_ops != 1)
 		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
 		    mp->mnt_vfs_ops);
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	mp->mnt_cred = crdup(cred);
 	mp->mnt_stat.f_owner = cred->cr_uid;
 	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
 	mp->mnt_iosize_max = DFLTPHYS;
 #ifdef MAC
 	mac_mount_init(mp);
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	TAILQ_INIT(&mp->mnt_uppers);
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
 void
 vfs_mount_destroy(struct mount *mp)
 {
 
 	if (mp->mnt_vfs_ops == 0)
 		panic("%s: entered with zero vfs_ops\n", __func__);
 
 	vfs_assert_mount_counters(mp);
 
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	while (mp->mnt_ref)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
 	KASSERT(mp->mnt_ref == 0,
 	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
 	    __FILE__, __LINE__));
 	if (mp->mnt_writeopcount != 0)
 		panic("vfs_mount_destroy: nonzero writeopcount");
 	if (mp->mnt_secondary_writes != 0)
 		panic("vfs_mount_destroy: nonzero secondary_writes");
 	atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
 			vn_printf(vp, "dangling vnode ");
 		panic("unmount: dangling vnode");
 	}
 	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
 	if (mp->mnt_lazyvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero lazyvnodelistsize");
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
 
 	if (mp->mnt_vfs_ops != 1)
 		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
 		    mp->mnt_vfs_ops);
 
 	if (mp->mnt_rootvnode != NULL)
 		panic("%s: mount point still has a root vnode %p\n", __func__,
 		    mp->mnt_rootvnode);
 
 	if (mp->mnt_vnodecovered != NULL)
 		vrele(mp->mnt_vnodecovered);
 #ifdef MAC
 	mac_mount_destroy(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	crfree(mp->mnt_cred);
 	uma_zfree(mount_zone, mp);
 }
 
 static bool
 vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
 {
 	/* This is an upgrade of an exisiting mount. */
 	if ((fsflags & MNT_UPDATE) != 0)
 		return (false);
 	/* This is already an R/O mount. */
 	if ((fsflags & MNT_RDONLY) != 0)
 		return (false);
 
 	switch (error) {
 	case ENODEV:	/* generic, geom, ... */
 	case EACCES:	/* cam/scsi, ... */
 	case EROFS:	/* md, mmcsd, ... */
 		/*
 		 * These errors can be returned by the storage layer to signal
 		 * that the media is read-only.  No harm in the R/O mount
 		 * attempt if the error was returned for some other reason.
 		 */
 		return (true);
 	default:
 		return (false);
 	}
 }
 
 int
 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
 	struct vfsopt *opt, *tmp_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 	bool autoro;
 
 	errmsg = fspath = NULL;
 	errmsg_len = fspathlen = 0;
 	errmsg_pos = -1;
 	autoro = default_autoro;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
 	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
 	 * Ensure they are NUL terminated as well.
 	 */
 	fstypelen = 0;
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fstype", errmsg_len);
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fspath", errmsg_len);
 		goto bail;
 	}
 
 	/*
 	 * We need to see if we have the "update" option
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
 	 */
 	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
 		int do_freeopt = 0;
 
 		if (strcmp(opt->name, "update") == 0) {
 			fsflags |= MNT_UPDATE;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "async") == 0)
 			fsflags |= MNT_ASYNC;
 		else if (strcmp(opt->name, "force") == 0) {
 			fsflags |= MNT_FORCE;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "reload") == 0) {
 			fsflags |= MNT_RELOAD;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "multilabel") == 0)
 			fsflags |= MNT_MULTILABEL;
 		else if (strcmp(opt->name, "noasync") == 0)
 			fsflags &= ~MNT_ASYNC;
 		else if (strcmp(opt->name, "noatime") == 0)
 			fsflags |= MNT_NOATIME;
 		else if (strcmp(opt->name, "atime") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoatime", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterr") == 0)
 			fsflags |= MNT_NOCLUSTERR;
 		else if (strcmp(opt->name, "clusterr") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterr", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterw") == 0)
 			fsflags |= MNT_NOCLUSTERW;
 		else if (strcmp(opt->name, "clusterw") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterw", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noexec") == 0)
 			fsflags |= MNT_NOEXEC;
 		else if (strcmp(opt->name, "exec") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoexec", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosuid") == 0)
 			fsflags |= MNT_NOSUID;
 		else if (strcmp(opt->name, "suid") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosuid", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "symfollow") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosymfollow", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noro") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "rw") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "ro") == 0) {
 			fsflags |= MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "rdonly") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("ro", M_MOUNT);
 			fsflags |= MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "autoro") == 0) {
 			do_freeopt = 1;
 			autoro = true;
 		}
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
 		else if (strcmp(opt->name, "automounted") == 0) {
 			fsflags |= MNT_AUTOMOUNTED;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "nocover") == 0) {
 			fsflags |= MNT_NOCOVER;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "cover") == 0) {
 			fsflags &= ~MNT_NOCOVER;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "emptydir") == 0) {
 			fsflags |= MNT_EMPTYDIR;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "noemptydir") == 0) {
 			fsflags &= ~MNT_EMPTYDIR;
 			do_freeopt = 1;
 		}
 		if (do_freeopt)
 			vfs_freeopt(optlist, opt);
 	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
 
 	/*
 	 * See if we can mount in the read-only mode if the error code suggests
 	 * that it could be possible and the mount options allow for that.
 	 * Never try it if "[no]{ro|rw}" has been explicitly requested and not
 	 * overridden by "autoro".
 	 */
 	if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
 		printf("%s: R/W mount failed, possibly R/O media,"
 		    " trying R/O mount\n", __func__);
 		fsflags |= MNT_RDONLY;
 		error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
 	}
 bail:
 	/* copyout the errmsg */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
 	    && errmsg_len > 0 && errmsg != NULL) {
 		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
 			bcopy(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} else {
 			copyout(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		}
 	}
 
 	if (optlist != NULL)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
  * Old mount API.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mount_args {
 	char	*type;
 	char	*path;
 	int	flags;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 sys_mount(struct thread *td, struct mount_args *uap)
 {
 	char *fstype;
 	struct vfsconf *vfsp = NULL;
 	struct mntarg *ma = NULL;
 	uint64_t flags;
 	int error;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit architectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
 	if (error) {
 		free(fstype, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_TEXT(fstype);
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
 	if (vfsp == NULL)
 		return (ENOENT);
 	if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 &&
 	    vfsp->vfc_vfsops_sd->vfs_cmount == NULL) ||
 	    ((vfsp->vfc_flags & VFCF_SBDRY) == 0 &&
 	    vfsp->vfc_vfsops->vfs_cmount == NULL))
 		return (EOPNOTSUPP);
 
 	ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
 	ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
 
 	if ((vfsp->vfc_flags & VFCF_SBDRY) != 0)
 		return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags));
 	return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags));
 }
 
 /*
  * vfs_domount_first(): first file system mount (not update)
  */
 static int
 vfs_domount_first(
 	struct thread *td,		/* Calling thread. */
 	struct vfsconf *vfsp,		/* File system type. */
 	char *fspath,			/* Mount path. */
 	struct vnode *vp,		/* Vnode to be covered. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vattr va;
 	struct mount *mp;
 	struct vnode *newdp, *rootvp;
 	int error, error1;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
 
 	if ((fsflags & MNT_EMPTYDIR) != 0) {
 		error = vfs_emptydir(vp);
 		if (error != 0) {
 			vput(vp);
 			return (error);
 		}
 	}
 
 	/*
 	 * If the jail of the calling thread lacks permission for this type of
 	 * file system, deny immediately.
 	 */
 	if (jailed(td->td_ucred) && !prison_allow(td->td_ucred,
 	    vfsp->vfc_prison_flag)) {
 		vput(vp);
 		return (EPERM);
 	}
 
 	/*
 	 * If the user is not root, ensure that they own the directory
 	 * onto which we are attempting to mount.
 	 */
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
 	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
 		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
 	if (error == 0)
 		error = vinvalbuf(vp, V_SAVE, 0, 0);
 	if (error == 0 && vp->v_type != VDIR)
 		error = ENOTDIR;
 	if (error == 0) {
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
 			vp->v_iflag |= VI_MOUNT;
 		else
 			error = EBUSY;
 		VI_UNLOCK(vp);
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	vn_seqc_write_begin(vp);
 	VOP_UNLOCK(vp);
 
 	/* Allocate and initialize the filesystem. */
 	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
 	/* XXXMAC: pass to vfs_mount_alloc? */
 	mp->mnt_optnew = *optlist;
 	/* Set the mount level flags. */
 	mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error1 = 0;
 	if ((error = VFS_MOUNT(mp)) != 0 ||
 	    (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
 	    (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
 		rootvp = NULL;
 		if (error1 != 0) {
 			error = error1;
 			rootvp = vfs_cache_root_clear(mp);
 			if (rootvp != NULL) {
 				vhold(rootvp);
 				vrele(rootvp);
 			}
 			if ((error1 = VFS_UNMOUNT(mp, 0)) != 0)
 				printf("VFS_UNMOUNT returned %d\n", error1);
 		}
 		vfs_unbusy(mp);
 		mp->mnt_vnodecovered = NULL;
 		vfs_mount_destroy(mp);
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		if (rootvp != NULL) {
 			vn_seqc_write_end(rootvp);
 			vdrop(rootvp);
 		}
 		vn_seqc_write_end(vp);
 		vrele(vp);
 		return (error);
 	}
 	vn_seqc_write_begin(newdp);
 	VOP_UNLOCK(newdp);
 
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 
 	/*
 	 * Prevent external consumers of mount options from reading mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	cache_purge(vp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	vp->v_mountedhere = mp;
 	/* Place the new filesystem at the end of the mount list. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	vfs_event_signal(NULL, VQ_MOUNT, 0);
 	vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_UNLOCK(vp);
 	EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
 	VOP_UNLOCK(newdp);
 	mount_devctl_event("MOUNT", mp, false);
 	mountcheckdirs(vp, newdp);
 	vn_seqc_write_end(vp);
 	vn_seqc_write_end(newdp);
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	return (0);
 }
 
 /*
  * vfs_domount_update(): update of mounted file system
  */
 static int
 vfs_domount_update(
 	struct thread *td,		/* Calling thread. */
 	struct vnode *vp,		/* Mount point vnode. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct export_args export;
 	struct o2export_args o2export;
 	struct vnode *rootvp;
 	void *bufp;
 	struct mount *mp;
 	int error, export_error, i, len;
 	uint64_t flag;
 	gid_t *grps;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
 	mp = vp->v_mount;
 
 	if ((vp->v_vflag & VV_ROOT) == 0) {
 		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
 		    == 0)
 			error = EXDEV;
 		else
 			error = EINVAL;
 		vput(vp);
 		return (error);
 	}
 
 	/*
 	 * We only allow the filesystem to be reloaded if it
 	 * is currently mounted read-only.
 	 */
 	flag = mp->mnt_flag;
 	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
 		vput(vp);
 		return (EOPNOTSUPP);	/* Needs translation */
 	}
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that
 	 * did the original mount is permitted to update it.
 	 */
 	error = vfs_suser(mp, td);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	if (vfs_busy(mp, MBF_NOWAIT)) {
 		vput(vp);
 		return (EBUSY);
 	}
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
 		VI_UNLOCK(vp);
 		vfs_unbusy(mp);
 		vput(vp);
 		return (EBUSY);
 	}
 	vp->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp);
 
 	vfs_op_enter(mp);
 	vn_seqc_write_begin(vp);
 
 	rootvp = NULL;
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
 		error = EBUSY;
 		goto end;
 	}
 	mp->mnt_flag &= ~MNT_UPDATEMASK;
 	mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
 	    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
 	if ((mp->mnt_flag & MNT_ASYNC) == 0)
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	rootvp = vfs_cache_root_clear(mp);
 	MNT_IUNLOCK(mp);
 	mp->mnt_optnew = *optlist;
 	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error = VFS_MOUNT(mp);
 
 	export_error = 0;
 	/* Process the export option. */
 	if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
 	    &len) == 0) {
 		/* Assume that there is only 1 ABI for each length. */
 		switch (len) {
 		case (sizeof(struct oexport_args)):
 			bzero(&o2export, sizeof(o2export));
 			/* FALLTHROUGH */
 		case (sizeof(o2export)):
 			bcopy(bufp, &o2export, len);
 			export.ex_flags = (uint64_t)o2export.ex_flags;
 			export.ex_root = o2export.ex_root;
 			export.ex_uid = o2export.ex_anon.cr_uid;
 			export.ex_groups = NULL;
 			export.ex_ngroups = o2export.ex_anon.cr_ngroups;
 			if (export.ex_ngroups > 0) {
 				if (export.ex_ngroups <= XU_NGROUPS) {
 					export.ex_groups = malloc(
 					    export.ex_ngroups * sizeof(gid_t),
 					    M_TEMP, M_WAITOK);
 					for (i = 0; i < export.ex_ngroups; i++)
 						export.ex_groups[i] =
 						  o2export.ex_anon.cr_groups[i];
 				} else
 					export_error = EINVAL;
 			} else if (export.ex_ngroups < 0)
 				export_error = EINVAL;
 			export.ex_addr = o2export.ex_addr;
 			export.ex_addrlen = o2export.ex_addrlen;
 			export.ex_mask = o2export.ex_mask;
 			export.ex_masklen = o2export.ex_masklen;
 			export.ex_indexfile = o2export.ex_indexfile;
 			export.ex_numsecflavors = o2export.ex_numsecflavors;
 			if (export.ex_numsecflavors < MAXSECFLAVORS) {
 				for (i = 0; i < export.ex_numsecflavors; i++)
 					export.ex_secflavors[i] =
 					    o2export.ex_secflavors[i];
 			} else
 				export_error = EINVAL;
 			if (export_error == 0)
 				export_error = vfs_export(mp, &export);
 			free(export.ex_groups, M_TEMP);
 			break;
 		case (sizeof(export)):
 			bcopy(bufp, &export, len);
 			grps = NULL;
 			if (export.ex_ngroups > 0) {
 				if (export.ex_ngroups <= NGROUPS_MAX) {
 					grps = malloc(export.ex_ngroups *
 					    sizeof(gid_t), M_TEMP, M_WAITOK);
 					export_error = copyin(export.ex_groups,
 					    grps, export.ex_ngroups *
 					    sizeof(gid_t));
 					if (export_error == 0)
 						export.ex_groups = grps;
 				} else
 					export_error = EINVAL;
 			} else if (export.ex_ngroups == 0)
 				export.ex_groups = NULL;
 			else
 				export_error = EINVAL;
 			if (export_error == 0)
 				export_error = vfs_export(mp, &export);
 			free(grps, M_TEMP);
 			break;
 		default:
 			export_error = EINVAL;
 			break;
 		}
 	}
 
 	MNT_ILOCK(mp);
 	if (error == 0) {
 		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
 		    MNT_SNAPSHOT);
 	} else {
 		/*
 		 * If we fail, restore old mount flags. MNT_QUOTA is special,
 		 * because it is not part of MNT_UPDATEMASK, but it could have
 		 * changed in the meantime if quotactl(2) was called.
 		 * All in all we want current value of MNT_QUOTA, not the old
 		 * one.
 		 */
 		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 	}
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	if (error != 0)
 		goto end;
 
 	mount_devctl_event("REMOUNT", mp, true);
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 	(void)VFS_STATFS(mp, &mp->mnt_stat);
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	else
 		vfs_deallocate_syncvnode(mp);
 end:
 	vfs_op_exit(mp);
 	if (rootvp != NULL) {
 		vn_seqc_write_end(rootvp);
 		vrele(rootvp);
 	}
 	vn_seqc_write_end(vp);
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	vrele(vp);
 	return (error != 0 ? error : export_error);
 }
 
 /*
  * vfs_domount(): actually attempt a filesystem mount.
  */
 static int
 vfs_domount(
 	struct thread *td,		/* Calling thread. */
 	const char *fstype,		/* Filesystem type. */
 	char *fspath,			/* Mount path. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vfsconf *vfsp;
 	struct nameidata nd;
 	struct vnode *vp;
 	char *pathbuf;
 	int error;
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	if (jailed(td->td_ucred) || usermount == 0) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
 	if (fsflags & MNT_EXPORTED) {
 		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
 		if (error)
 			return (error);
 	}
 	if (fsflags & MNT_SUIDDIR) {
 		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
 	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
 		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
 			fsflags |= MNT_NOSUID | MNT_USER;
 	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	vfsp = NULL;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* Don't try to load KLDs if we're mounting the root. */
 		if (fsflags & MNT_ROOTFS)
 			vfsp = vfs_byname(fstype);
 		else
 			vfsp = vfs_byname_kld(fstype, td, &error);
 		if (vfsp == NULL)
 			return (ENODEV);
 	}
 
 	/*
 	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, fspath, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		if ((vp->v_vflag & VV_ROOT) != 0 &&
 		    (fsflags & MNT_NOCOVER) != 0) {
 			vput(vp);
 			return (EBUSY);
 		}
 		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		strcpy(pathbuf, fspath);
 		error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN);
 		if (error == 0) {
 			error = vfs_domount_first(td, vfsp, pathbuf, vp,
 			    fsflags, optlist);
 		}
 		free(pathbuf, M_TEMP);
 	} else
 		error = vfs_domount_update(td, vp, fsflags, optlist);
 
 	return (error);
 }
 
 /*
  * Unmount a filesystem.
  *
  * Note: unmount takes a path to the vnode mounted on as argument, not
  * special file (as before).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
 	char	*path;
 	int	flags;
 };
 #endif
 /* ARGSUSED */
 int
 sys_unmount(struct thread *td, struct unmount_args *uap)
 {
 
 	return (kern_unmount(td, uap->path, uap->flags));
 }
 
 int
 kern_unmount(struct thread *td, const char *path, int flags)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	char *pathbuf;
 	int error, id0, id1;
 
 	AUDIT_ARG_VALUE(flags);
 	if (jailed(td->td_ucred) || usermount == 0) {
 		error = priv_check(td, PRIV_VFS_UNMOUNT);
 		if (error)
 			return (error);
 	}
 
 	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(path, pathbuf, MNAMELEN, NULL);
 	if (error) {
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
 	if (flags & MNT_BYFSID) {
 		AUDIT_ARG_TEXT(pathbuf);
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
 			    mp->mnt_stat.f_fsid.val[1] == id1) {
 				vfs_ref(mp);
 				break;
 			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	} else {
 		/*
 		 * Try to find global path for path argument.
 		 */
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 		    UIO_SYSSPACE, pathbuf, td);
 		if (namei(&nd) == 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
 			    MNAMELEN);
 			if (error == 0)
 				vput(nd.ni_vp);
 		}
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
 				vfs_ref(mp);
 				break;
 			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	}
 	free(pathbuf, M_TEMP);
 	if (mp == NULL) {
 		/*
 		 * Previously we returned ENOENT for a nonexistent path and
 		 * EINVAL for a non-mountpoint.  We cannot tell these apart
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
 		return ((flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
 	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		vfs_rel(mp);
 		return (EINVAL);
 	}
 	error = dounmount(mp, flags, td);
 	return (error);
 }
 
 /*
  * Return error if any of the vnodes, ignoring the root vnode
  * and the syncer vnode, have non-zero usecount.
  *
  * This function is purely advisory - it can return false positives
  * and negatives.
  */
 static int
 vfs_check_usecounts(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
 		    vp->v_usecount != 0) {
 			VI_UNLOCK(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (EBUSY);
 		}
 		VI_UNLOCK(vp);
 	}
 
 	return (0);
 }
 
 static void
 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
 {
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	mp->mnt_kern_flag &= ~mntkflags;
 	if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 	if (coveredvp != NULL) {
 		VOP_UNLOCK(coveredvp);
 		vdrop(coveredvp);
 	}
 	vn_finished_write(mp);
 }
 
 /*
  * There are various reference counters associated with the mount point.
  * Normally it is permitted to modify them without taking the mnt ilock,
  * but this behavior can be temporarily disabled if stable value is needed
  * or callers are expected to block (e.g. to not allow new users during
  * forced unmount).
  */
 void
 vfs_op_enter(struct mount *mp)
 {
 	int cpu;
 
 	MNT_ILOCK(mp);
 	mp->mnt_vfs_ops++;
 	if (mp->mnt_vfs_ops > 1) {
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	vfs_op_barrier_wait(mp);
 	CPU_FOREACH(cpu) {
 		mp->mnt_ref +=
 		    zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu);
 		mp->mnt_lockref +=
 		    zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu);
 		mp->mnt_writeopcount +=
 		    zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu);
 	}
 	if (mp->mnt_ref <= 0 || mp->mnt_lockref < 0 || mp->mnt_writeopcount < 0)
 		panic("%s: invalid count(s) on mp %p: ref %d lockref %d writeopcount %d\n",
 		    __func__, mp, mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 	vfs_assert_mount_counters(mp);
 }
 
 void
 vfs_op_exit_locked(struct mount *mp)
 {
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	if (mp->mnt_vfs_ops <= 0)
 		panic("%s: invalid vfs_ops count %d for mp %p\n",
 		    __func__, mp->mnt_vfs_ops, mp);
 	mp->mnt_vfs_ops--;
 }
 
 void
 vfs_op_exit(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 }
 
 struct vfs_op_barrier_ipi {
 	struct mount *mp;
 	struct smp_rendezvous_cpus_retry_arg srcra;
 };
 
 static void
 vfs_op_action_func(void *arg)
 {
 	struct vfs_op_barrier_ipi *vfsopipi;
 	struct mount *mp;
 
 	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
 	mp = vfsopipi->mp;
 
 	if (!vfs_op_thread_entered(mp))
 		smp_rendezvous_cpus_done(arg);
 }
 
 static void
 vfs_op_wait_func(void *arg, int cpu)
 {
 	struct vfs_op_barrier_ipi *vfsopipi;
 	struct mount *mp;
 	int *in_op;
 
 	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
 	mp = vfsopipi->mp;
 
 	in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu);
 	while (atomic_load_int(in_op))
 		cpu_spinwait();
 }
 
 void
 vfs_op_barrier_wait(struct mount *mp)
 {
 	struct vfs_op_barrier_ipi vfsopipi;
 
 	vfsopipi.mp = mp;
 
 	smp_rendezvous_cpus_retry(all_cpus,
 	    smp_no_rendezvous_barrier,
 	    vfs_op_action_func,
 	    smp_no_rendezvous_barrier,
 	    vfs_op_wait_func,
 	    &vfsopipi.srcra);
 }
 
 #ifdef DIAGNOSTIC
 void
 vfs_assert_mount_counters(struct mount *mp)
 {
 	int cpu;
 
 	if (mp->mnt_vfs_ops == 0)
 		return;
 
 	CPU_FOREACH(cpu) {
 		if (*zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 ||
 		    *zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 ||
 		    *zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0)
 			vfs_dump_mount_counters(mp);
 	}
 }
 
 void
 vfs_dump_mount_counters(struct mount *mp)
 {
 	int cpu, *count;
 	int ref, lockref, writeopcount;
 
 	printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
 
 	printf("        ref : ");
 	ref = mp->mnt_ref;
 	CPU_FOREACH(cpu) {
 		count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu);
 		printf("%d ", *count);
 		ref += *count;
 	}
 	printf("\n");
 	printf("    lockref : ");
 	lockref = mp->mnt_lockref;
 	CPU_FOREACH(cpu) {
 		count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu);
 		printf("%d ", *count);
 		lockref += *count;
 	}
 	printf("\n");
 	printf("writeopcount: ");
 	writeopcount = mp->mnt_writeopcount;
 	CPU_FOREACH(cpu) {
 		count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu);
 		printf("%d ", *count);
 		writeopcount += *count;
 	}
 	printf("\n");
 
 	printf("counter       struct total\n");
 	printf("ref             %-5d  %-5d\n", mp->mnt_ref, ref);
 	printf("lockref         %-5d  %-5d\n", mp->mnt_lockref, lockref);
 	printf("writeopcount    %-5d  %-5d\n", mp->mnt_writeopcount, writeopcount);
 
 	panic("invalid counts on struct mount");
 }
 #endif
 
 int
 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
 {
 	int *base, *pcpu;
 	int cpu, sum;
 
 	switch (which) {
 	case MNT_COUNT_REF:
 		base = &mp->mnt_ref;
 		pcpu = mp->mnt_ref_pcpu;
 		break;
 	case MNT_COUNT_LOCKREF:
 		base = &mp->mnt_lockref;
 		pcpu = mp->mnt_lockref_pcpu;
 		break;
 	case MNT_COUNT_WRITEOPCOUNT:
 		base = &mp->mnt_writeopcount;
 		pcpu = mp->mnt_writeopcount_pcpu;
 		break;
 	}
 
 	sum = *base;
 	CPU_FOREACH(cpu) {
 		sum += *zpcpu_get_cpu(pcpu, cpu);
 	}
 	return (sum);
 }
 
 /*
  * Do the actual filesystem unmount.
  */
 int
 dounmount(struct mount *mp, int flags, struct thread *td)
 {
 	struct vnode *coveredvp, *rootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
 
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
 		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
 		 */
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp);
 			vdrop(coveredvp);
 			vfs_rel(mp);
 			return (EBUSY);
 		}
 	}
 
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 */
 	error = vfs_suser(mp, td);
 	if (error != 0) {
 		if (coveredvp != NULL) {
 			VOP_UNLOCK(coveredvp);
 			vdrop(coveredvp);
 		}
 		vfs_rel(mp);
 		return (error);
 	}
 
 	vfs_op_enter(mp);
 
 	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
 	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
 	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
 		dounmount_cleanup(mp, coveredvp, 0);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
 	rootvp = vfs_cache_root_clear(mp);
 	if (coveredvp != NULL)
 		vn_seqc_write_begin(coveredvp);
 	if (flags & MNT_NONBUSY) {
 		MNT_IUNLOCK(mp);
 		error = vfs_check_usecounts(mp);
 		MNT_ILOCK(mp);
 		if (error != 0) {
 			vn_seqc_write_end(coveredvp);
 			dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
 			if (rootvp != NULL) {
 				vn_seqc_write_end(rootvp);
 				vrele(rootvp);
 			}
 			return (error);
 		}
 	}
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE) {
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 		MNT_IUNLOCK(mp);
 		/*
 		 * Must be done after setting MNTK_UNMOUNTF and before
 		 * waiting for mnt_lockref to become 0.
 		 */
 		VFS_PURGE(mp);
 		MNT_ILOCK(mp);
 	}
 	error = 0;
 	if (mp->mnt_lockref) {
 		mp->mnt_kern_flag |= MNTK_DRAINING;
 		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
 		    "mount drain", 0);
 	}
 	MNT_IUNLOCK(mp);
 	KASSERT(mp->mnt_lockref == 0,
 	    ("%s: invalid lock refcount in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 	KASSERT(error == 0,
 	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 
 	/*
 	 * We want to keep the vnode around so that we can vn_seqc_write_end
 	 * after we are done with unmount. Downgrade our reference to a mere
 	 * hold count so that we don't interefere with anything.
 	 */
 	if (rootvp != NULL) {
 		vhold(rootvp);
 		vrele(rootvp);
 	}
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	vfs_periodic(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	vfs_deallocate_syncvnode(mp);
 	error = VFS_UNMOUNT(mp, flags);
 	vn_finished_write(mp);
 	/*
 	 * If we failed to flush the dirty blocks for this mount point,
 	 * undo all the cdir/rdir and rootvnode changes we made above.
 	 * Unless we failed to do so because the device is reporting that
 	 * it doesn't exist anymore.
 	 */
 	if (error && error != ENXIO) {
 		MNT_ILOCK(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			MNT_IUNLOCK(mp);
 			vfs_allocate_syncvnode(mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 		    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		if (mp->mnt_kern_flag & MNTK_MWAIT) {
 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
 			wakeup(mp);
 		}
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp) {
 			vn_seqc_write_end(coveredvp);
 			VOP_UNLOCK(coveredvp);
 			vdrop(coveredvp);
 		}
 		if (rootvp != NULL) {
 			vn_seqc_write_end(rootvp);
 			vdrop(rootvp);
 		}
 		return (error);
 	}
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
 		vn_seqc_write_end(coveredvp);
 		VOP_UNLOCK(coveredvp);
 		vdrop(coveredvp);
 	}
 	mount_devctl_event("UNMOUNT", mp, false);
 	if (rootvp != NULL) {
 		vn_seqc_write_end(rootvp);
 		vdrop(rootvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	if (rootvnode != NULL && mp == rootvnode->v_mount) {
 		vrele(rootvnode);
 		rootvnode = NULL;
 	}
 	if (mp == rootdevmp)
 		rootdevmp = NULL;
 	vfs_mount_destroy(mp);
 	return (0);
 }
 
 /*
  * Report errors during filesystem mounting.
  */
 void
 vfs_mount_error(struct mount *mp, const char *fmt, ...)
 {
 	struct vfsoptlist *moptlist = mp->mnt_optnew;
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 void
 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
 {
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Functions for querying mount options/arguments from filesystems.
  */
 
 /*
  * Check that no unknown options are given
  */
 int
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
 	char errmsg[255];
 	const char **t, *p, *q;
 	int ret = 0;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
 		q = NULL;
 		if (p[0] == 'n' && p[1] == 'o')
 			q = p + 2;
 		for(t = global_opts; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		for(t = legal; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		snprintf(errmsg, sizeof(errmsg),
 		    "mount option <%s> is unknown", p);
 		ret = EINVAL;
 	}
 	if (ret != 0) {
 		TAILQ_FOREACH(opt, opts, link) {
 			if (strcmp(opt->name, "errmsg") == 0) {
 				strncpy((char *)opt->value, errmsg, opt->len);
 				break;
 			}
 		}
 		if (opt == NULL)
 			printf("%s\n", errmsg);
 	}
 	return (ret);
 }
 
 /*
  * Get a mount option by its name.
  *
  * Return 0 if the option was found, ENOENT otherwise.
  * If len is non-NULL it will be filled with the length
  * of the option. If buf is non-NULL, it will be filled
  * with the address of the option.
  */
 int
 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (len != NULL)
 				*len = opt->len;
 			if (buf != NULL)
 				*buf = opt->value;
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 int
 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt;
 
 	if (opts == NULL)
 		return (-1);
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			return (opt->pos);
 		}
 	}
 	return (-1);
 }
 
 int
 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
 {
 	char *opt_value, *vtp;
 	quad_t iv;
 	int error, opt_len;
 
 	error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
 	if (error != 0)
 		return (error);
 	if (opt_len == 0 || opt_value == NULL)
 		return (EINVAL);
 	if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
 		return (EINVAL);
 	iv = strtoq(opt_value, &vtp, 0);
 	if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
 		return (EINVAL);
 	if (iv < 0)
 		return (EINVAL);
 	switch (vtp[0]) {
 	case 't': case 'T':
 		iv *= 1024;
 		/* FALLTHROUGH */
 	case 'g': case 'G':
 		iv *= 1024;
 		/* FALLTHROUGH */
 	case 'm': case 'M':
 		iv *= 1024;
 		/* FALLTHROUGH */
 	case 'k': case 'K':
 		iv *= 1024;
 	case '\0':
 		break;
 	default:
 		return (EINVAL);
 	}
 	*value = iv;
 
 	return (0);
 }
 
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
 	struct vfsopt *opt;
 
 	*error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 ||
 		    ((char *)opt->value)[opt->len - 1] != '\0') {
 			*error = EINVAL;
 			return (NULL);
 		}
 		return (opt->value);
 	}
 	*error = ENOENT;
 	return (NULL);
 }
 
 int
 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
 	uint64_t val)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (w != NULL)
 				*w |= val;
 			return (1);
 		}
 	}
 	if (w != NULL)
 		*w &= ~val;
 	return (0);
 }
 
 int
 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct vfsopt *opt;
 	int ret;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 || opt->value == NULL)
 			return (0);
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
 		ret = vsscanf(opt->value, fmt, ap);
 		va_end(ap);
 		return (ret);
 	}
 	return (0);
 }
 
 int
 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len != len)
 				return (EINVAL);
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len < len)
 				return (EINVAL);
 			opt->len = len;
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = strlen(value) + 1;
 		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
 			return (EINVAL);
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Find and copy a mount option.
  *
  * The size of the buffer has to be specified
  * in len, if it is not the same length as the
  * mount option, EINVAL is returned.
  * Returns ENOENT if the option is not found.
  */
 int
 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (len != opt->len)
 				return (EINVAL);
 			bcopy(opt->value, dest, opt->len);
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 
 	/*
 	 * Filesystems only fill in part of the structure for updates, we
 	 * have to read the entirety first to get all content.
 	 */
 	if (sbp != &mp->mnt_stat)
 		memcpy(sbp, &mp->mnt_stat, sizeof(*sbp));
 
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sbp->f_version = STATFS_VERSION;
 	sbp->f_namemax = NAME_MAX;
 	sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 
 	return (mp->mnt_op->vfs_statfs(mp, sbp));
 }
 
 void
 vfs_mountedfrom(struct mount *mp, const char *from)
 {
 
 	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
 	strlcpy(mp->mnt_stat.f_mntfromname, from,
 	    sizeof mp->mnt_stat.f_mntfromname);
 }
 
 /*
  * ---------------------------------------------------------------------
  * This is the api for building mount args and mounting filesystems from
  * inside the kernel.
  *
  * The API works by accumulation of individual args.  First error is
  * latched.
  *
  * XXX: should be documented in new manpage kernel_mount(9)
  */
 
 /* A memory allocation which must be freed when we are done */
 struct mntaarg {
 	SLIST_ENTRY(mntaarg)	next;
 };
 
 /* The header for the mount arguments */
 struct mntarg {
 	struct iovec *v;
 	int len;
 	int error;
 	SLIST_HEAD(, mntaarg)	list;
 };
 
 /*
  * Add a boolean argument.
  *
  * flag is the boolean value.
  * name must start with "no".
  */
 struct mntarg *
 mount_argb(struct mntarg *ma, int flag, const char *name)
 {
 
 	KASSERT(name[0] == 'n' && name[1] == 'o',
 	    ("mount_argb(...,%s): name must start with 'no'", name));
 
 	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
 }
 
 /*
  * Add an argument printf style
  */
 struct mntarg *
 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct mntaarg *maa;
 	struct sbuf *sb;
 	int len;
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	len = sbuf_len(sb) + 1;
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	bcopy(sbuf_data(sb), maa + 1, len);
 	sbuf_delete(sb);
 
 	ma->v[ma->len].iov_base = maa + 1;
 	ma->v[ma->len].iov_len = len;
 	ma->len++;
 
 	return (ma);
 }
 
 /*
  * Add an argument which is a userland string.
  */
 struct mntarg *
 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
 {
 	struct mntaarg *maa;
 	char *tbuf;
 
 	if (val == NULL)
 		return (ma);
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	tbuf = (void *)(maa + 1);
 	ma->error = copyinstr(val, tbuf, len, NULL);
 	return (mount_arg(ma, name, tbuf, -1));
 }
 
 /*
  * Plain argument.
  *
  * If length is -1, treat value as a C string.
  */
 struct mntarg *
 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
 {
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
 	if (len < 0)
 		ma->v[ma->len].iov_len = strlen(val) + 1;
 	else
 		ma->v[ma->len].iov_len = len;
 	ma->len++;
 	return (ma);
 }
 
 /*
  * Free a mntarg structure
  */
 static void
 free_mntarg(struct mntarg *ma)
 {
 	struct mntaarg *maa;
 
 	while (!SLIST_EMPTY(&ma->list)) {
 		maa = SLIST_FIRST(&ma->list);
 		SLIST_REMOVE_HEAD(&ma->list, next);
 		free(maa, M_MOUNT);
 	}
 	free(ma->v, M_MOUNT);
 	free(ma, M_MOUNT);
 }
 
 /*
  * Mount a filesystem
  */
 int
 kernel_mount(struct mntarg *ma, uint64_t flags)
 {
 	struct uio auio;
 	int error;
 
 	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
 	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
 	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
 
 	auio.uio_iov = ma->v;
 	auio.uio_iovcnt = ma->len;
 	auio.uio_segflg = UIO_SYSSPACE;
 
 	error = ma->error;
 	if (!error)
 		error = vfs_donmount(curthread, flags, &auio);
 	free_mntarg(ma);
 	return (error);
 }
 
 /*
  * A printflike function to mount a filesystem.
  */
 int
 kernel_vmount(int flags, ...)
 {
 	struct mntarg *ma = NULL;
 	va_list ap;
 	const char *cp;
 	const void *vp;
 	int error;
 
 	va_start(ap, flags);
 	for (;;) {
 		cp = va_arg(ap, const char *);
 		if (cp == NULL)
 			break;
 		vp = va_arg(ap, const void *);
 		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
 	}
 	va_end(ap);
 
 	error = kernel_mount(ma, flags);
 	return (error);
 }
 
 /* Map from mount options to printable formats. */
 static struct mntoptnames optnames[] = {
 	MNTOPT_NAMES
 };
 
 static void
 mount_devctl_event_mntopt(struct sbuf *sb, const char *what, struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	if (opts == NULL || TAILQ_EMPTY(opts))
 		return;
 	sbuf_printf(sb, " %s=\"", what);
 	TAILQ_FOREACH(opt, opts, link) {
 		if (opt->name[0] == '\0' || (opt->len > 0 && *(char *)opt->value == '\0'))
 			continue;
 		devctl_safe_quote_sb(sb, opt->name);
 		if (opt->len > 0) {
 			sbuf_putc(sb, '=');
 			devctl_safe_quote_sb(sb, opt->value);
 		}
 		sbuf_putc(sb, ';');
 	}
 	sbuf_putc(sb, '"');
 }
 
 #define DEVCTL_LEN 1024
 static void
 mount_devctl_event(const char *type, struct mount *mp, bool donew)
 {
 	const uint8_t *cp;
 	struct mntoptnames *fp;
 	struct sbuf sb;
 	struct statfs *sfp = &mp->mnt_stat;
 	char *buf;
 
 	buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT);
 	if (buf == NULL)
 		return;
 	sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN);
 	sbuf_cpy(&sb, "mount-point=\"");
 	devctl_safe_quote_sb(&sb, sfp->f_mntonname);
 	sbuf_cat(&sb, "\" mount-dev=\"");
 	devctl_safe_quote_sb(&sb, sfp->f_mntfromname);
 	sbuf_cat(&sb, "\" mount-type=\"");
 	devctl_safe_quote_sb(&sb, sfp->f_fstypename);
 	sbuf_cat(&sb, "\" fsid=0x");
 	cp = (const uint8_t *)&sfp->f_fsid.val[0];
 	for (int i = 0; i < sizeof(sfp->f_fsid); i++)
 		sbuf_printf(&sb, "%02x", cp[i]);
 	sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner);
 	for (fp = optnames; fp->o_opt != 0; fp++) {
 		if ((mp->mnt_flag & fp->o_opt) != 0) {
 			sbuf_cat(&sb, fp->o_name);
 			sbuf_putc(&sb, ';');
 		}
 	}
 	sbuf_putc(&sb, '"');
 	mount_devctl_event_mntopt(&sb, "opt", mp->mnt_opt);
 	if (donew)
 		mount_devctl_event_mntopt(&sb, "optnew", mp->mnt_optnew);
 	sbuf_finish(&sb);
 
 	if (sbuf_error(&sb) == 0)
 		devctl_notify("VFS", "FS", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 	free(buf, M_MOUNT);
 }
+
+/*
+ * Suspend write operations on all local writeable filesystems.  Does
+ * full sync of them in the process.
+ *
+ * Iterate over the mount points in reverse order, suspending most
+ * recently mounted filesystems first.  It handles a case where a
+ * filesystem mounted from a md(4) vnode-backed device should be
+ * suspended before the filesystem that owns the vnode.
+ */
+void
+suspend_all_fs(void)
+{
+	struct mount *mp;
+	int error;
+
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+		error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
+		if (error != 0)
+			continue;
+		if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
+		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+			mtx_lock(&mountlist_mtx);
+			vfs_unbusy(mp);
+			continue;
+		}
+		error = vfs_write_suspend(mp, 0);
+		if (error == 0) {
+			MNT_ILOCK(mp);
+			MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
+			mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
+			MNT_IUNLOCK(mp);
+			mtx_lock(&mountlist_mtx);
+		} else {
+			printf("suspend of %s failed, error %d\n",
+			    mp->mnt_stat.f_mntonname, error);
+			mtx_lock(&mountlist_mtx);
+			vfs_unbusy(mp);
+		}
+	}
+	mtx_unlock(&mountlist_mtx);
+}
+
+void
+resume_all_fs(void)
+{
+	struct mount *mp;
+
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+		if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0)
+			continue;
+		mtx_unlock(&mountlist_mtx);
+		MNT_ILOCK(mp);
+		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0);
+		mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL;
+		MNT_IUNLOCK(mp);
+		vfs_write_resume(mp, 0);
+		mtx_lock(&mountlist_mtx);
+		vfs_unbusy(mp);
+	}
+	mtx_unlock(&mountlist_mtx);
+}
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 06646f80d08b..ee8d916c2432 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -1,1141 +1,1145 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mount.h	8.21 (Berkeley) 5/20/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MOUNT_H_
 #define _SYS_MOUNT_H_
 
 #include <sys/ucred.h>
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/tslog.h>
 #include <sys/_mutex.h>
 #include <sys/_sx.h>
 #endif
 
 /*
  * NOTE: When changing statfs structure, mount structure, MNT_* flags or
  * MNTK_* flags also update DDB show mount command in vfs_subr.c.
  */
 
 typedef struct fsid { int32_t val[2]; } fsid_t;	/* filesystem id type */
 
 #define fsidcmp(a, b) memcmp((a), (b), sizeof(fsid_t))
 
 /*
  * File identifier.
  * These are unique per filesystem on a single machine.
  *
  * Note that the offset of fid_data is 4 bytes, so care must be taken to avoid
  * undefined behavior accessing unaligned fields within an embedded struct.
  */
 #define	MAXFIDSZ	16
 
 struct fid {
 	u_short		fid_len;		/* length of data in bytes */
 	u_short		fid_data0;		/* force longword alignment */
 	char		fid_data[MAXFIDSZ];	/* data (variable length) */
 };
 
 /*
  * filesystem statistics
  */
 #define	MFSNAMELEN	16		/* length of type name including null */
 #define	MNAMELEN	1024		/* size of on/from name bufs */
 #define	STATFS_VERSION	0x20140518	/* current version number */
 struct statfs {
 	uint32_t f_version;		/* structure version number */
 	uint32_t f_type;		/* type of filesystem */
 	uint64_t f_flags;		/* copy of mount exported flags */
 	uint64_t f_bsize;		/* filesystem fragment size */
 	uint64_t f_iosize;		/* optimal transfer block size */
 	uint64_t f_blocks;		/* total data blocks in filesystem */
 	uint64_t f_bfree;		/* free blocks in filesystem */
 	int64_t	 f_bavail;		/* free blocks avail to non-superuser */
 	uint64_t f_files;		/* total file nodes in filesystem */
 	int64_t	 f_ffree;		/* free nodes avail to non-superuser */
 	uint64_t f_syncwrites;		/* count of sync writes since mount */
 	uint64_t f_asyncwrites;		/* count of async writes since mount */
 	uint64_t f_syncreads;		/* count of sync reads since mount */
 	uint64_t f_asyncreads;		/* count of async reads since mount */
 	uint64_t f_spare[10];		/* unused spare */
 	uint32_t f_namemax;		/* maximum filename length */
 	uid_t	  f_owner;		/* user that mounted the filesystem */
 	fsid_t	  f_fsid;		/* filesystem id */
 	char	  f_charspare[80];	    /* spare string space */
 	char	  f_fstypename[MFSNAMELEN]; /* filesystem type name */
 	char	  f_mntfromname[MNAMELEN];  /* mounted filesystem */
 	char	  f_mntonname[MNAMELEN];    /* directory on which mounted */
 };
 
 #if defined(_WANT_FREEBSD11_STATFS) || defined(_KERNEL)
 #define	FREEBSD11_STATFS_VERSION	0x20030518 /* current version number */
 struct freebsd11_statfs {
 	uint32_t f_version;		/* structure version number */
 	uint32_t f_type;		/* type of filesystem */
 	uint64_t f_flags;		/* copy of mount exported flags */
 	uint64_t f_bsize;		/* filesystem fragment size */
 	uint64_t f_iosize;		/* optimal transfer block size */
 	uint64_t f_blocks;		/* total data blocks in filesystem */
 	uint64_t f_bfree;		/* free blocks in filesystem */
 	int64_t	 f_bavail;		/* free blocks avail to non-superuser */
 	uint64_t f_files;		/* total file nodes in filesystem */
 	int64_t	 f_ffree;		/* free nodes avail to non-superuser */
 	uint64_t f_syncwrites;		/* count of sync writes since mount */
 	uint64_t f_asyncwrites;		/* count of async writes since mount */
 	uint64_t f_syncreads;		/* count of sync reads since mount */
 	uint64_t f_asyncreads;		/* count of async reads since mount */
 	uint64_t f_spare[10];		/* unused spare */
 	uint32_t f_namemax;		/* maximum filename length */
 	uid_t	  f_owner;		/* user that mounted the filesystem */
 	fsid_t	  f_fsid;		/* filesystem id */
 	char	  f_charspare[80];	/* spare string space */
 	char	  f_fstypename[16];	/* filesystem type name */
 	char	  f_mntfromname[88];	/* mounted filesystem */
 	char	  f_mntonname[88];	/* directory on which mounted */
 };
 #endif /* _WANT_FREEBSD11_STATFS || _KERNEL */
 
 #ifdef _KERNEL
 #define	OMFSNAMELEN	16	/* length of fs type name, including null */
 #define	OMNAMELEN	(88 - 2 * sizeof(long))	/* size of on/from name bufs */
 
 /* XXX getfsstat.2 is out of date with write and read counter changes here. */
 /* XXX statfs.2 is out of date with read counter changes here. */
 struct ostatfs {
 	long	f_spare2;		/* placeholder */
 	long	f_bsize;		/* fundamental filesystem block size */
 	long	f_iosize;		/* optimal transfer block size */
 	long	f_blocks;		/* total data blocks in filesystem */
 	long	f_bfree;		/* free blocks in fs */
 	long	f_bavail;		/* free blocks avail to non-superuser */
 	long	f_files;		/* total file nodes in filesystem */
 	long	f_ffree;		/* free file nodes in fs */
 	fsid_t	f_fsid;			/* filesystem id */
 	uid_t	f_owner;		/* user that mounted the filesystem */
 	int	f_type;			/* type of filesystem */
 	int	f_flags;		/* copy of mount exported flags */
 	long	f_syncwrites;		/* count of sync writes since mount */
 	long	f_asyncwrites;		/* count of async writes since mount */
 	char	f_fstypename[OMFSNAMELEN]; /* fs type name */
 	char	f_mntonname[OMNAMELEN];	/* directory on which mounted */
 	long	f_syncreads;		/* count of sync reads since mount */
 	long	f_asyncreads;		/* count of async reads since mount */
 	short	f_spares1;		/* unused spare */
 	char	f_mntfromname[OMNAMELEN];/* mounted filesystem */
 	short	f_spares2;		/* unused spare */
 	/*
 	 * XXX on machines where longs are aligned to 8-byte boundaries, there
 	 * is an unnamed int32_t here.  This spare was after the apparent end
 	 * of the struct until we bit off the read counters from f_mntonname.
 	 */
 	long	f_spare[2];		/* unused spare */
 };
 
 TAILQ_HEAD(vnodelst, vnode);
 
 /* Mount options list */
 TAILQ_HEAD(vfsoptlist, vfsopt);
 struct vfsopt {
 	TAILQ_ENTRY(vfsopt) link;
 	char	*name;
 	void	*value;
 	int	len;
 	int	pos;
 	int	seen;
 };
 
 /*
  * Structure per mounted filesystem.  Each mounted filesystem has an
  * array of operations and an instance record.  The filesystems are
  * put on a doubly linked list.
  *
  * Lock reference:
  * 	l - mnt_listmtx
  *	m - mountlist_mtx
  *	i - interlock
  *	v - vnode freelist mutex
  *
  * Unmarked fields are considered stable as long as a ref is held.
  *
  */
 struct mount {
 	struct mtx	mnt_mtx;		/* mount structure interlock */
 	int		mnt_gen;		/* struct mount generation */
 #define	mnt_startzero	mnt_list
 	TAILQ_ENTRY(mount) mnt_list;		/* (m) mount list */
 	struct vfsops	*mnt_op;		/* operations on fs */
 	struct vfsconf	*mnt_vfc;		/* configuration info */
 	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
 	struct vnode	*mnt_syncer;		/* syncer vnode */
 	int		mnt_ref;		/* (i) Reference count */
 	struct vnodelst	mnt_nvnodelist;		/* (i) list of vnodes */
 	int		mnt_nvnodelistsize;	/* (i) # of vnodes */
 	int		mnt_writeopcount;	/* (i) write syscalls pending */
 	int		mnt_kern_flag;		/* (i) kernel only flags */
 	uint64_t	mnt_flag;		/* (i) flags shared with user */
 	struct vfsoptlist *mnt_opt;		/* current mount options */
 	struct vfsoptlist *mnt_optnew;		/* new options passed to fs */
 	int		mnt_maxsymlinklen;	/* max size of short symlink */
 	struct statfs	mnt_stat;		/* cache of filesystem stats */
 	struct ucred	*mnt_cred;		/* credentials of mounter */
 	void *		mnt_data;		/* private data */
 	time_t		mnt_time;		/* last time written*/
 	int		mnt_iosize_max;		/* max size for clusters, etc */
 	struct netexport *mnt_export;		/* export list */
 	struct label	*mnt_label;		/* MAC label for the fs */
 	u_int		mnt_hashseed;		/* Random seed for vfs_hash */
 	int		mnt_lockref;		/* (i) Lock reference count */
 	int		mnt_secondary_writes;   /* (i) # of secondary writes */
 	int		mnt_secondary_accwrites;/* (i) secondary wr. starts */
 	struct thread	*mnt_susp_owner;	/* (i) thread owning suspension */
 #define	mnt_endzero	mnt_gjprovider
 	char		*mnt_gjprovider;	/* gjournal provider name */
 	struct mtx	mnt_listmtx;
 	struct vnodelst	mnt_lazyvnodelist;	/* (l) list of lazy vnodes */
 	int		mnt_lazyvnodelistsize;	/* (l) # of lazy vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
 	int __aligned(CACHE_LINE_SIZE)	mnt_vfs_ops;/* (i) pending vfs ops */
 	int		*mnt_thread_in_ops_pcpu;
 	int		*mnt_ref_pcpu;
 	int		*mnt_lockref_pcpu;
 	int		*mnt_writeopcount_pcpu;
 	struct vnode	*mnt_rootvnode;
 };
 
 /*
  * Definitions for MNT_VNODE_FOREACH_ALL.
  */
 struct vnode *__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp);
 struct vnode *__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp);
 void          __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp);
 
 #define MNT_VNODE_FOREACH_ALL(vp, mp, mvp)				\
 	for (vp = __mnt_vnode_first_all(&(mvp), (mp));			\
 		(vp) != NULL; vp = __mnt_vnode_next_all(&(mvp), (mp)))
 
 #define MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp)				\
 	do {								\
 		MNT_ILOCK(mp);						\
 		__mnt_vnode_markerfree_all(&(mvp), (mp));		\
 		/* MNT_IUNLOCK(mp); -- done in above function */	\
 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);			\
 	} while (0)
 
 /*
  * Definitions for MNT_VNODE_FOREACH_LAZY.
  */
 typedef int mnt_lazy_cb_t(struct vnode *, void *);
 struct vnode *__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp,
     mnt_lazy_cb_t *cb, void *cbarg);
 struct vnode *__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp,
     mnt_lazy_cb_t *cb, void *cbarg);
 void          __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp);
 
 #define MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, cb, cbarg)			\
 	for (vp = __mnt_vnode_first_lazy(&(mvp), (mp), (cb), (cbarg));	\
 		(vp) != NULL; 						\
 		vp = __mnt_vnode_next_lazy(&(mvp), (mp), (cb), (cbarg)))
 
 #define MNT_VNODE_FOREACH_LAZY_ABORT(mp, mvp)				\
 	__mnt_vnode_markerfree_lazy(&(mvp), (mp))
 
 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
 #define	MNT_MTX(mp)	(&(mp)->mnt_mtx)
 
 #define	MNT_REF(mp)	do {						\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
 	mp->mnt_ref++;							\
 } while (0)
 #define	MNT_REL(mp)	do {						\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
 	(mp)->mnt_ref--;						\
 	if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0)		\
 		vfs_dump_mount_counters(mp);				\
 	if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops)		\
 		wakeup((mp));						\
 } while (0)
 
 #endif /* _KERNEL */
 
 #if defined(_WANT_MNTOPTNAMES) || defined(_KERNEL)
 struct mntoptnames {
 	uint64_t o_opt;
 	const char *o_name;
 };
 #define MNTOPT_NAMES							\
 	{ MNT_ASYNC,		"asynchronous" },			\
 	{ MNT_EXPORTED,		"NFS exported" },			\
 	{ MNT_LOCAL,		"local" },				\
 	{ MNT_NOATIME,		"noatime" },				\
 	{ MNT_NOEXEC,		"noexec" },				\
 	{ MNT_NOSUID,		"nosuid" },				\
 	{ MNT_NOSYMFOLLOW,	"nosymfollow" },			\
 	{ MNT_QUOTA,		"with quotas" },			\
 	{ MNT_RDONLY,		"read-only" },				\
 	{ MNT_SYNCHRONOUS,	"synchronous" },			\
 	{ MNT_UNION,		"union" },				\
 	{ MNT_NOCLUSTERR,	"noclusterr" },				\
 	{ MNT_NOCLUSTERW,	"noclusterw" },				\
 	{ MNT_SUIDDIR,		"suiddir" },				\
 	{ MNT_SOFTDEP,		"soft-updates" },			\
 	{ MNT_SUJ,		"journaled soft-updates" },		\
 	{ MNT_MULTILABEL,	"multilabel" },				\
 	{ MNT_ACLS,		"acls" },				\
 	{ MNT_NFS4ACLS,		"nfsv4acls" },				\
 	{ MNT_GJOURNAL,		"gjournal" },				\
 	{ MNT_AUTOMOUNTED,	"automounted" },			\
 	{ MNT_VERIFIED,		"verified" },				\
 	{ MNT_UNTRUSTED,	"untrusted" },				\
 	{ MNT_NOCOVER,		"nocover" },				\
 	{ MNT_EMPTYDIR,		"emptydir" },				\
 	{ MNT_UPDATE,		"update" },				\
 	{ MNT_DELEXPORT,	"delexport" },				\
 	{ MNT_RELOAD,		"reload" },				\
 	{ MNT_FORCE,		"force" },				\
 	{ MNT_SNAPSHOT,		"snapshot" },				\
 	{ 0, NULL }
 #endif
 
 /*
  * User specifiable flags, stored in mnt_flag.
  */
 #define	MNT_RDONLY	0x0000000000000001ULL /* read only filesystem */
 #define	MNT_SYNCHRONOUS	0x0000000000000002ULL /* fs written synchronously */
 #define	MNT_NOEXEC	0x0000000000000004ULL /* can't exec from filesystem */
 #define	MNT_NOSUID	0x0000000000000008ULL /* don't honor setuid fs bits */
 #define	MNT_NFS4ACLS	0x0000000000000010ULL /* enable NFS version 4 ACLs */
 #define	MNT_UNION	0x0000000000000020ULL /* union with underlying fs */
 #define	MNT_ASYNC	0x0000000000000040ULL /* fs written asynchronously */
 #define	MNT_SUIDDIR	0x0000000000100000ULL /* special SUID dir handling */
 #define	MNT_SOFTDEP	0x0000000000200000ULL /* using soft updates */
 #define	MNT_NOSYMFOLLOW	0x0000000000400000ULL /* do not follow symlinks */
 #define	MNT_GJOURNAL	0x0000000002000000ULL /* GEOM journal support enabled */
 #define	MNT_MULTILABEL	0x0000000004000000ULL /* MAC support for objects */
 #define	MNT_ACLS	0x0000000008000000ULL /* ACL support enabled */
 #define	MNT_NOATIME	0x0000000010000000ULL /* dont update file access time */
 #define	MNT_NOCLUSTERR	0x0000000040000000ULL /* disable cluster read */
 #define	MNT_NOCLUSTERW	0x0000000080000000ULL /* disable cluster write */
 #define	MNT_SUJ		0x0000000100000000ULL /* using journaled soft updates */
 #define	MNT_AUTOMOUNTED	0x0000000200000000ULL /* mounted by automountd(8) */
 #define	MNT_UNTRUSTED	0x0000000800000000ULL /* filesys metadata untrusted */
 
 /*
  * NFS export related mount flags.
  */
 #define	MNT_EXRDONLY	0x0000000000000080ULL	/* exported read only */
 #define	MNT_EXPORTED	0x0000000000000100ULL	/* filesystem is exported */
 #define	MNT_DEFEXPORTED	0x0000000000000200ULL	/* exported to the world */
 #define	MNT_EXPORTANON	0x0000000000000400ULL	/* anon uid mapping for all */
 #define	MNT_EXKERB	0x0000000000000800ULL	/* exported with Kerberos */
 #define	MNT_EXPUBLIC	0x0000000020000000ULL	/* public export (WebNFS) */
 #define	MNT_EXTLS	0x0000004000000000ULL /* require TLS */
 #define	MNT_EXTLSCERT	0x0000008000000000ULL /* require TLS with client cert */
 #define	MNT_EXTLSCERTUSER 0x0000010000000000ULL /* require TLS with user cert */
 
 /*
  * Flags set by internal operations,
  * but visible to the user.
  * XXX some of these are not quite right.. (I've never seen the root flag set)
  */
 #define	MNT_LOCAL	0x0000000000001000ULL /* filesystem is stored locally */
 #define	MNT_QUOTA	0x0000000000002000ULL /* quotas are enabled on fs */
 #define	MNT_ROOTFS	0x0000000000004000ULL /* identifies the root fs */
 #define	MNT_USER	0x0000000000008000ULL /* mounted by a user */
 #define	MNT_IGNORE	0x0000000000800000ULL /* do not show entry in df */
 #define	MNT_VERIFIED	0x0000000400000000ULL /* filesystem is verified */
 
 /*
  * Mask of flags that are visible to statfs().
  * XXX I think that this could now become (~(MNT_CMDFLAGS))
  * but the 'mount' program may need changing to handle this.
  */
 #define	MNT_VISFLAGMASK	(MNT_RDONLY	| MNT_SYNCHRONOUS | MNT_NOEXEC	| \
 			MNT_NOSUID	| MNT_UNION	| MNT_SUJ	| \
 			MNT_ASYNC	| MNT_EXRDONLY	| MNT_EXPORTED	| \
 			MNT_DEFEXPORTED	| MNT_EXPORTANON| MNT_EXKERB	| \
 			MNT_LOCAL	| MNT_USER	| MNT_QUOTA	| \
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
 			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS	| \
 			MNT_NFS4ACLS	| MNT_AUTOMOUNTED | MNT_VERIFIED | \
 			MNT_UNTRUSTED)
 
 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
 			MNT_SYNCHRONOUS	| MNT_UNION	| MNT_ASYNC	| \
 			MNT_NOATIME | \
 			MNT_NOSYMFOLLOW	| MNT_IGNORE	| \
 			MNT_NOCLUSTERR	| MNT_NOCLUSTERW | MNT_SUIDDIR	| \
 			MNT_ACLS	| MNT_USER	| MNT_NFS4ACLS	| \
 			MNT_AUTOMOUNTED | MNT_UNTRUSTED)
 
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
  * XXX: These are not STATES and really should be somewhere else.
  * XXX: MNT_BYFSID and MNT_NONBUSY collide with MNT_ACLS and MNT_MULTILABEL,
  *      but because MNT_ACLS and MNT_MULTILABEL are only used for mount(2),
  *      and MNT_BYFSID and MNT_NONBUSY are only used for unmount(2),
  *      it's harmless.
  */
 #define	MNT_UPDATE	0x0000000000010000ULL /* not real mount, just update */
 #define	MNT_DELEXPORT	0x0000000000020000ULL /* delete export host lists */
 #define	MNT_RELOAD	0x0000000000040000ULL /* reload filesystem data */
 #define	MNT_FORCE	0x0000000000080000ULL /* force unmount or readonly */
 #define	MNT_SNAPSHOT	0x0000000001000000ULL /* snapshot the filesystem */
 #define	MNT_NONBUSY	0x0000000004000000ULL /* check vnode use counts. */
 #define	MNT_BYFSID	0x0000000008000000ULL /* specify filesystem by ID. */
 #define	MNT_NOCOVER	0x0000001000000000ULL /* Do not cover a mount point */
 #define	MNT_EMPTYDIR	0x0000002000000000ULL /* Only mount on empty dir */
 #define MNT_CMDFLAGS   (MNT_UPDATE	| MNT_DELEXPORT	| MNT_RELOAD	| \
 			MNT_FORCE	| MNT_SNAPSHOT	| MNT_NONBUSY	| \
 			MNT_BYFSID	| MNT_NOCOVER	| MNT_EMPTYDIR)
 /*
  * Internal filesystem control flags stored in mnt_kern_flag.
  *
  * MNTK_UNMOUNT locks the mount entry so that name lookup cannot
  * proceed past the mount point.  This keeps the subtree stable during
  * mounts and unmounts.  When non-forced unmount flushes all vnodes
  * from the mp queue, the MNTK_UNMOUNT flag prevents insmntque() from
  * queueing new vnodes.
  *
  * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while
  * dounmount() is still waiting to lock the mountpoint. This allows
  * the filesystem to cancel operations that might otherwise deadlock
  * with the unmount attempt (used by NFS).
  */
 #define MNTK_UNMOUNTF	0x00000001	/* forced unmount in progress */
 #define MNTK_ASYNC	0x00000002	/* filtered async flag */
 #define MNTK_SOFTDEP	0x00000004	/* async disabled by softdep */
 #define MNTK_NOMSYNC	0x00000008	/* don't do msync */
 #define	MNTK_DRAINING	0x00000010	/* lock draining is happening */
 #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
 #define MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
 #define	MNTK_SHARED_WRITES	0x00000080 /* Allow shared locking for writes */
 #define	MNTK_NO_IOPF	0x00000100	/* Disallow page faults during reads
 					   and writes. Filesystem shall properly
 					   handle i/o state on EFAULT. */
 #define	MNTK_VGONE_UPPER	0x00000200
 #define	MNTK_VGONE_WAITER	0x00000400
 #define	MNTK_LOOKUP_EXCL_DOTDOT	0x00000800
 #define	MNTK_MARKER		0x00001000
 #define	MNTK_UNMAPPED_BUFS	0x00002000
 #define	MNTK_USES_BCACHE	0x00004000 /* FS uses the buffer cache. */
 #define	MNTK_TEXT_REFS		0x00008000 /* Keep use ref for text */
 #define	MNTK_VMSETSIZE_BUG	0x00010000
 #define	MNTK_UNIONFS	0x00020000	/* A hack for F_ISUNIONSTACK */
 #define	MNTK_FPLOOKUP	0x00040000	/* fast path lookup is supported */
+#define	MNTK_SUSPEND_ALL	0x00080000 /* Suspended by all-fs suspension */
 #define MNTK_NOASYNC	0x00800000	/* disable async */
 #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
 #define	MNTK_SUSPEND2	0x04000000	/* block secondary writes */
 #define	MNTK_SUSPENDED	0x10000000	/* write operations are suspended */
 #define	MNTK_NULL_NOCACHE	0x20000000 /* auto disable cache for nullfs
 					      mounts over this fs */
 #define MNTK_LOOKUP_SHARED	0x40000000 /* FS supports shared lock lookups */
 #define	MNTK_NOKNOTE	0x80000000	/* Don't send KNOTEs from VOP hooks */
 
 #ifdef _KERNEL
 static inline int
 MNT_SHARED_WRITES(struct mount *mp)
 {
 
 	return (mp != NULL && (mp->mnt_kern_flag & MNTK_SHARED_WRITES) != 0);
 }
 
 static inline int
 MNT_EXTENDED_SHARED(struct mount *mp)
 {
 
 	return (mp != NULL && (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED) != 0);
 }
 #endif
 
 /*
  * Sysctl CTL_VFS definitions.
  *
  * Second level identifier specifies which filesystem. Second level
  * identifier VFS_VFSCONF returns information about all filesystems.
  * Second level identifier VFS_GENERIC is non-terminal.
  */
 #define	VFS_VFSCONF		0	/* get configured filesystems */
 #define	VFS_GENERIC		0	/* generic filesystem information */
 /*
  * Third level identifiers for VFS_GENERIC are given below; third
  * level identifiers for specific filesystems are given in their
  * mount specific header files.
  */
 #define VFS_MAXTYPENUM	1	/* int: highest defined filesystem type */
 #define VFS_CONF	2	/* struct: vfsconf for filesystem given
 				   as next argument */
 
 /*
  * Flags for various system call interfaces.
  *
  * waitfor flags to vfs_sync() and getfsstat()
  */
 #define MNT_WAIT	1	/* synchronously wait for I/O to complete */
 #define MNT_NOWAIT	2	/* start all I/O, but do not wait for it */
 #define MNT_LAZY	3	/* push data not written by filesystem syncer */
 #define MNT_SUSPEND	4	/* Suspend file system after sync */
 
 /*
  * Generic file handle
  */
 struct fhandle {
 	fsid_t	fh_fsid;	/* Filesystem id of mount point */
 	struct	fid fh_fid;	/* Filesys specific id */
 };
 typedef struct fhandle	fhandle_t;
 
 /*
  * Old export arguments without security flavor list
  */
 struct oexport_args {
 	int	ex_flags;		/* export related flags */
 	uid_t	ex_root;		/* mapping for root uid */
 	struct	xucred ex_anon;		/* mapping for anonymous user */
 	struct	sockaddr *ex_addr;	/* net address to which exported */
 	u_char	ex_addrlen;		/* and the net address length */
 	struct	sockaddr *ex_mask;	/* mask of valid bits in saddr */
 	u_char	ex_masklen;		/* and the smask length */
 	char	*ex_indexfile;		/* index file for WebNFS URLs */
 };
 
 /*
  * Not quite so old export arguments with 32bit ex_flags and xucred ex_anon.
  */
 #define	MAXSECFLAVORS	5
 struct o2export_args {
 	int	ex_flags;		/* export related flags */
 	uid_t	ex_root;		/* mapping for root uid */
 	struct	xucred ex_anon;		/* mapping for anonymous user */
 	struct	sockaddr *ex_addr;	/* net address to which exported */
 	u_char	ex_addrlen;		/* and the net address length */
 	struct	sockaddr *ex_mask;	/* mask of valid bits in saddr */
 	u_char	ex_masklen;		/* and the smask length */
 	char	*ex_indexfile;		/* index file for WebNFS URLs */
 	int	ex_numsecflavors;	/* security flavor count */
 	int	ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */
 };
 
 /*
  * Export arguments for local filesystem mount calls.
  */
 struct export_args {
 	uint64_t ex_flags;		/* export related flags */
 	uid_t	ex_root;		/* mapping for root uid */
 	uid_t	ex_uid;			/* mapping for anonymous user */
 	int	ex_ngroups;
 	gid_t	*ex_groups;
 	struct	sockaddr *ex_addr;	/* net address to which exported */
 	u_char	ex_addrlen;		/* and the net address length */
 	struct	sockaddr *ex_mask;	/* mask of valid bits in saddr */
 	u_char	ex_masklen;		/* and the smask length */
 	char	*ex_indexfile;		/* index file for WebNFS URLs */
 	int	ex_numsecflavors;	/* security flavor count */
 	int	ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */
 };
 
 /*
  * Structure holding information for a publicly exported filesystem
  * (WebNFS). Currently the specs allow just for one such filesystem.
  */
 struct nfs_public {
 	int		np_valid;	/* Do we hold valid information */
 	fhandle_t	np_handle;	/* Filehandle for pub fs (internal) */
 	struct mount	*np_mount;	/* Mountpoint of exported fs */
 	char		*np_index;	/* Index file */
 };
 
 /*
  * Filesystem configuration information. One of these exists for each
  * type of filesystem supported by the kernel. These are searched at
  * mount time to identify the requested filesystem.
  *
  * XXX: Never change the first two arguments!
  */
 struct vfsconf {
 	u_int	vfc_version;		/* ABI version number */
 	char	vfc_name[MFSNAMELEN];	/* filesystem type name */
 	struct	vfsops *vfc_vfsops;	/* filesystem operations vector */
 	struct	vfsops *vfc_vfsops_sd;	/* ... signal-deferred */
 	int	vfc_typenum;		/* historic filesystem type number */
 	int	vfc_refcount;		/* number mounted of this type */
 	int	vfc_flags;		/* permanent flags */
 	int	vfc_prison_flag;	/* prison allow.mount.* flag */
 	struct	vfsoptdecl *vfc_opts;	/* mount options */
 	TAILQ_ENTRY(vfsconf) vfc_list;	/* list of vfscons */
 };
 
 /* Userland version of the struct vfsconf. */
 struct xvfsconf {
 	struct	vfsops *vfc_vfsops;	/* filesystem operations vector */
 	char	vfc_name[MFSNAMELEN];	/* filesystem type name */
 	int	vfc_typenum;		/* historic filesystem type number */
 	int	vfc_refcount;		/* number mounted of this type */
 	int	vfc_flags;		/* permanent flags */
 	struct	vfsconf *vfc_next;	/* next in list */
 };
 
 #ifndef BURN_BRIDGES
 struct ovfsconf {
 	void	*vfc_vfsops;
 	char	vfc_name[32];
 	int	vfc_index;
 	int	vfc_refcount;
 	int	vfc_flags;
 };
 #endif
 
 /*
  * NB: these flags refer to IMPLEMENTATION properties, not properties of
  * any actual mounts; i.e., it does not make sense to change the flags.
  */
 #define	VFCF_STATIC	0x00010000	/* statically compiled into kernel */
 #define	VFCF_NETWORK	0x00020000	/* may get data over the network */
 #define	VFCF_READONLY	0x00040000	/* writes are not implemented */
 #define	VFCF_SYNTHETIC	0x00080000	/* data does not represent real files */
 #define	VFCF_LOOPBACK	0x00100000	/* aliases some other mounted FS */
 #define	VFCF_UNICODE	0x00200000	/* stores file names as Unicode */
 #define	VFCF_JAIL	0x00400000	/* can be mounted from within a jail */
 #define	VFCF_DELEGADMIN	0x00800000	/* supports delegated administration */
 #define	VFCF_SBDRY	0x01000000	/* Stop at Boundary: defer stop requests
 					   to kernel->user (AST) transition */
 
 typedef uint32_t fsctlop_t;
 
 struct vfsidctl {
 	int		vc_vers;	/* should be VFSIDCTL_VERS1 (below) */
 	fsid_t		vc_fsid;	/* fsid to operate on */
 	char		vc_fstypename[MFSNAMELEN];
 					/* type of fs 'nfs' or '*' */
 	fsctlop_t	vc_op;		/* operation VFS_CTL_* (below) */
 	void		*vc_ptr;	/* pointer to data structure */
 	size_t		vc_len;		/* sizeof said structure */
 	u_int32_t	vc_spare[12];	/* spare (must be zero) */
 };
 
 /* vfsidctl API version. */
 #define VFS_CTL_VERS1	0x01
 
 /*
  * New style VFS sysctls, do not reuse/conflict with the namespace for
  * private sysctls.
  * All "global" sysctl ops have the 33rd bit set:
  * 0x...1....
  * Private sysctl ops should have the 33rd bit unset.
  */
 #define VFS_CTL_QUERY	0x00010001	/* anything wrong? (vfsquery) */
 #define VFS_CTL_TIMEO	0x00010002	/* set timeout for vfs notification */
 #define VFS_CTL_NOLOCKS	0x00010003	/* disable file locking */
 
 struct vfsquery {
 	u_int32_t	vq_flags;
 	u_int32_t	vq_spare[31];
 };
 
 /* vfsquery flags */
 #define VQ_NOTRESP	0x0001	/* server down */
 #define VQ_NEEDAUTH	0x0002	/* server bad auth */
 #define VQ_LOWDISK	0x0004	/* we're low on space */
 #define VQ_MOUNT	0x0008	/* new filesystem arrived */
 #define VQ_UNMOUNT	0x0010	/* filesystem has left */
 #define VQ_DEAD		0x0020	/* filesystem is dead, needs force unmount */
 #define VQ_ASSIST	0x0040	/* filesystem needs assistance from external
 				   program */
 #define VQ_NOTRESPLOCK	0x0080	/* server lockd down */
 #define VQ_FLAG0100	0x0100	/* placeholder */
 #define VQ_FLAG0200	0x0200	/* placeholder */
 #define VQ_FLAG0400	0x0400	/* placeholder */
 #define VQ_FLAG0800	0x0800	/* placeholder */
 #define VQ_FLAG1000	0x1000	/* placeholder */
 #define VQ_FLAG2000	0x2000	/* placeholder */
 #define VQ_FLAG4000	0x4000	/* placeholder */
 #define VQ_FLAG8000	0x8000	/* placeholder */
 
 #ifdef _KERNEL
 /* Point a sysctl request at a vfsidctl's data. */
 #define VCTLTOREQ(vc, req)						\
 	do {								\
 		(req)->newptr = (vc)->vc_ptr;				\
 		(req)->newlen = (vc)->vc_len;				\
 		(req)->newidx = 0;					\
 	} while (0)
 #endif
 
 struct iovec;
 struct uio;
 
 #ifdef _KERNEL
 
 /*
  * vfs_busy specific flags and mask.
  */
 #define	MBF_NOWAIT	0x01
 #define	MBF_MNTLSTLOCK	0x02
 #define	MBF_MASK	(MBF_NOWAIT | MBF_MNTLSTLOCK)
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_MOUNT);
 MALLOC_DECLARE(M_STATFS);
 #endif
 extern int maxvfsconf;		/* highest defined filesystem type */
 
 TAILQ_HEAD(vfsconfhead, vfsconf);
 extern struct vfsconfhead vfsconf;
 
 /*
  * Operations supported on mounted filesystem.
  */
 struct mount_args;
 struct nameidata;
 struct sysctl_req;
 struct mntarg;
 
 /*
  * N.B., vfs_cmount is the ancient vfsop invoked by the old mount(2) syscall.
  * The new way is vfs_mount.
  *
  * vfs_cmount implementations typically translate arguments from their
  * respective old per-FS structures into the key-value list supported by
  * nmount(2), then use kernel_mount(9) to mimic nmount(2) from kernelspace.
  *
  * Filesystems with mounters that use nmount(2) do not need to and should not
  * implement vfs_cmount.  Hopefully a future cleanup can remove vfs_cmount and
  * mount(2) entirely.
  */
 typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags);
 typedef int vfs_unmount_t(struct mount *mp, int mntflags);
 typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp);
 typedef	int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg);
 typedef	int vfs_statfs_t(struct mount *mp, struct statfs *sbp);
 typedef	int vfs_sync_t(struct mount *mp, int waitfor);
 typedef	int vfs_vget_t(struct mount *mp, ino_t ino, int flags,
 		    struct vnode **vpp);
 typedef	int vfs_fhtovp_t(struct mount *mp, struct fid *fhp,
 		    int flags, struct vnode **vpp);
 typedef	int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam,
 		    uint64_t *extflagsp, struct ucred **credanonp,
 		    int *numsecflavors, int *secflavors);
 typedef	int vfs_init_t(struct vfsconf *);
 typedef	int vfs_uninit_t(struct vfsconf *);
 typedef	int vfs_extattrctl_t(struct mount *mp, int cmd,
 		    struct vnode *filename_vp, int attrnamespace,
 		    const char *attrname);
 typedef	int vfs_mount_t(struct mount *mp);
 typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op,
 		    struct sysctl_req *req);
 typedef void vfs_susp_clean_t(struct mount *mp);
 typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp);
 typedef void vfs_purge_t(struct mount *mp);
 
 struct vfsops {
 	vfs_mount_t		*vfs_mount;
 	vfs_cmount_t		*vfs_cmount;
 	vfs_unmount_t		*vfs_unmount;
 	vfs_root_t		*vfs_root;
 	vfs_root_t		*vfs_cachedroot;
 	vfs_quotactl_t		*vfs_quotactl;
 	vfs_statfs_t		*vfs_statfs;
 	vfs_sync_t		*vfs_sync;
 	vfs_vget_t		*vfs_vget;
 	vfs_fhtovp_t		*vfs_fhtovp;
 	vfs_checkexp_t		*vfs_checkexp;
 	vfs_init_t		*vfs_init;
 	vfs_uninit_t		*vfs_uninit;
 	vfs_extattrctl_t	*vfs_extattrctl;
 	vfs_sysctl_t		*vfs_sysctl;
 	vfs_susp_clean_t	*vfs_susp_clean;
 	vfs_notify_lowervp_t	*vfs_reclaim_lowervp;
 	vfs_notify_lowervp_t	*vfs_unlink_lowervp;
 	vfs_purge_t		*vfs_purge;
 	vfs_mount_t		*vfs_spare[6];	/* spares for ABI compat */
 };
 
 vfs_statfs_t	__vfs_statfs;
 
 #define	VFS_MOUNT(MP) ({						\
 	int _rc;							\
 									\
 	TSRAW(curthread, TS_ENTER, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\
 	_rc = (*(MP)->mnt_op->vfs_mount)(MP);				\
 	TSRAW(curthread, TS_EXIT, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\
 	_rc; })
 
 #define	VFS_UNMOUNT(MP, FORCE) ({					\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_unmount)(MP, FORCE);			\
 	_rc; })
 
 #define	VFS_ROOT(MP, FLAGS, VPP) ({					\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP);		\
 	_rc; })
 
 #define	VFS_CACHEDROOT(MP, FLAGS, VPP) ({				\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_cachedroot)(MP, FLAGS, VPP);		\
 	_rc; })
 
 #define	VFS_QUOTACTL(MP, C, U, A) ({					\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A);		\
 	_rc; })
 
 #define	VFS_STATFS(MP, SBP) ({						\
 	int _rc;							\
 									\
 	_rc = __vfs_statfs((MP), (SBP));				\
 	_rc; })
 
 #define	VFS_SYNC(MP, WAIT) ({						\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_sync)(MP, WAIT);			\
 	_rc; })
 
 #define	VFS_VGET(MP, INO, FLAGS, VPP) ({				\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP);		\
 	_rc; })
 
 #define	VFS_FHTOVP(MP, FIDP, FLAGS, VPP) ({				\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, FLAGS, VPP);	\
 	_rc; })
 
 #define	VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) ({		\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC,\
 	    SEC);							\
 	_rc; })
 
 #define	VFS_EXTATTRCTL(MP, C, FN, NS, N) ({				\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N);	\
 	_rc; })
 
 #define	VFS_SYSCTL(MP, OP, REQ) ({					\
 	int _rc;							\
 									\
 	_rc = (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ);			\
 	_rc; })
 
 #define	VFS_SUSP_CLEAN(MP) do {						\
 	if (*(MP)->mnt_op->vfs_susp_clean != NULL) {			\
 		(*(MP)->mnt_op->vfs_susp_clean)(MP);			\
 	}								\
 } while (0)
 
 #define	VFS_RECLAIM_LOWERVP(MP, VP) do {				\
 	if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) {		\
 		(*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP));	\
 	}								\
 } while (0)
 
 #define	VFS_UNLINK_LOWERVP(MP, VP) do {					\
 	if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) {		\
 		(*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP));	\
 	}								\
 } while (0)
 
 #define	VFS_PURGE(MP) do {						\
 	if (*(MP)->mnt_op->vfs_purge != NULL) {				\
 		(*(MP)->mnt_op->vfs_purge)(MP);				\
 	}								\
 } while (0)
 
 #define VFS_KNOTE_LOCKED(vp, hint) do					\
 {									\
 	if (((vp)->v_vflag & VV_NOKNOTE) == 0)				\
 		VN_KNOTE((vp), (hint), KNF_LISTLOCKED);			\
 } while (0)
 
 #define VFS_KNOTE_UNLOCKED(vp, hint) do					\
 {									\
 	if (((vp)->v_vflag & VV_NOKNOTE) == 0)				\
 		VN_KNOTE((vp), (hint), 0);				\
 } while (0)
 
 #define	VFS_NOTIFY_UPPER_RECLAIM	1
 #define	VFS_NOTIFY_UPPER_UNLINK		2
 
 #include <sys/module.h>
 
 /*
  * Version numbers.
  */
 #define VFS_VERSION_00	0x19660120
 #define VFS_VERSION_01	0x20121030
 #define VFS_VERSION_02	0x20180504
 #define VFS_VERSION	VFS_VERSION_02
 
 #define VFS_SET(vfsops, fsname, flags) \
 	static struct vfsconf fsname ## _vfsconf = {		\
 		.vfc_version = VFS_VERSION,			\
 		.vfc_name = #fsname,				\
 		.vfc_vfsops = &vfsops,				\
 		.vfc_typenum = -1,				\
 		.vfc_flags = flags,				\
 	};							\
 	static moduledata_t fsname ## _mod = {			\
 		#fsname,					\
 		vfs_modevent,					\
 		& fsname ## _vfsconf				\
 	};							\
 	DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE)
 
 /*
  * exported vnode operations
  */
 
 int	dounmount(struct mount *, int, struct thread *);
 
 int	kernel_mount(struct mntarg *ma, uint64_t flags);
 int	kernel_vmount(int flags, ...);
 struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len);
 struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name);
 struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...);
 struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len);
 void	statfs_scale_blocks(struct statfs *sf, long max_size);
 struct vfsconf *vfs_byname(const char *);
 struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *);
 void	vfs_mount_destroy(struct mount *);
 void	vfs_event_signal(fsid_t *, u_int32_t, intptr_t);
 void	vfs_freeopts(struct vfsoptlist *opts);
 void	vfs_deleteopt(struct vfsoptlist *opts, const char *name);
 int	vfs_buildopts(struct uio *auio, struct vfsoptlist **options);
 int	vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
 	    uint64_t val);
 int	vfs_getopt(struct vfsoptlist *, const char *, void **, int *);
 int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
 int	vfs_getopt_size(struct vfsoptlist *opts, const char *name,
 	    off_t *value);
 char	*vfs_getopts(struct vfsoptlist *, const char *, int *error);
 int	vfs_copyopt(struct vfsoptlist *, const char *, void *, int);
 int	vfs_filteropt(struct vfsoptlist *, const char **legal);
 void	vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...);
 int	vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...);
 int	vfs_setopt(struct vfsoptlist *opts, const char *name, void *value,
 	    int len);
 int	vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value,
 	    int len);
 int	vfs_setopts(struct vfsoptlist *opts, const char *name,
 	    const char *value);
 int	vfs_setpublicfs			    /* set publicly exported fs */
 	    (struct mount *, struct netexport *, struct export_args *);
 void	vfs_periodic(struct mount *, int);
 int	vfs_busy(struct mount *, int);
 int	vfs_export			 /* process mount export info */
 	    (struct mount *, struct export_args *);
 void	vfs_allocate_syncvnode(struct mount *);
 void	vfs_deallocate_syncvnode(struct mount *);
 int	vfs_donmount(struct thread *td, uint64_t fsflags,
 	    struct uio *fsoptions);
 void	vfs_getnewfsid(struct mount *);
 struct cdev *vfs_getrootfsid(struct mount *);
 struct	mount *vfs_getvfs(fsid_t *);      /* return vfs given fsid */
 struct	mount *vfs_busyfs(fsid_t *);
 int	vfs_modevent(module_t, int, void *);
 void	vfs_mount_error(struct mount *, const char *, ...);
 void	vfs_mountroot(void);			/* mount our root filesystem */
 void	vfs_mountedfrom(struct mount *, const char *from);
 void	vfs_notify_upper(struct vnode *, int);
 void	vfs_ref(struct mount *);
 void	vfs_rel(struct mount *);
 struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *,
 	    struct ucred *);
 int	vfs_suser(struct mount *, struct thread *);
 void	vfs_unbusy(struct mount *);
 void	vfs_unmountall(void);
 extern	TAILQ_HEAD(mntlist, mount) mountlist;	/* mounted filesystem list */
 extern	struct mtx_padalign mountlist_mtx;
 extern	struct nfs_public nfs_pub;
 extern	struct sx vfsconf_sx;
 #define	vfsconf_lock()		sx_xlock(&vfsconf_sx)
 #define	vfsconf_unlock()	sx_xunlock(&vfsconf_sx)
 #define	vfsconf_slock()		sx_slock(&vfsconf_sx)
 #define	vfsconf_sunlock()	sx_sunlock(&vfsconf_sx)
 struct vnode *mntfs_allocvp(struct mount *, struct vnode *);
 void   mntfs_freevp(struct vnode *);
 
 /*
  * Declarations for these vfs default operations are located in
  * kern/vfs_default.c.  They will be automatically used to replace
  * null entries in VFS ops tables when registering a new filesystem
  * type in the global table.
  */
 vfs_root_t		vfs_stdroot;
 vfs_quotactl_t		vfs_stdquotactl;
 vfs_statfs_t		vfs_stdstatfs;
 vfs_sync_t		vfs_stdsync;
 vfs_sync_t		vfs_stdnosync;
 vfs_vget_t		vfs_stdvget;
 vfs_fhtovp_t		vfs_stdfhtovp;
 vfs_checkexp_t		vfs_stdcheckexp;
 vfs_init_t		vfs_stdinit;
 vfs_uninit_t		vfs_stduninit;
 vfs_extattrctl_t	vfs_stdextattrctl;
 vfs_sysctl_t		vfs_stdsysctl;
 
 void	syncer_suspend(void);
 void	syncer_resume(void);
 
 struct vnode *vfs_cache_root_clear(struct mount *);
 void	vfs_cache_root_set(struct mount *, struct vnode *);
 
 void	vfs_op_barrier_wait(struct mount *);
 void	vfs_op_enter(struct mount *);
 void	vfs_op_exit_locked(struct mount *);
 void	vfs_op_exit(struct mount *);
 
 #ifdef DIAGNOSTIC
 void	vfs_assert_mount_counters(struct mount *);
 void	vfs_dump_mount_counters(struct mount *);
 #else
 #define vfs_assert_mount_counters(mp) do { } while (0)
 #define vfs_dump_mount_counters(mp) do { } while (0)
 #endif
 
 enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT };
 int	vfs_mount_fetch_counter(struct mount *, enum mount_counter);
 
+void suspend_all_fs(void);
+void resume_all_fs(void);
+
 /*
  * Code transitioning mnt_vfs_ops to > 0 issues IPIs until it observes
  * all CPUs not executing code enclosed by mnt_thread_in_ops_pcpu.
  *
  * This provides an invariant that by the time the last CPU is observed not
  * executing, everyone else entering will see the counter > 0 and exit.
  *
  * Note there is no barrier between vfs_ops and the rest of the code in the
  * section. It is not necessary as the writer has to wait for everyone to drain
  * before making any changes or only make changes safe while the section is
  * executed.
  */
 #define vfs_op_thread_entered(mp) ({				\
 	MPASS(curthread->td_critnest > 0);			\
 	*zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1;		\
 })
 
 #define vfs_op_thread_enter_crit(mp) ({				\
 	bool _retval_crit = true;				\
 	MPASS(curthread->td_critnest > 0);			\
 	MPASS(!vfs_op_thread_entered(mp));			\
 	zpcpu_set_protected(mp->mnt_thread_in_ops_pcpu, 1);	\
 	__compiler_membar();					\
 	if (__predict_false(mp->mnt_vfs_ops > 0)) {		\
 		vfs_op_thread_exit_crit(mp);			\
 		_retval_crit = false;				\
 	}							\
 	_retval_crit;						\
 })
 
 #define vfs_op_thread_enter(mp) ({				\
 	bool _retval;						\
 	critical_enter();					\
 	_retval = vfs_op_thread_enter_crit(mp);			\
 	if (__predict_false(!_retval))				\
 		critical_exit();				\
 	_retval;						\
 })
 
 #define vfs_op_thread_exit_crit(mp) do {			\
 	MPASS(vfs_op_thread_entered(mp));			\
 	__compiler_membar();					\
 	zpcpu_set_protected(mp->mnt_thread_in_ops_pcpu, 0);	\
 } while (0)
 
 #define vfs_op_thread_exit(mp) do {				\
 	vfs_op_thread_exit_crit(mp);				\
 	critical_exit();					\
 } while (0)
 
 #define vfs_mp_count_add_pcpu(mp, count, val) do {		\
 	MPASS(vfs_op_thread_entered(mp));			\
 	zpcpu_add_protected(mp->mnt_##count##_pcpu, val);	\
 } while (0)
 
 #define vfs_mp_count_sub_pcpu(mp, count, val) do {		\
 	MPASS(vfs_op_thread_entered(mp));			\
 	zpcpu_sub_protected(mp->mnt_##count##_pcpu, val);	\
 } while (0)
 
 #else /* !_KERNEL */
 
 #include <sys/cdefs.h>
 
 struct stat;
 
 __BEGIN_DECLS
 int	fhlink(struct fhandle *, const char *);
 int	fhlinkat(struct fhandle *, int, const char *);
 int	fhopen(const struct fhandle *, int);
 int	fhreadlink(struct fhandle *, char *, size_t);
 int	fhstat(const struct fhandle *, struct stat *);
 int	fhstatfs(const struct fhandle *, struct statfs *);
 int	fstatfs(int, struct statfs *);
 int	getfh(const char *, fhandle_t *);
 int	getfhat(int, char *, struct fhandle *, int);
 int	getfsstat(struct statfs *, long, int);
 int	getmntinfo(struct statfs **, int);
 int	lgetfh(const char *, fhandle_t *);
 int	mount(const char *, const char *, int, void *);
 int	nmount(struct iovec *, unsigned int, int);
 int	statfs(const char *, struct statfs *);
 int	unmount(const char *, int);
 
 /* C library stuff */
 int	getvfsbyname(const char *, struct xvfsconf *);
 __END_DECLS
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_MOUNT_H_ */