Index: stable/11/sys/dev/acpica/acpi.c =================================================================== --- stable/11/sys/dev/acpica/acpi.c (revision 359651) +++ stable/11/sys/dev/acpica/acpi.c (revision 359652) @@ -1,4161 +1,4161 @@ /*- * Copyright (c) 2000 Takanori Watanabe * Copyright (c) 2000 Mitsuru IWASAKI * Copyright (c) 2000, 2001 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_device_numa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_ACPIDEV, "acpidev", "ACPI devices"); /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_BUS ACPI_MODULE_NAME("ACPI") static d_open_t acpiopen; static d_close_t acpiclose; static d_ioctl_t acpiioctl; static struct cdevsw acpi_cdevsw = { .d_version = D_VERSION, .d_open = acpiopen, .d_close = acpiclose, .d_ioctl = acpiioctl, .d_name = "acpi", }; struct acpi_interface { ACPI_STRING *data; int num; }; /* Global mutex for locking access to the ACPI subsystem. */ struct mtx acpi_mutex; struct callout acpi_sleep_timer; /* Bitmap of device quirks. */ int acpi_quirks; /* Supported sleep states. */ static BOOLEAN acpi_sleep_states[ACPI_S_STATE_COUNT]; static void acpi_lookup(void *arg, const char *name, device_t *dev); static int acpi_modevent(struct module *mod, int event, void *junk); static int acpi_probe(device_t dev); static int acpi_attach(device_t dev); static int acpi_suspend(device_t dev); static int acpi_resume(device_t dev); static int acpi_shutdown(device_t dev); static device_t acpi_add_child(device_t bus, u_int order, const char *name, int unit); static int acpi_print_child(device_t bus, device_t child); static void acpi_probe_nomatch(device_t bus, device_t child); static void acpi_driver_added(device_t dev, driver_t *driver); static int acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result); static int acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value); static struct resource_list *acpi_get_rlist(device_t dev, device_t child); static void acpi_reserve_resources(device_t dev); static int acpi_sysres_alloc(device_t dev); static int acpi_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count); static struct resource *acpi_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end); static int acpi_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r); static void acpi_delete_resource(device_t bus, device_t child, int type, int rid); static uint32_t acpi_isa_get_logicalid(device_t dev); static int acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count); static char *acpi_device_id_probe(device_t bus, device_t dev, char **ids); static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret); static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *context, void **retval); static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev, int max_depth, acpi_scan_cb_t user_fn, void *arg); static int acpi_set_powerstate(device_t child, int state); static int acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids); static void acpi_probe_children(device_t bus); static void acpi_probe_order(ACPI_HANDLE handle, int *order); static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status); static void acpi_sleep_enable(void *arg); static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc); static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state); static void acpi_shutdown_final(void *arg, int howto); static void acpi_enable_fixed_events(struct acpi_softc *sc); static BOOLEAN acpi_has_hid(ACPI_HANDLE handle); static void acpi_resync_clock(struct acpi_softc *sc); static int acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate); static int acpi_wake_run_prep(ACPI_HANDLE handle, int sstate); static int acpi_wake_prep_walk(int sstate); static int acpi_wake_sysctl_walk(device_t dev); static int acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS); static void acpi_system_eventhandler_sleep(void *arg, int state); static void acpi_system_eventhandler_wakeup(void *arg, int state); static int acpi_sname2sstate(const char *sname); static const char *acpi_sstate2sname(int sstate); static int acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_pm_func(u_long cmd, void *arg, ...); static int acpi_child_location_str_method(device_t acdev, device_t child, char *buf, size_t buflen); static int acpi_child_pnpinfo_str_method(device_t acdev, device_t child, char *buf, size_t buflen); #if defined(__i386__) || defined(__amd64__) static void acpi_enable_pcie(void); #endif static void acpi_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp); static void acpi_reset_interfaces(device_t dev); static device_method_t acpi_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_probe), DEVMETHOD(device_attach, acpi_attach), DEVMETHOD(device_shutdown, acpi_shutdown), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_suspend, acpi_suspend), DEVMETHOD(device_resume, acpi_resume), /* Bus interface */ DEVMETHOD(bus_add_child, acpi_add_child), DEVMETHOD(bus_print_child, acpi_print_child), DEVMETHOD(bus_probe_nomatch, acpi_probe_nomatch), DEVMETHOD(bus_driver_added, acpi_driver_added), DEVMETHOD(bus_read_ivar, acpi_read_ivar), DEVMETHOD(bus_write_ivar, acpi_write_ivar), DEVMETHOD(bus_get_resource_list, acpi_get_rlist), DEVMETHOD(bus_set_resource, acpi_set_resource), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_alloc_resource, acpi_alloc_resource), DEVMETHOD(bus_adjust_resource, acpi_adjust_resource), DEVMETHOD(bus_release_resource, acpi_release_resource), DEVMETHOD(bus_delete_resource, acpi_delete_resource), DEVMETHOD(bus_child_pnpinfo_str, acpi_child_pnpinfo_str_method), DEVMETHOD(bus_child_location_str, acpi_child_location_str_method), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_hint_device_unit, acpi_hint_device_unit), DEVMETHOD(bus_get_cpus, acpi_get_cpus), DEVMETHOD(bus_get_domain, acpi_get_domain), /* ACPI bus */ DEVMETHOD(acpi_id_probe, acpi_device_id_probe), DEVMETHOD(acpi_evaluate_object, acpi_device_eval_obj), DEVMETHOD(acpi_pwr_for_sleep, acpi_device_pwr_for_sleep), DEVMETHOD(acpi_scan_children, acpi_device_scan_children), /* ISA emulation */ DEVMETHOD(isa_pnp_probe, acpi_isa_pnp_probe), DEVMETHOD_END }; static driver_t acpi_driver = { "acpi", acpi_methods, sizeof(struct acpi_softc), }; static devclass_t acpi_devclass; DRIVER_MODULE(acpi, nexus, acpi_driver, acpi_devclass, acpi_modevent, 0); MODULE_VERSION(acpi, 1); ACPI_SERIAL_DECL(acpi, "ACPI root bus"); /* Local pools for managing system resources for ACPI child devices. */ static struct rman acpi_rman_io, acpi_rman_mem; #define ACPI_MINIMUM_AWAKETIME 5 /* Holds the description of the acpi0 device. */ static char acpi_desc[ACPI_OEM_ID_SIZE + ACPI_OEM_TABLE_ID_SIZE + 2]; SYSCTL_NODE(_debug, OID_AUTO, acpi, CTLFLAG_RD, NULL, "ACPI debugging"); static char acpi_ca_version[12]; SYSCTL_STRING(_debug_acpi, OID_AUTO, acpi_ca_version, CTLFLAG_RD, acpi_ca_version, 0, "Version of Intel ACPI-CA"); /* * Allow overriding _OSI methods. */ static char acpi_install_interface[256]; TUNABLE_STR("hw.acpi.install_interface", acpi_install_interface, sizeof(acpi_install_interface)); static char acpi_remove_interface[256]; TUNABLE_STR("hw.acpi.remove_interface", acpi_remove_interface, sizeof(acpi_remove_interface)); /* Allow users to dump Debug objects without ACPI debugger. */ static int acpi_debug_objects; TUNABLE_INT("debug.acpi.enable_debug_objects", &acpi_debug_objects); SYSCTL_PROC(_debug_acpi, OID_AUTO, enable_debug_objects, CTLFLAG_RW | CTLTYPE_INT, NULL, 0, acpi_debug_objects_sysctl, "I", "Enable Debug objects"); /* Allow the interpreter to ignore common mistakes in BIOS. */ static int acpi_interpreter_slack = 1; TUNABLE_INT("debug.acpi.interpreter_slack", &acpi_interpreter_slack); SYSCTL_INT(_debug_acpi, OID_AUTO, interpreter_slack, CTLFLAG_RDTUN, &acpi_interpreter_slack, 1, "Turn on interpreter slack mode."); /* Ignore register widths set by FADT and use default widths instead. */ static int acpi_ignore_reg_width = 1; TUNABLE_INT("debug.acpi.default_register_width", &acpi_ignore_reg_width); SYSCTL_INT(_debug_acpi, OID_AUTO, default_register_width, CTLFLAG_RDTUN, &acpi_ignore_reg_width, 1, "Ignore register widths set by FADT"); #ifdef __amd64__ /* Reset system clock while resuming. XXX Remove once tested. */ static int acpi_reset_clock = 1; TUNABLE_INT("debug.acpi.reset_clock", &acpi_reset_clock); SYSCTL_INT(_debug_acpi, OID_AUTO, reset_clock, CTLFLAG_RW, &acpi_reset_clock, 1, "Reset system clock while resuming."); #endif /* Allow users to override quirks. */ TUNABLE_INT("debug.acpi.quirks", &acpi_quirks); int acpi_susp_bounce; SYSCTL_INT(_debug_acpi, OID_AUTO, suspend_bounce, CTLFLAG_RW, &acpi_susp_bounce, 0, "Don't actually suspend, just test devices."); /* * ACPI can only be loaded as a module by the loader; activating it after * system bootstrap time is not useful, and can be fatal to the system. * It also cannot be unloaded, since the entire system bus hierarchy hangs * off it. */ static int acpi_modevent(struct module *mod, int event, void *junk) { switch (event) { case MOD_LOAD: if (!cold) { printf("The ACPI driver cannot be loaded after boot.\n"); return (EPERM); } break; case MOD_UNLOAD: if (!cold && power_pm_get_type() == POWER_PM_TYPE_ACPI) return (EBUSY); break; default: break; } return (0); } /* * Perform early initialization. */ ACPI_STATUS acpi_Startup(void) { static int started = 0; ACPI_STATUS status; int val; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Only run the startup code once. The MADT driver also calls this. */ if (started) return_VALUE (AE_OK); started = 1; /* * Initialize the ACPICA subsystem. */ if (ACPI_FAILURE(status = AcpiInitializeSubsystem())) { printf("ACPI: Could not initialize Subsystem: %s\n", AcpiFormatException(status)); return_VALUE (status); } /* * Pre-allocate space for RSDT/XSDT and DSDT tables and allow resizing * if more tables exist. */ if (ACPI_FAILURE(status = AcpiInitializeTables(NULL, 2, TRUE))) { printf("ACPI: Table initialisation failed: %s\n", AcpiFormatException(status)); return_VALUE (status); } /* Set up any quirks we have for this system. */ if (acpi_quirks == ACPI_Q_OK) acpi_table_quirks(&acpi_quirks); /* If the user manually set the disabled hint to 0, force-enable ACPI. */ if (resource_int_value("acpi", 0, "disabled", &val) == 0 && val == 0) acpi_quirks &= ~ACPI_Q_BROKEN; if (acpi_quirks & ACPI_Q_BROKEN) { printf("ACPI disabled by blacklist. Contact your BIOS vendor.\n"); status = AE_SUPPORT; } return_VALUE (status); } /* * Detect ACPI and perform early initialisation. */ int acpi_identify(void) { ACPI_TABLE_RSDP *rsdp; ACPI_TABLE_HEADER *rsdt; ACPI_PHYSICAL_ADDRESS paddr; struct sbuf sb; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (!cold) return (ENXIO); /* Check that we haven't been disabled with a hint. */ if (resource_disabled("acpi", 0)) return (ENXIO); /* Check for other PM systems. */ if (power_pm_get_type() != POWER_PM_TYPE_NONE && power_pm_get_type() != POWER_PM_TYPE_ACPI) { printf("ACPI identify failed, other PM system enabled.\n"); return (ENXIO); } /* Initialize root tables. */ if (ACPI_FAILURE(acpi_Startup())) { printf("ACPI: Try disabling either ACPI or apic support.\n"); return (ENXIO); } if ((paddr = AcpiOsGetRootPointer()) == 0 || (rsdp = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_RSDP))) == NULL) return (ENXIO); if (rsdp->Revision > 1 && rsdp->XsdtPhysicalAddress != 0) paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->XsdtPhysicalAddress; else paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->RsdtPhysicalAddress; AcpiOsUnmapMemory(rsdp, sizeof(ACPI_TABLE_RSDP)); if ((rsdt = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_HEADER))) == NULL) return (ENXIO); sbuf_new(&sb, acpi_desc, sizeof(acpi_desc), SBUF_FIXEDLEN); sbuf_bcat(&sb, rsdt->OemId, ACPI_OEM_ID_SIZE); sbuf_trim(&sb); sbuf_putc(&sb, ' '); sbuf_bcat(&sb, rsdt->OemTableId, ACPI_OEM_TABLE_ID_SIZE); sbuf_trim(&sb); sbuf_finish(&sb); sbuf_delete(&sb); AcpiOsUnmapMemory(rsdt, sizeof(ACPI_TABLE_HEADER)); snprintf(acpi_ca_version, sizeof(acpi_ca_version), "%x", ACPI_CA_VERSION); return (0); } /* * Fetch some descriptive data from ACPI to put in our attach message. */ static int acpi_probe(device_t dev) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); device_set_desc(dev, acpi_desc); return_VALUE (BUS_PROBE_NOWILDCARD); } static int acpi_attach(device_t dev) { struct acpi_softc *sc; ACPI_STATUS status; int error, state; UINT32 flags; UINT8 TypeA, TypeB; char *env; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = device_get_softc(dev); sc->acpi_dev = dev; callout_init(&sc->susp_force_to, 1); error = ENXIO; /* Initialize resource manager. */ acpi_rman_io.rm_type = RMAN_ARRAY; acpi_rman_io.rm_start = 0; acpi_rman_io.rm_end = 0xffff; acpi_rman_io.rm_descr = "ACPI I/O ports"; if (rman_init(&acpi_rman_io) != 0) panic("acpi rman_init IO ports failed"); acpi_rman_mem.rm_type = RMAN_ARRAY; acpi_rman_mem.rm_descr = "ACPI I/O memory addresses"; if (rman_init(&acpi_rman_mem) != 0) panic("acpi rman_init memory failed"); /* Initialise the ACPI mutex */ mtx_init(&acpi_mutex, "ACPI global lock", NULL, MTX_DEF); /* * Set the globals from our tunables. This is needed because ACPI-CA * uses UINT8 for some values and we have no tunable_byte. */ AcpiGbl_EnableInterpreterSlack = acpi_interpreter_slack ? TRUE : FALSE; AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE; AcpiGbl_UseDefaultRegisterWidths = acpi_ignore_reg_width ? TRUE : FALSE; #ifndef ACPI_DEBUG /* * Disable all debugging layers and levels. */ AcpiDbgLayer = 0; AcpiDbgLevel = 0; #endif /* Override OS interfaces if the user requested. */ acpi_reset_interfaces(dev); /* Load ACPI name space. */ status = AcpiLoadTables(); if (ACPI_FAILURE(status)) { device_printf(dev, "Could not load Namespace: %s\n", AcpiFormatException(status)); goto out; } #if defined(__i386__) || defined(__amd64__) /* Handle MCFG table if present. */ acpi_enable_pcie(); #endif /* * Note that some systems (specifically, those with namespace evaluation * issues that require the avoidance of parts of the namespace) must * avoid running _INI and _STA on everything, as well as dodging the final * object init pass. * * For these devices, we set ACPI_NO_DEVICE_INIT and ACPI_NO_OBJECT_INIT). * * XXX We should arrange for the object init pass after we have attached * all our child devices, but on many systems it works here. */ flags = 0; if (testenv("debug.acpi.avoid")) flags = ACPI_NO_DEVICE_INIT | ACPI_NO_OBJECT_INIT; /* Bring the hardware and basic handlers online. */ if (ACPI_FAILURE(status = AcpiEnableSubsystem(flags))) { device_printf(dev, "Could not enable ACPI: %s\n", AcpiFormatException(status)); goto out; } /* * Call the ECDT probe function to provide EC functionality before * the namespace has been evaluated. * * XXX This happens before the sysresource devices have been probed and * attached so its resources come from nexus0. In practice, this isn't * a problem but should be addressed eventually. */ acpi_ec_ecdt_probe(dev); /* Bring device objects and regions online. */ if (ACPI_FAILURE(status = AcpiInitializeObjects(flags))) { device_printf(dev, "Could not initialize ACPI objects: %s\n", AcpiFormatException(status)); goto out; } /* * Setup our sysctl tree. * * XXX: This doesn't check to make sure that none of these fail. */ sysctl_ctx_init(&sc->acpi_sysctl_ctx); sc->acpi_sysctl_tree = SYSCTL_ADD_NODE(&sc->acpi_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, device_get_name(dev), CTLFLAG_RD, 0, ""); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "supported_sleep_state", CTLTYPE_STRING | CTLFLAG_RD, 0, 0, acpi_supported_sleep_state_sysctl, "A", "List supported ACPI sleep states."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "power_button_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_power_button_sx, 0, acpi_sleep_state_sysctl, "A", "Power button ACPI sleep state."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "sleep_button_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_sleep_button_sx, 0, acpi_sleep_state_sysctl, "A", "Sleep button ACPI sleep state."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "lid_switch_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_lid_switch_sx, 0, acpi_sleep_state_sysctl, "A", "Lid ACPI sleep state. Set to S3 if you want to suspend your laptop when close the Lid."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "standby_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_standby_sx, 0, acpi_sleep_state_sysctl, "A", ""); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "suspend_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_suspend_sx, 0, acpi_sleep_state_sysctl, "A", ""); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "sleep_delay", CTLFLAG_RW, &sc->acpi_sleep_delay, 0, "sleep delay in seconds"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "s4bios", CTLFLAG_RW, &sc->acpi_s4bios, 0, "S4BIOS mode"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "verbose", CTLFLAG_RW, &sc->acpi_verbose, 0, "verbose mode"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "disable_on_reboot", CTLFLAG_RW, &sc->acpi_do_disable, 0, "Disable ACPI when rebooting/halting system"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "handle_reboot", CTLFLAG_RW, &sc->acpi_handle_reboot, 0, "Use ACPI Reset Register to reboot"); /* * Default to 1 second before sleeping to give some machines time to * stabilize. */ sc->acpi_sleep_delay = 1; if (bootverbose) sc->acpi_verbose = 1; if ((env = kern_getenv("hw.acpi.verbose")) != NULL) { if (strcmp(env, "0") != 0) sc->acpi_verbose = 1; freeenv(env); } /* Only enable reboot by default if the FADT says it is available. */ if (AcpiGbl_FADT.Flags & ACPI_FADT_RESET_REGISTER) sc->acpi_handle_reboot = 1; #if !ACPI_REDUCED_HARDWARE /* Only enable S4BIOS by default if the FACS says it is available. */ if (AcpiGbl_FACS != NULL && AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT) sc->acpi_s4bios = 1; #endif /* Probe all supported sleep states. */ acpi_sleep_states[ACPI_STATE_S0] = TRUE; for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++) if (ACPI_SUCCESS(AcpiEvaluateObject(ACPI_ROOT_OBJECT, __DECONST(char *, AcpiGbl_SleepStateNames[state]), NULL, NULL)) && ACPI_SUCCESS(AcpiGetSleepTypeData(state, &TypeA, &TypeB))) acpi_sleep_states[state] = TRUE; /* * Dispatch the default sleep state to devices. The lid switch is set * to UNKNOWN by default to avoid surprising users. */ sc->acpi_power_button_sx = acpi_sleep_states[ACPI_STATE_S5] ? ACPI_STATE_S5 : ACPI_STATE_UNKNOWN; sc->acpi_lid_switch_sx = ACPI_STATE_UNKNOWN; sc->acpi_standby_sx = acpi_sleep_states[ACPI_STATE_S1] ? ACPI_STATE_S1 : ACPI_STATE_UNKNOWN; sc->acpi_suspend_sx = acpi_sleep_states[ACPI_STATE_S3] ? ACPI_STATE_S3 : ACPI_STATE_UNKNOWN; /* Pick the first valid sleep state for the sleep button default. */ sc->acpi_sleep_button_sx = ACPI_STATE_UNKNOWN; for (state = ACPI_STATE_S1; state <= ACPI_STATE_S4; state++) if (acpi_sleep_states[state]) { sc->acpi_sleep_button_sx = state; break; } acpi_enable_fixed_events(sc); /* * Scan the namespace and attach/initialise children. */ /* Register our shutdown handler. */ EVENTHANDLER_REGISTER(shutdown_final, acpi_shutdown_final, sc, SHUTDOWN_PRI_LAST); /* * Register our acpi event handlers. * XXX should be configurable eg. via userland policy manager. */ EVENTHANDLER_REGISTER(acpi_sleep_event, acpi_system_eventhandler_sleep, sc, ACPI_EVENT_PRI_LAST); EVENTHANDLER_REGISTER(acpi_wakeup_event, acpi_system_eventhandler_wakeup, sc, ACPI_EVENT_PRI_LAST); /* Flag our initial states. */ sc->acpi_enabled = TRUE; sc->acpi_sstate = ACPI_STATE_S0; sc->acpi_sleep_disabled = TRUE; /* Create the control device */ sc->acpi_dev_t = make_dev(&acpi_cdevsw, 0, UID_ROOT, GID_WHEEL, 0644, "acpi"); sc->acpi_dev_t->si_drv1 = sc; if ((error = acpi_machdep_init(dev))) goto out; /* Register ACPI again to pass the correct argument of pm_func. */ power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, sc); if (!acpi_disabled("bus")) { EVENTHANDLER_REGISTER(dev_lookup, acpi_lookup, NULL, 1000); acpi_probe_children(dev); } /* Update all GPEs and enable runtime GPEs. */ status = AcpiUpdateAllGpes(); if (ACPI_FAILURE(status)) device_printf(dev, "Could not update all GPEs: %s\n", AcpiFormatException(status)); /* Allow sleep request after a while. */ callout_init_mtx(&acpi_sleep_timer, &acpi_mutex, 0); callout_reset(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME, acpi_sleep_enable, sc); error = 0; out: return_VALUE (error); } static void acpi_set_power_children(device_t dev, int state) { device_t child; device_t *devlist; int dstate, i, numdevs; if (device_get_children(dev, &devlist, &numdevs) != 0) return; /* * Retrieve and set D-state for the sleep state if _SxD is present. * Skip children who aren't attached since they are handled separately. */ for (i = 0; i < numdevs; i++) { child = devlist[i]; dstate = state; if (device_is_attached(child) && acpi_device_pwr_for_sleep(dev, child, &dstate) == 0) acpi_set_powerstate(child, dstate); } free(devlist, M_TEMP); } static int acpi_suspend(device_t dev) { int error; GIANT_REQUIRED; error = bus_generic_suspend(dev); if (error == 0) acpi_set_power_children(dev, ACPI_STATE_D3); return (error); } static int acpi_resume(device_t dev) { GIANT_REQUIRED; acpi_set_power_children(dev, ACPI_STATE_D0); return (bus_generic_resume(dev)); } static int acpi_shutdown(device_t dev) { GIANT_REQUIRED; /* Allow children to shutdown first. */ bus_generic_shutdown(dev); /* * Enable any GPEs that are able to power-on the system (i.e., RTC). * Also, disable any that are not valid for this state (most). */ acpi_wake_prep_walk(ACPI_STATE_S5); return (0); } /* * Handle a new device being added */ static device_t acpi_add_child(device_t bus, u_int order, const char *name, int unit) { struct acpi_device *ad; device_t child; if ((ad = malloc(sizeof(*ad), M_ACPIDEV, M_NOWAIT | M_ZERO)) == NULL) return (NULL); resource_list_init(&ad->ad_rl); child = device_add_child_ordered(bus, order, name, unit); if (child != NULL) device_set_ivars(child, ad); else free(ad, M_ACPIDEV); return (child); } static int acpi_print_child(device_t bus, device_t child) { struct acpi_device *adev = device_get_ivars(child); struct resource_list *rl = &adev->ad_rl; int retval = 0; retval += bus_print_child_header(bus, child); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); retval += resource_list_print_type(rl, "drq", SYS_RES_DRQ, "%jd"); if (device_get_flags(child)) retval += printf(" flags %#x", device_get_flags(child)); retval += bus_print_child_domain(bus, child); retval += bus_print_child_footer(bus, child); return (retval); } /* * If this device is an ACPI child but no one claimed it, attempt * to power it off. We'll power it back up when a driver is added. * * XXX Disabled for now since many necessary devices (like fdc and * ATA) don't claim the devices we created for them but still expect * them to be powered up. */ static void acpi_probe_nomatch(device_t bus, device_t child) { #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER acpi_set_powerstate(child, ACPI_STATE_D3); #endif } /* * If a new driver has a chance to probe a child, first power it up. * * XXX Disabled for now (see acpi_probe_nomatch for details). */ static void acpi_driver_added(device_t dev, driver_t *driver) { device_t child, *devlist; int i, numdevs; DEVICE_IDENTIFY(driver, dev); if (device_get_children(dev, &devlist, &numdevs)) return; for (i = 0; i < numdevs; i++) { child = devlist[i]; if (device_get_state(child) == DS_NOTPRESENT) { #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER acpi_set_powerstate(child, ACPI_STATE_D0); if (device_probe_and_attach(child) != 0) acpi_set_powerstate(child, ACPI_STATE_D3); #else device_probe_and_attach(child); #endif } } free(devlist, M_TEMP); } /* Location hint for devctl(8) */ static int acpi_child_location_str_method(device_t cbdev, device_t child, char *buf, size_t buflen) { struct acpi_device *dinfo = device_get_ivars(child); char buf2[32]; int pxm; if (dinfo->ad_handle) { snprintf(buf, buflen, "handle=%s", acpi_name(dinfo->ad_handle)); if (ACPI_SUCCESS(acpi_GetInteger(dinfo->ad_handle, "_PXM", &pxm))) { snprintf(buf2, 32, " _PXM=%d", pxm); strlcat(buf, buf2, buflen); } } else { snprintf(buf, buflen, "unknown"); } return (0); } /* PnP information for devctl(8) */ static int acpi_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf, size_t buflen) { struct acpi_device *dinfo = device_get_ivars(child); ACPI_DEVICE_INFO *adinfo; if (ACPI_FAILURE(AcpiGetObjectInfo(dinfo->ad_handle, &adinfo))) { snprintf(buf, buflen, "unknown"); return (0); } snprintf(buf, buflen, "_HID=%s _UID=%lu", (adinfo->Valid & ACPI_VALID_HID) ? adinfo->HardwareId.String : "none", (adinfo->Valid & ACPI_VALID_UID) ? strtoul(adinfo->UniqueId.String, NULL, 10) : 0UL); AcpiOsFree(adinfo); return (0); } /* * Handle per-device ivars */ static int acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct acpi_device *ad; if ((ad = device_get_ivars(child)) == NULL) { device_printf(child, "device has no ivars\n"); return (ENOENT); } /* ACPI and ISA compatibility ivars */ switch(index) { case ACPI_IVAR_HANDLE: *(ACPI_HANDLE *)result = ad->ad_handle; break; case ACPI_IVAR_PRIVATE: *(void **)result = ad->ad_private; break; case ACPI_IVAR_FLAGS: *(int *)result = ad->ad_flags; break; case ISA_IVAR_VENDORID: case ISA_IVAR_SERIAL: case ISA_IVAR_COMPATID: *(int *)result = -1; break; case ISA_IVAR_LOGICALID: *(int *)result = acpi_isa_get_logicalid(child); break; default: return (ENOENT); } return (0); } static int acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct acpi_device *ad; if ((ad = device_get_ivars(child)) == NULL) { device_printf(child, "device has no ivars\n"); return (ENOENT); } switch(index) { case ACPI_IVAR_HANDLE: ad->ad_handle = (ACPI_HANDLE)value; break; case ACPI_IVAR_PRIVATE: ad->ad_private = (void *)value; break; case ACPI_IVAR_FLAGS: ad->ad_flags = (int)value; break; default: panic("bad ivar write request (%d)", index); return (ENOENT); } return (0); } /* * Handle child resource allocation/removal */ static struct resource_list * acpi_get_rlist(device_t dev, device_t child) { struct acpi_device *ad; ad = device_get_ivars(child); return (&ad->ad_rl); } static int acpi_match_resource_hint(device_t dev, int type, long value) { struct acpi_device *ad = device_get_ivars(dev); struct resource_list *rl = &ad->ad_rl; struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->start <= value && rle->end >= value) return (1); } return (0); } /* * Wire device unit numbers based on resource matches in hints. */ static void acpi_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp) { const char *s; long value; int line, matches, unit; /* * Iterate over all the hints for the devices with the specified * name to see if one's resources are a subset of this device. */ line = 0; for (;;) { if (resource_find_dev(&line, name, &unit, "at", NULL) != 0) break; /* Must have an "at" for acpi or isa. */ resource_string_value(name, unit, "at", &s); if (!(strcmp(s, "acpi0") == 0 || strcmp(s, "acpi") == 0 || strcmp(s, "isa0") == 0 || strcmp(s, "isa") == 0)) continue; /* * Check for matching resources. We must have at least one match. * Since I/O and memory resources cannot be shared, if we get a * match on either of those, ignore any mismatches in IRQs or DRQs. * * XXX: We may want to revisit this to be more lenient and wire * as long as it gets one match. */ matches = 0; if (resource_long_value(name, unit, "port", &value) == 0) { /* * Floppy drive controllers are notorious for having a * wide variety of resources not all of which include the * first port that is specified by the hint (typically * 0x3f0) (see the comment above fdc_isa_alloc_resources() * in fdc_isa.c). However, they do all seem to include * port + 2 (e.g. 0x3f2) so for a floppy device, look for * 'value + 2' in the port resources instead of the hint * value. */ if (strcmp(name, "fdc") == 0) value += 2; if (acpi_match_resource_hint(child, SYS_RES_IOPORT, value)) matches++; else continue; } if (resource_long_value(name, unit, "maddr", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_MEMORY, value)) matches++; else continue; } if (matches > 0) goto matched; if (resource_long_value(name, unit, "irq", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_IRQ, value)) matches++; else continue; } if (resource_long_value(name, unit, "drq", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_DRQ, value)) matches++; else continue; } matched: if (matches > 0) { /* We have a winner! */ *unitp = unit; break; } } } /* * Fetch the NUMA domain for a device by mapping the value returned by * _PXM to a NUMA domain. If the device does not have a _PXM method, * -2 is returned. If any other error occurs, -1 is returned. */ static int acpi_parse_pxm(device_t dev) { #ifdef DEVICE_NUMA ACPI_HANDLE handle; ACPI_STATUS status; int pxm; handle = acpi_get_handle(dev); if (handle == NULL) return (-2); status = acpi_GetInteger(handle, "_PXM", &pxm); if (ACPI_SUCCESS(status)) return (acpi_map_pxm_to_vm_domainid(pxm)); if (status == AE_NOT_FOUND) return (-2); #endif return (-1); } int acpi_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { int d, error; d = acpi_parse_pxm(child); if (d < 0) return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); switch (op) { case LOCAL_CPUS: if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = cpuset_domain[d]; return (0); case INTR_CPUS: error = bus_generic_get_cpus(dev, child, op, setsize, cpuset); if (error != 0) return (error); if (setsize != sizeof(cpuset_t)) return (EINVAL); CPU_AND(cpuset, &cpuset_domain[d]); return (0); default: return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); } } /* * Fetch the NUMA domain for the given device 'dev'. * * If a device has a _PXM method, map that to a NUMA domain. * Otherwise, pass the request up to the parent. * If there's no matching domain or the domain cannot be * determined, return ENOENT. */ int acpi_get_domain(device_t dev, device_t child, int *domain) { int d; d = acpi_parse_pxm(child); if (d >= 0) { *domain = d; return (0); } if (d == -1) return (ENOENT); /* No _PXM node; go up a level */ return (bus_generic_get_domain(dev, child, domain)); } /* * Pre-allocate/manage all memory and IO resources. Since rman can't handle * duplicates, we merge any in the sysresource attach routine. */ static int acpi_sysres_alloc(device_t dev) { struct resource *res; struct resource_list *rl; struct resource_list_entry *rle; struct rman *rm; char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL }; device_t *children; int child_count, i; /* * Probe/attach any sysresource devices. This would be unnecessary if we * had multi-pass probe/attach. */ if (device_get_children(dev, &children, &child_count) != 0) return (ENXIO); for (i = 0; i < child_count; i++) { if (ACPI_ID_PROBE(dev, children[i], sysres_ids) != NULL) device_probe_and_attach(children[i]); } free(children, M_TEMP); rl = BUS_GET_RESOURCE_LIST(device_get_parent(dev), dev); STAILQ_FOREACH(rle, rl, link) { if (rle->res != NULL) { device_printf(dev, "duplicate resource for %jx\n", rle->start); continue; } /* Only memory and IO resources are valid here. */ switch (rle->type) { case SYS_RES_IOPORT: rm = &acpi_rman_io; break; case SYS_RES_MEMORY: rm = &acpi_rman_mem; break; default: continue; } /* Pre-allocate resource and add to our rman pool. */ res = BUS_ALLOC_RESOURCE(device_get_parent(dev), dev, rle->type, &rle->rid, rle->start, rle->start + rle->count - 1, rle->count, 0); if (res != NULL) { rman_manage_region(rm, rman_get_start(res), rman_get_end(res)); rle->res = res; } else if (bootverbose) device_printf(dev, "reservation of %jx, %jx (%d) failed\n", rle->start, rle->count, rle->type); } return (0); } static char *pcilink_ids[] = { "PNP0C0F", NULL }; static char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL }; /* * Reserve declared resources for devices found during attach once system * resources have been allocated. */ static void acpi_reserve_resources(device_t dev) { struct resource_list_entry *rle; struct resource_list *rl; struct acpi_device *ad; struct acpi_softc *sc; device_t *children; int child_count, i; sc = device_get_softc(dev); if (device_get_children(dev, &children, &child_count) != 0) return; for (i = 0; i < child_count; i++) { ad = device_get_ivars(children[i]); rl = &ad->ad_rl; /* Don't reserve system resources. */ if (ACPI_ID_PROBE(dev, children[i], sysres_ids) != NULL) continue; STAILQ_FOREACH(rle, rl, link) { /* * Don't reserve IRQ resources. There are many sticky things * to get right otherwise (e.g. IRQs for psm, atkbd, and HPET * when using legacy routing). */ if (rle->type == SYS_RES_IRQ) continue; /* * Don't reserve the resource if it is already allocated. * The acpi_ec(4) driver can allocate its resources early * if ECDT is present. */ if (rle->res != NULL) continue; /* * Try to reserve the resource from our parent. If this * fails because the resource is a system resource, just * let it be. The resource range is already reserved so * that other devices will not use it. If the driver * needs to allocate the resource, then * acpi_alloc_resource() will sub-alloc from the system * resource. */ resource_list_reserve(rl, dev, children[i], rle->type, &rle->rid, rle->start, rle->end, rle->count, 0); } } free(children, M_TEMP); sc->acpi_resources_reserved = 1; } static int acpi_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct acpi_softc *sc = device_get_softc(dev); struct acpi_device *ad = device_get_ivars(child); struct resource_list *rl = &ad->ad_rl; ACPI_DEVICE_INFO *devinfo; rman_res_t end; /* Ignore IRQ resources for PCI link devices. */ if (type == SYS_RES_IRQ && ACPI_ID_PROBE(dev, child, pcilink_ids) != NULL) return (0); /* * Ignore most resources for PCI root bridges. Some BIOSes * incorrectly enumerate the memory ranges they decode as plain * memory resources instead of as ResourceProducer ranges. Other * BIOSes incorrectly list system resource entries for I/O ranges * under the PCI bridge. Do allow the one known-correct case on * x86 of a PCI bridge claiming the I/O ports used for PCI config * access. */ if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) { if (ACPI_SUCCESS(AcpiGetObjectInfo(ad->ad_handle, &devinfo))) { if ((devinfo->Flags & ACPI_PCI_ROOT_BRIDGE) != 0) { #if defined(__i386__) || defined(__amd64__) if (!(type == SYS_RES_IOPORT && start == CONF1_ADDR_PORT)) #endif { AcpiOsFree(devinfo); return (0); } } AcpiOsFree(devinfo); } } /* If the resource is already allocated, fail. */ if (resource_list_busy(rl, type, rid)) return (EBUSY); /* If the resource is already reserved, release it. */ if (resource_list_reserved(rl, type, rid)) resource_list_unreserve(rl, dev, child, type, rid); /* Add the resource. */ end = (start + count - 1); resource_list_add(rl, type, rid, start, end, count); /* Don't reserve resources until the system resources are allocated. */ if (!sc->acpi_resources_reserved) return (0); /* Don't reserve system resources. */ if (ACPI_ID_PROBE(dev, child, sysres_ids) != NULL) return (0); /* * Don't reserve IRQ resources. There are many sticky things to * get right otherwise (e.g. IRQs for psm, atkbd, and HPET when * using legacy routing). */ if (type == SYS_RES_IRQ) return (0); /* * Don't reserve resources for CPU devices. Some of these * resources need to be allocated as shareable, but reservations * are always non-shareable. */ if (device_get_devclass(child) == devclass_find("cpu")) return (0); /* * Reserve the resource. * * XXX: Ignores failure for now. Failure here is probably a * BIOS/firmware bug? */ resource_list_reserve(rl, dev, child, type, &rid, start, end, count, 0); return (0); } static struct resource * acpi_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { ACPI_RESOURCE ares; struct acpi_device *ad; struct resource_list_entry *rle; struct resource_list *rl; struct resource *res; int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); /* * First attempt at allocating the resource. For direct children, * use resource_list_alloc() to handle reserved resources. For * other devices, pass the request up to our parent. */ if (bus == device_get_parent(child)) { ad = device_get_ivars(child); rl = &ad->ad_rl; /* * Simulate the behavior of the ISA bus for direct children * devices. That is, if a non-default range is specified for * a resource that doesn't exist, use bus_set_resource() to * add the resource before allocating it. Note that these * resources will not be reserved. */ if (!isdefault && resource_list_find(rl, type, *rid) == NULL) resource_list_add(rl, type, *rid, start, end, count); res = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (res != NULL && type == SYS_RES_IRQ) { /* * Since bus_config_intr() takes immediate effect, we cannot * configure the interrupt associated with a device when we * parse the resources but have to defer it until a driver * actually allocates the interrupt via bus_alloc_resource(). * * XXX: Should we handle the lookup failing? */ if (ACPI_SUCCESS(acpi_lookup_irq_resource(child, *rid, res, &ares))) acpi_config_intr(child, &ares); } /* * If this is an allocation of the "default" range for a given * RID, fetch the exact bounds for this resource from the * resource list entry to try to allocate the range from the * system resource regions. */ if (res == NULL && isdefault) { rle = resource_list_find(rl, type, *rid); if (rle != NULL) { start = rle->start; end = rle->end; count = rle->count; } } } else res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * If the first attempt failed and this is an allocation of a * specific range, try to satisfy the request via a suballocation * from our system resource regions. */ if (res == NULL && start + count - 1 == end) res = acpi_alloc_sysres(child, type, rid, start, end, count, flags); return (res); } /* * Attempt to allocate a specific resource range from the system * resource ranges. Note that we only handle memory and I/O port * system resources. */ struct resource * acpi_alloc_sysres(device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct rman *rm; struct resource *res; switch (type) { case SYS_RES_IOPORT: rm = &acpi_rman_io; break; case SYS_RES_MEMORY: rm = &acpi_rman_mem; break; default: return (NULL); } KASSERT(start + count - 1 == end, ("wildcard resource range")); res = rman_reserve_resource(rm, start, end, count, flags & ~RF_ACTIVE, child); if (res == NULL) return (NULL); rman_set_rid(res, *rid); /* If requested, activate the resource using the parent's method. */ if (flags & RF_ACTIVE) if (bus_activate_resource(child, type, *rid, res) != 0) { rman_release_resource(res); return (NULL); } return (res); } static int acpi_is_resource_managed(int type, struct resource *r) { /* We only handle memory and IO resources through rman. */ switch (type) { case SYS_RES_IOPORT: return (rman_is_region_manager(r, &acpi_rman_io)); case SYS_RES_MEMORY: return (rman_is_region_manager(r, &acpi_rman_mem)); } return (0); } static int acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (acpi_is_resource_managed(type, r)) return (rman_adjust_resource(r, start, end)); return (bus_generic_adjust_resource(bus, child, type, r, start, end)); } static int acpi_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { int ret; /* * If this resource belongs to one of our internal managers, * deactivate it and release it to the local pool. */ if (acpi_is_resource_managed(type, r)) { if (rman_get_flags(r) & RF_ACTIVE) { ret = bus_deactivate_resource(child, type, rid, r); if (ret != 0) return (ret); } return (rman_release_resource(r)); } return (bus_generic_rl_release_resource(bus, child, type, rid, r)); } static void acpi_delete_resource(device_t bus, device_t child, int type, int rid) { struct resource_list *rl; rl = acpi_get_rlist(bus, child); if (resource_list_busy(rl, type, rid)) { device_printf(bus, "delete_resource: Resource still owned by child" " (type=%d, rid=%d)\n", type, rid); return; } resource_list_unreserve(rl, bus, child, type, rid); resource_list_delete(rl, type, rid); } /* Allocate an IO port or memory resource, given its GAS. */ int acpi_bus_alloc_gas(device_t dev, int *type, int *rid, ACPI_GENERIC_ADDRESS *gas, struct resource **res, u_int flags) { int error, res_type; error = ENOMEM; if (type == NULL || rid == NULL || gas == NULL || res == NULL) return (EINVAL); /* We only support memory and IO spaces. */ switch (gas->SpaceId) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: res_type = SYS_RES_MEMORY; break; case ACPI_ADR_SPACE_SYSTEM_IO: res_type = SYS_RES_IOPORT; break; default: return (EOPNOTSUPP); } /* * If the register width is less than 8, assume the BIOS author means * it is a bit field and just allocate a byte. */ if (gas->BitWidth && gas->BitWidth < 8) gas->BitWidth = 8; /* Validate the address after we're sure we support the space. */ if (gas->Address == 0 || gas->BitWidth == 0) return (EINVAL); bus_set_resource(dev, res_type, *rid, gas->Address, gas->BitWidth / 8); *res = bus_alloc_resource_any(dev, res_type, rid, RF_ACTIVE | flags); if (*res != NULL) { *type = res_type; error = 0; } else bus_delete_resource(dev, res_type, *rid); return (error); } /* Probe _HID and _CID for compatible ISA PNP ids. */ static uint32_t acpi_isa_get_logicalid(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; uint32_t pnpid; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Fetch and validate the HID. */ if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return_VALUE (0); pnpid = (devinfo->Valid & ACPI_VALID_HID) != 0 && devinfo->HardwareId.Length >= ACPI_EISAID_STRING_SIZE ? PNP_EISAID(devinfo->HardwareId.String) : 0; AcpiOsFree(devinfo); return_VALUE (pnpid); } static int acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count) { ACPI_DEVICE_INFO *devinfo; ACPI_PNP_DEVICE_ID *ids; ACPI_HANDLE h; uint32_t *pnpid; int i, valid; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); pnpid = cids; /* Fetch and validate the CID */ if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return_VALUE (0); if ((devinfo->Valid & ACPI_VALID_CID) == 0) { AcpiOsFree(devinfo); return_VALUE (0); } if (devinfo->CompatibleIdList.Count < count) count = devinfo->CompatibleIdList.Count; ids = devinfo->CompatibleIdList.Ids; for (i = 0, valid = 0; i < count; i++) if (ids[i].Length >= ACPI_EISAID_STRING_SIZE && strncmp(ids[i].String, "PNP", 3) == 0) { *pnpid++ = PNP_EISAID(ids[i].String); valid++; } AcpiOsFree(devinfo); return_VALUE (valid); } static char * acpi_device_id_probe(device_t bus, device_t dev, char **ids) { ACPI_HANDLE h; ACPI_OBJECT_TYPE t; int i; h = acpi_get_handle(dev); if (ids == NULL || h == NULL) return (NULL); t = acpi_get_type(dev); if (t != ACPI_TYPE_DEVICE && t != ACPI_TYPE_PROCESSOR) return (NULL); /* Try to match one of the array of IDs with a HID or CID. */ for (i = 0; ids[i] != NULL; i++) { if (acpi_MatchHid(h, ids[i])) return (ids[i]); } return (NULL); } static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret) { ACPI_HANDLE h; if (dev == NULL) h = ACPI_ROOT_OBJECT; else if ((h = acpi_get_handle(dev)) == NULL) return (AE_BAD_PARAMETER); return (AcpiEvaluateObject(h, pathname, parameters, ret)); } int acpi_device_pwr_for_sleep(device_t bus, device_t dev, int *dstate) { struct acpi_softc *sc; ACPI_HANDLE handle; ACPI_STATUS status; char sxd[8]; handle = acpi_get_handle(dev); /* * XXX If we find these devices, don't try to power them down. * The serial and IRDA ports on my T23 hang the system when * set to D3 and it appears that such legacy devices may * need special handling in their drivers. */ if (dstate == NULL || handle == NULL || acpi_MatchHid(handle, "PNP0500") || acpi_MatchHid(handle, "PNP0501") || acpi_MatchHid(handle, "PNP0502") || acpi_MatchHid(handle, "PNP0510") || acpi_MatchHid(handle, "PNP0511")) return (ENXIO); /* * Override next state with the value from _SxD, if present. * Note illegal _S0D is evaluated because some systems expect this. */ sc = device_get_softc(bus); snprintf(sxd, sizeof(sxd), "_S%dD", sc->acpi_sstate); status = acpi_GetInteger(handle, sxd, dstate); if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { device_printf(dev, "failed to get %s on %s: %s\n", sxd, acpi_name(handle), AcpiFormatException(status)); return (ENXIO); } return (0); } /* Callback arg for our implementation of walking the namespace. */ struct acpi_device_scan_ctx { acpi_scan_cb_t user_fn; void *arg; ACPI_HANDLE parent; }; static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *arg, void **retval) { struct acpi_device_scan_ctx *ctx; device_t dev, old_dev; ACPI_STATUS status; ACPI_OBJECT_TYPE type; /* * Skip this device if we think we'll have trouble with it or it is * the parent where the scan began. */ ctx = (struct acpi_device_scan_ctx *)arg; if (acpi_avoid(h) || h == ctx->parent) return (AE_OK); /* If this is not a valid device type (e.g., a method), skip it. */ if (ACPI_FAILURE(AcpiGetType(h, &type))) return (AE_OK); if (type != ACPI_TYPE_DEVICE && type != ACPI_TYPE_PROCESSOR && type != ACPI_TYPE_THERMAL && type != ACPI_TYPE_POWER) return (AE_OK); /* * Call the user function with the current device. If it is unchanged * afterwards, return. Otherwise, we update the handle to the new dev. */ old_dev = acpi_get_device(h); dev = old_dev; status = ctx->user_fn(h, &dev, level, ctx->arg); if (ACPI_FAILURE(status) || old_dev == dev) return (status); /* Remove the old child and its connection to the handle. */ if (old_dev != NULL) { device_delete_child(device_get_parent(old_dev), old_dev); AcpiDetachData(h, acpi_fake_objhandler); } /* Recreate the handle association if the user created a device. */ if (dev != NULL) AcpiAttachData(h, acpi_fake_objhandler, dev); return (AE_OK); } static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev, int max_depth, acpi_scan_cb_t user_fn, void *arg) { ACPI_HANDLE h; struct acpi_device_scan_ctx ctx; if (acpi_disabled("children")) return (AE_OK); if (dev == NULL) h = ACPI_ROOT_OBJECT; else if ((h = acpi_get_handle(dev)) == NULL) return (AE_BAD_PARAMETER); ctx.user_fn = user_fn; ctx.arg = arg; ctx.parent = h; return (AcpiWalkNamespace(ACPI_TYPE_ANY, h, max_depth, acpi_device_scan_cb, NULL, &ctx, NULL)); } /* * Even though ACPI devices are not PCI, we use the PCI approach for setting * device power states since it's close enough to ACPI. */ static int acpi_set_powerstate(device_t child, int state) { ACPI_HANDLE h; ACPI_STATUS status; h = acpi_get_handle(child); if (state < ACPI_STATE_D0 || state > ACPI_D_STATES_MAX) return (EINVAL); if (h == NULL) return (0); /* Ignore errors if the power methods aren't present. */ status = acpi_pwr_switch_consumer(h, state); if (ACPI_SUCCESS(status)) { if (bootverbose) device_printf(child, "set ACPI power state D%d on %s\n", state, acpi_name(h)); } else if (status != AE_NOT_FOUND) device_printf(child, "failed to set ACPI power state D%d on %s: %s\n", state, acpi_name(h), AcpiFormatException(status)); return (0); } static int acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids) { int result, cid_count, i; uint32_t lid, cids[8]; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * ISA-style drivers attached to ACPI may persist and * probe manually if we return ENOENT. We never want * that to happen, so don't ever return it. */ result = ENXIO; /* Scan the supplied IDs for a match */ lid = acpi_isa_get_logicalid(child); cid_count = acpi_isa_get_compatid(child, cids, 8); while (ids && ids->ip_id) { if (lid == ids->ip_id) { result = 0; goto out; } for (i = 0; i < cid_count; i++) { if (cids[i] == ids->ip_id) { result = 0; goto out; } } ids++; } out: if (result == 0 && ids->ip_desc) device_set_desc(child, ids->ip_desc); return_VALUE (result); } #if defined(__i386__) || defined(__amd64__) /* * Look for a MCFG table. If it is present, use the settings for * domain (segment) 0 to setup PCI config space access via the memory * map. */ static void acpi_enable_pcie(void) { ACPI_TABLE_HEADER *hdr; ACPI_MCFG_ALLOCATION *alloc, *end; ACPI_STATUS status; status = AcpiGetTable(ACPI_SIG_MCFG, 1, &hdr); if (ACPI_FAILURE(status)) return; end = (ACPI_MCFG_ALLOCATION *)((char *)hdr + hdr->Length); alloc = (ACPI_MCFG_ALLOCATION *)((ACPI_TABLE_MCFG *)hdr + 1); while (alloc < end) { if (alloc->PciSegment == 0) { pcie_cfgregopen(alloc->Address, alloc->StartBusNumber, alloc->EndBusNumber); return; } alloc++; } } #endif /* * Scan all of the ACPI namespace and attach child devices. * * We should only expect to find devices in the \_PR, \_TZ, \_SI, and * \_SB scopes, and \_PR and \_TZ became obsolete in the ACPI 2.0 spec. * However, in violation of the spec, some systems place their PCI link * devices in \, so we have to walk the whole namespace. We check the * type of namespace nodes, so this should be ok. */ static void acpi_probe_children(device_t bus) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * Scan the namespace and insert placeholders for all the devices that * we find. We also probe/attach any early devices. * * Note that we use AcpiWalkNamespace rather than AcpiGetDevices because * we want to create nodes for all devices, not just those that are * currently present. (This assumes that we don't want to create/remove * devices as they appear, which might be smarter.) */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "namespace scan\n")); AcpiWalkNamespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, 100, acpi_probe_child, NULL, bus, NULL); /* Pre-allocate resources for our rman from any sysresource devices. */ acpi_sysres_alloc(bus); /* Reserve resources already allocated to children. */ acpi_reserve_resources(bus); /* Create any static children by calling device identify methods. */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "device identify routines\n")); bus_generic_probe(bus); /* Probe/attach all children, created statically and from the namespace. */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "acpi bus_generic_attach\n")); bus_generic_attach(bus); /* Attach wake sysctls. */ acpi_wake_sysctl_walk(bus); ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "done attaching children\n")); return_VOID; } /* * Determine the probe order for a given device. */ static void acpi_probe_order(ACPI_HANDLE handle, int *order) { ACPI_OBJECT_TYPE type; /* * 0. CPUs * 1. I/O port and memory system resource holders * 2. Clocks and timers (to handle early accesses) * 3. Embedded controllers (to handle early accesses) * 4. PCI Link Devices */ AcpiGetType(handle, &type); if (type == ACPI_TYPE_PROCESSOR) *order = 0; else if (acpi_MatchHid(handle, "PNP0C01") || acpi_MatchHid(handle, "PNP0C02")) *order = 1; else if (acpi_MatchHid(handle, "PNP0100") || acpi_MatchHid(handle, "PNP0103") || acpi_MatchHid(handle, "PNP0B00")) *order = 2; else if (acpi_MatchHid(handle, "PNP0C09")) *order = 3; else if (acpi_MatchHid(handle, "PNP0C0F")) *order = 4; } /* * Evaluate a child device and determine whether we might attach a device to * it. */ static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status) { struct acpi_prw_data prw; ACPI_OBJECT_TYPE type; ACPI_HANDLE h; device_t bus, child; char *handle_str; int order; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (acpi_disabled("children")) return_ACPI_STATUS (AE_OK); /* Skip this device if we think we'll have trouble with it. */ if (acpi_avoid(handle)) return_ACPI_STATUS (AE_OK); bus = (device_t)context; if (ACPI_SUCCESS(AcpiGetType(handle, &type))) { handle_str = acpi_name(handle); switch (type) { case ACPI_TYPE_DEVICE: /* * Since we scan from \, be sure to skip system scope objects. * \_SB_ and \_TZ_ are defined in ACPICA as devices to work around * BIOS bugs. For example, \_SB_ is to allow \_SB_._INI to be run * during the initialization and \_TZ_ is to support Notify() on it. */ if (strcmp(handle_str, "\\_SB_") == 0 || strcmp(handle_str, "\\_TZ_") == 0) break; if (acpi_parse_prw(handle, &prw) == 0) AcpiSetupGpeForWake(handle, prw.gpe_handle, prw.gpe_bit); /* * Ignore devices that do not have a _HID or _CID. They should * be discovered by other buses (e.g. the PCI bus driver). */ if (!acpi_has_hid(handle)) break; /* FALLTHROUGH */ case ACPI_TYPE_PROCESSOR: case ACPI_TYPE_THERMAL: case ACPI_TYPE_POWER: /* * Create a placeholder device for this node. Sort the * placeholder so that the probe/attach passes will run * breadth-first. Orders less than ACPI_DEV_BASE_ORDER * are reserved for special objects (i.e., system * resources). */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "scanning '%s'\n", handle_str)); order = level * 10 + ACPI_DEV_BASE_ORDER; acpi_probe_order(handle, &order); child = BUS_ADD_CHILD(bus, order, NULL, -1); if (child == NULL) break; /* Associate the handle with the device_t and vice versa. */ acpi_set_handle(child, handle); AcpiAttachData(handle, acpi_fake_objhandler, child); /* * Check that the device is present. If it's not present, * leave it disabled (so that we have a device_t attached to * the handle, but we don't probe it). * * XXX PCI link devices sometimes report "present" but not * "functional" (i.e. if disabled). Go ahead and probe them * anyway since we may enable them later. */ if (type == ACPI_TYPE_DEVICE && !acpi_DeviceIsPresent(child)) { /* Never disable PCI link devices. */ if (acpi_MatchHid(handle, "PNP0C0F")) break; /* * Docking stations should remain enabled since the system * may be undocked at boot. */ if (ACPI_SUCCESS(AcpiGetHandle(handle, "_DCK", &h))) break; device_disable(child); break; } /* * Get the device's resource settings and attach them. * Note that if the device has _PRS but no _CRS, we need * to decide when it's appropriate to try to configure the * device. Ignore the return value here; it's OK for the * device not to have any resources. */ acpi_parse_resources(child, handle, &acpi_res_parse_set, NULL); break; } } return_ACPI_STATUS (AE_OK); } /* * AcpiAttachData() requires an object handler but never uses it. This is a * placeholder object handler so we can store a device_t in an ACPI_HANDLE. */ void acpi_fake_objhandler(ACPI_HANDLE h, void *data) { } static void acpi_shutdown_final(void *arg, int howto) { struct acpi_softc *sc = (struct acpi_softc *)arg; register_t intr; ACPI_STATUS status; /* * XXX Shutdown code should only run on the BSP (cpuid 0). * Some chipsets do not power off the system correctly if called from * an AP. */ if ((howto & RB_POWEROFF) != 0) { status = AcpiEnterSleepStatePrep(ACPI_STATE_S5); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n", AcpiFormatException(status)); return; } device_printf(sc->acpi_dev, "Powering system off\n"); intr = intr_disable(); status = AcpiEnterSleepState(ACPI_STATE_S5); if (ACPI_FAILURE(status)) { intr_restore(intr); device_printf(sc->acpi_dev, "power-off failed - %s\n", AcpiFormatException(status)); } else { DELAY(1000000); intr_restore(intr); device_printf(sc->acpi_dev, "power-off failed - timeout\n"); } } else if ((howto & RB_HALT) == 0 && sc->acpi_handle_reboot) { /* Reboot using the reset register. */ status = AcpiReset(); if (ACPI_SUCCESS(status)) { DELAY(1000000); device_printf(sc->acpi_dev, "reset failed - timeout\n"); } else if (status != AE_NOT_EXIST) device_printf(sc->acpi_dev, "reset failed - %s\n", AcpiFormatException(status)); } else if (sc->acpi_do_disable && panicstr == NULL) { /* * Only disable ACPI if the user requested. On some systems, writing * the disable value to SMI_CMD hangs the system. */ device_printf(sc->acpi_dev, "Shutting down\n"); AcpiTerminate(); } } static void acpi_enable_fixed_events(struct acpi_softc *sc) { static int first_time = 1; /* Enable and clear fixed events and install handlers. */ if ((AcpiGbl_FADT.Flags & ACPI_FADT_POWER_BUTTON) == 0) { AcpiClearEvent(ACPI_EVENT_POWER_BUTTON); AcpiInstallFixedEventHandler(ACPI_EVENT_POWER_BUTTON, acpi_event_power_button_sleep, sc); if (first_time) device_printf(sc->acpi_dev, "Power Button (fixed)\n"); } if ((AcpiGbl_FADT.Flags & ACPI_FADT_SLEEP_BUTTON) == 0) { AcpiClearEvent(ACPI_EVENT_SLEEP_BUTTON); AcpiInstallFixedEventHandler(ACPI_EVENT_SLEEP_BUTTON, acpi_event_sleep_button_sleep, sc); if (first_time) device_printf(sc->acpi_dev, "Sleep Button (fixed)\n"); } first_time = 0; } /* * Returns true if the device is actually present and should * be attached to. This requires the present, enabled, UI-visible * and diagnostics-passed bits to be set. */ BOOLEAN acpi_DeviceIsPresent(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; BOOLEAN present; if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); /* Onboard serial ports on certain AMD motherboards have an invalid _STA * method that always returns 0. Force them to always be treated as present. * * This may solely be a quirk of a preproduction BIOS. */ if (acpi_MatchHid(h, "AMDI0020") || acpi_MatchHid(h, "AMDI0010")) return (TRUE); /* If no _STA method, must be present */ present = (devinfo->Valid & ACPI_VALID_STA) == 0 || ACPI_DEVICE_PRESENT(devinfo->CurrentStatus) ? TRUE : FALSE; AcpiOsFree(devinfo); return (present); } /* * Returns true if the battery is actually present and inserted. */ BOOLEAN acpi_BatteryIsPresent(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; BOOLEAN present; if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); /* If no _STA method, must be present */ present = (devinfo->Valid & ACPI_VALID_STA) == 0 || ACPI_BATTERY_PRESENT(devinfo->CurrentStatus) ? TRUE : FALSE; AcpiOsFree(devinfo); return (present); } /* * Returns true if a device has at least one valid device ID. */ static BOOLEAN acpi_has_hid(ACPI_HANDLE h) { ACPI_DEVICE_INFO *devinfo; BOOLEAN ret; if (h == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); ret = FALSE; if ((devinfo->Valid & ACPI_VALID_HID) != 0) ret = TRUE; else if ((devinfo->Valid & ACPI_VALID_CID) != 0) if (devinfo->CompatibleIdList.Count > 0) ret = TRUE; AcpiOsFree(devinfo); return (ret); } /* * Match a HID string against a handle */ BOOLEAN acpi_MatchHid(ACPI_HANDLE h, const char *hid) { ACPI_DEVICE_INFO *devinfo; BOOLEAN ret; int i; if (hid == NULL || h == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); ret = FALSE; if ((devinfo->Valid & ACPI_VALID_HID) != 0 && strcmp(hid, devinfo->HardwareId.String) == 0) ret = TRUE; else if ((devinfo->Valid & ACPI_VALID_CID) != 0) for (i = 0; i < devinfo->CompatibleIdList.Count; i++) { if (strcmp(hid, devinfo->CompatibleIdList.Ids[i].String) == 0) { ret = TRUE; break; } } AcpiOsFree(devinfo); return (ret); } /* * Return the handle of a named object within our scope, ie. that of (parent) * or one if its parents. */ ACPI_STATUS acpi_GetHandleInScope(ACPI_HANDLE parent, char *path, ACPI_HANDLE *result) { ACPI_HANDLE r; ACPI_STATUS status; /* Walk back up the tree to the root */ for (;;) { status = AcpiGetHandle(parent, path, &r); if (ACPI_SUCCESS(status)) { *result = r; return (AE_OK); } /* XXX Return error here? */ if (status != AE_NOT_FOUND) return (AE_OK); if (ACPI_FAILURE(AcpiGetParent(parent, &r))) return (AE_NOT_FOUND); parent = r; } } /* * Allocate a buffer with a preset data size. */ ACPI_BUFFER * acpi_AllocBuffer(int size) { ACPI_BUFFER *buf; if ((buf = malloc(size + sizeof(*buf), M_ACPIDEV, M_NOWAIT)) == NULL) return (NULL); buf->Length = size; buf->Pointer = (void *)(buf + 1); return (buf); } ACPI_STATUS acpi_SetInteger(ACPI_HANDLE handle, char *path, UINT32 number) { ACPI_OBJECT arg1; ACPI_OBJECT_LIST args; arg1.Type = ACPI_TYPE_INTEGER; arg1.Integer.Value = number; args.Count = 1; args.Pointer = &arg1; return (AcpiEvaluateObject(handle, path, &args, NULL)); } /* * Evaluate a path that should return an integer. */ ACPI_STATUS acpi_GetInteger(ACPI_HANDLE handle, char *path, UINT32 *number) { ACPI_STATUS status; ACPI_BUFFER buf; ACPI_OBJECT param; if (handle == NULL) handle = ACPI_ROOT_OBJECT; /* * Assume that what we've been pointed at is an Integer object, or * a method that will return an Integer. */ buf.Pointer = ¶m; buf.Length = sizeof(param); status = AcpiEvaluateObject(handle, path, NULL, &buf); if (ACPI_SUCCESS(status)) { if (param.Type == ACPI_TYPE_INTEGER) *number = param.Integer.Value; else status = AE_TYPE; } /* * In some applications, a method that's expected to return an Integer * may instead return a Buffer (probably to simplify some internal * arithmetic). We'll try to fetch whatever it is, and if it's a Buffer, * convert it into an Integer as best we can. * * This is a hack. */ if (status == AE_BUFFER_OVERFLOW) { if ((buf.Pointer = AcpiOsAllocate(buf.Length)) == NULL) { status = AE_NO_MEMORY; } else { status = AcpiEvaluateObject(handle, path, NULL, &buf); if (ACPI_SUCCESS(status)) status = acpi_ConvertBufferToInteger(&buf, number); AcpiOsFree(buf.Pointer); } } return (status); } ACPI_STATUS acpi_ConvertBufferToInteger(ACPI_BUFFER *bufp, UINT32 *number) { ACPI_OBJECT *p; UINT8 *val; int i; p = (ACPI_OBJECT *)bufp->Pointer; if (p->Type == ACPI_TYPE_INTEGER) { *number = p->Integer.Value; return (AE_OK); } if (p->Type != ACPI_TYPE_BUFFER) return (AE_TYPE); if (p->Buffer.Length > sizeof(int)) return (AE_BAD_DATA); *number = 0; val = p->Buffer.Pointer; for (i = 0; i < p->Buffer.Length; i++) *number += val[i] << (i * 8); return (AE_OK); } /* * Iterate over the elements of an a package object, calling the supplied * function for each element. * * XXX possible enhancement might be to abort traversal on error. */ ACPI_STATUS acpi_ForeachPackageObject(ACPI_OBJECT *pkg, void (*func)(ACPI_OBJECT *comp, void *arg), void *arg) { ACPI_OBJECT *comp; int i; if (pkg == NULL || pkg->Type != ACPI_TYPE_PACKAGE) return (AE_BAD_PARAMETER); /* Iterate over components */ i = 0; comp = pkg->Package.Elements; for (; i < pkg->Package.Count; i++, comp++) func(comp, arg); return (AE_OK); } /* * Find the (index)th resource object in a set. */ ACPI_STATUS acpi_FindIndexedResource(ACPI_BUFFER *buf, int index, ACPI_RESOURCE **resp) { ACPI_RESOURCE *rp; int i; rp = (ACPI_RESOURCE *)buf->Pointer; i = index; while (i-- > 0) { /* Range check */ if (rp > (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length)) return (AE_BAD_PARAMETER); /* Check for terminator */ if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0) return (AE_NOT_FOUND); rp = ACPI_NEXT_RESOURCE(rp); } if (resp != NULL) *resp = rp; return (AE_OK); } /* * Append an ACPI_RESOURCE to an ACPI_BUFFER. * * Given a pointer to an ACPI_RESOURCE structure, expand the ACPI_BUFFER * provided to contain it. If the ACPI_BUFFER is empty, allocate a sensible * backing block. If the ACPI_RESOURCE is NULL, return an empty set of * resources. */ #define ACPI_INITIAL_RESOURCE_BUFFER_SIZE 512 ACPI_STATUS acpi_AppendBufferResource(ACPI_BUFFER *buf, ACPI_RESOURCE *res) { ACPI_RESOURCE *rp; void *newp; /* Initialise the buffer if necessary. */ if (buf->Pointer == NULL) { buf->Length = ACPI_INITIAL_RESOURCE_BUFFER_SIZE; if ((buf->Pointer = AcpiOsAllocate(buf->Length)) == NULL) return (AE_NO_MEMORY); rp = (ACPI_RESOURCE *)buf->Pointer; rp->Type = ACPI_RESOURCE_TYPE_END_TAG; rp->Length = ACPI_RS_SIZE_MIN; } if (res == NULL) return (AE_OK); /* * Scan the current buffer looking for the terminator. * This will either find the terminator or hit the end * of the buffer and return an error. */ rp = (ACPI_RESOURCE *)buf->Pointer; for (;;) { /* Range check, don't go outside the buffer */ if (rp >= (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length)) return (AE_BAD_PARAMETER); if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0) break; rp = ACPI_NEXT_RESOURCE(rp); } /* * Check the size of the buffer and expand if required. * * Required size is: * size of existing resources before terminator + * size of new resource and header + * size of terminator. * * Note that this loop should really only run once, unless * for some reason we are stuffing a *really* huge resource. */ while ((((u_int8_t *)rp - (u_int8_t *)buf->Pointer) + res->Length + ACPI_RS_SIZE_NO_DATA + ACPI_RS_SIZE_MIN) >= buf->Length) { if ((newp = AcpiOsAllocate(buf->Length * 2)) == NULL) return (AE_NO_MEMORY); bcopy(buf->Pointer, newp, buf->Length); rp = (ACPI_RESOURCE *)((u_int8_t *)newp + ((u_int8_t *)rp - (u_int8_t *)buf->Pointer)); AcpiOsFree(buf->Pointer); buf->Pointer = newp; buf->Length += buf->Length; } /* Insert the new resource. */ bcopy(res, rp, res->Length + ACPI_RS_SIZE_NO_DATA); /* And add the terminator. */ rp = ACPI_NEXT_RESOURCE(rp); rp->Type = ACPI_RESOURCE_TYPE_END_TAG; rp->Length = ACPI_RS_SIZE_MIN; return (AE_OK); } ACPI_STATUS acpi_EvaluateOSC(ACPI_HANDLE handle, uint8_t *uuid, int revision, int count, uint32_t *caps_in, uint32_t *caps_out, bool query) { ACPI_OBJECT arg[4], *ret; ACPI_OBJECT_LIST arglist; ACPI_BUFFER buf; ACPI_STATUS status; arglist.Pointer = arg; arglist.Count = 4; arg[0].Type = ACPI_TYPE_BUFFER; arg[0].Buffer.Length = ACPI_UUID_LENGTH; arg[0].Buffer.Pointer = uuid; arg[1].Type = ACPI_TYPE_INTEGER; arg[1].Integer.Value = revision; arg[2].Type = ACPI_TYPE_INTEGER; arg[2].Integer.Value = count; arg[3].Type = ACPI_TYPE_BUFFER; arg[3].Buffer.Length = count * sizeof(*caps_in); arg[3].Buffer.Pointer = (uint8_t *)caps_in; caps_in[0] = query ? 1 : 0; buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObjectTyped(handle, "_OSC", &arglist, &buf, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) return (status); if (caps_out != NULL) { ret = buf.Pointer; if (ret->Buffer.Length != count * sizeof(*caps_out)) { AcpiOsFree(buf.Pointer); return (AE_BUFFER_OVERFLOW); } bcopy(ret->Buffer.Pointer, caps_out, ret->Buffer.Length); } AcpiOsFree(buf.Pointer); return (status); } /* * Set interrupt model. */ ACPI_STATUS acpi_SetIntrModel(int model) { return (acpi_SetInteger(ACPI_ROOT_OBJECT, "_PIC", model)); } /* * Walk subtables of a table and call a callback routine for each * subtable. The caller should provide the first subtable and a * pointer to the end of the table. This can be used to walk tables * such as MADT and SRAT that use subtable entries. */ void acpi_walk_subtables(void *first, void *end, acpi_subtable_handler *handler, void *arg) { ACPI_SUBTABLE_HEADER *entry; for (entry = first; (void *)entry < end; ) { /* Avoid an infinite loop if we hit a bogus entry. */ if (entry->Length < sizeof(ACPI_SUBTABLE_HEADER)) return; handler(entry, arg); entry = ACPI_ADD_PTR(ACPI_SUBTABLE_HEADER, entry, entry->Length); } } /* * DEPRECATED. This interface has serious deficiencies and will be * removed. * * Immediately enter the sleep state. In the old model, acpiconf(8) ran * rc.suspend and rc.resume so we don't have to notify devd(8) to do this. */ ACPI_STATUS acpi_SetSleepState(struct acpi_softc *sc, int state) { static int once; if (!once) { device_printf(sc->acpi_dev, "warning: acpi_SetSleepState() deprecated, need to update your software\n"); once = 1; } return (acpi_EnterSleepState(sc, state)); } #if defined(__amd64__) || defined(__i386__) static void acpi_sleep_force_task(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate))) device_printf(sc->acpi_dev, "force sleep state S%d failed\n", sc->acpi_next_sstate); } static void acpi_sleep_force(void *arg) { struct acpi_softc *sc = (struct acpi_softc *)arg; device_printf(sc->acpi_dev, "suspend request timed out, forcing sleep now\n"); /* * XXX Suspending from callout causes freezes in DEVICE_SUSPEND(). * Suspend from acpi_task thread instead. */ if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_sleep_force_task, sc))) device_printf(sc->acpi_dev, "AcpiOsExecute() for sleeping failed\n"); } #endif /* * Request that the system enter the given suspend state. All /dev/apm * devices and devd(8) will be notified. Userland then has a chance to * save state and acknowledge the request. The system sleeps once all * acks are in. */ int acpi_ReqSleepState(struct acpi_softc *sc, int state) { #if defined(__amd64__) || defined(__i386__) struct apm_clone_data *clone; ACPI_STATUS status; if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX) return (EINVAL); if (!acpi_sleep_states[state]) return (EOPNOTSUPP); /* * If a reboot/shutdown/suspend request is already in progress or * suspend is blocked due to an upcoming shutdown, just return. */ if (rebooting || sc->acpi_next_sstate != 0 || suspend_blocked) { return (0); } /* Wait until sleep is enabled. */ while (sc->acpi_sleep_disabled) { AcpiOsSleep(1000); } ACPI_LOCK(acpi); sc->acpi_next_sstate = state; /* S5 (soft-off) should be entered directly with no waiting. */ if (state == ACPI_STATE_S5) { ACPI_UNLOCK(acpi); status = acpi_EnterSleepState(sc, state); return (ACPI_SUCCESS(status) ? 0 : ENXIO); } /* Record the pending state and notify all apm devices. */ STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) { clone->notify_status = APM_EV_NONE; if ((clone->flags & ACPI_EVF_DEVD) == 0) { selwakeuppri(&clone->sel_read, PZERO); KNOTE_LOCKED(&clone->sel_read.si_note, 0); } } /* If devd(8) is not running, immediately enter the sleep state. */ if (!devctl_process_running()) { ACPI_UNLOCK(acpi); status = acpi_EnterSleepState(sc, state); return (ACPI_SUCCESS(status) ? 0 : ENXIO); } /* * Set a timeout to fire if userland doesn't ack the suspend request * in time. This way we still eventually go to sleep if we were * overheating or running low on battery, even if userland is hung. * We cancel this timeout once all userland acks are in or the * suspend request is aborted. */ callout_reset(&sc->susp_force_to, 10 * hz, acpi_sleep_force, sc); ACPI_UNLOCK(acpi); /* Now notify devd(8) also. */ acpi_UserNotify("Suspend", ACPI_ROOT_OBJECT, state); return (0); #else /* This platform does not support acpi suspend/resume. */ return (EOPNOTSUPP); #endif } /* * Acknowledge (or reject) a pending sleep state. The caller has * prepared for suspend and is now ready for it to proceed. If the * error argument is non-zero, it indicates suspend should be cancelled * and gives an errno value describing why. Once all votes are in, * we suspend the system. */ int acpi_AckSleepState(struct apm_clone_data *clone, int error) { #if defined(__amd64__) || defined(__i386__) struct acpi_softc *sc; int ret, sleeping; /* If no pending sleep state, return an error. */ ACPI_LOCK(acpi); sc = clone->acpi_sc; if (sc->acpi_next_sstate == 0) { ACPI_UNLOCK(acpi); return (ENXIO); } /* Caller wants to abort suspend process. */ if (error) { sc->acpi_next_sstate = 0; callout_stop(&sc->susp_force_to); device_printf(sc->acpi_dev, "listener on %s cancelled the pending suspend\n", devtoname(clone->cdev)); ACPI_UNLOCK(acpi); return (0); } /* * Mark this device as acking the suspend request. Then, walk through * all devices, seeing if they agree yet. We only count devices that * are writable since read-only devices couldn't ack the request. */ sleeping = TRUE; clone->notify_status = APM_EV_ACKED; STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) { if ((clone->flags & ACPI_EVF_WRITE) != 0 && clone->notify_status != APM_EV_ACKED) { sleeping = FALSE; break; } } /* If all devices have voted "yes", we will suspend now. */ if (sleeping) callout_stop(&sc->susp_force_to); ACPI_UNLOCK(acpi); ret = 0; if (sleeping) { if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate))) ret = ENODEV; } return (ret); #else /* This platform does not support acpi suspend/resume. */ return (EOPNOTSUPP); #endif } static void acpi_sleep_enable(void *arg) { struct acpi_softc *sc = (struct acpi_softc *)arg; ACPI_LOCK_ASSERT(acpi); /* Reschedule if the system is not fully up and running. */ if (!AcpiGbl_SystemAwakeAndRunning) { callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME); return; } sc->acpi_sleep_disabled = FALSE; } static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc) { ACPI_STATUS status; /* Fail if the system is not fully up and running. */ if (!AcpiGbl_SystemAwakeAndRunning) return (AE_ERROR); ACPI_LOCK(acpi); status = sc->acpi_sleep_disabled ? AE_ERROR : AE_OK; sc->acpi_sleep_disabled = TRUE; ACPI_UNLOCK(acpi); return (status); } enum acpi_sleep_state { ACPI_SS_NONE, ACPI_SS_GPE_SET, ACPI_SS_DEV_SUSPEND, ACPI_SS_SLP_PREP, ACPI_SS_SLEPT, }; /* * Enter the desired system sleep state. * * Currently we support S1-S5 but S4 is only S4BIOS */ static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state) { register_t intr; ACPI_STATUS status; ACPI_EVENT_STATUS power_button_status; enum acpi_sleep_state slp_state; int sleep_result; ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX) return_ACPI_STATUS (AE_BAD_PARAMETER); if (!acpi_sleep_states[state]) { device_printf(sc->acpi_dev, "Sleep state S%d not supported by BIOS\n", state); return (AE_SUPPORT); } /* Re-entry once we're suspending is not allowed. */ status = acpi_sleep_disable(sc); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "suspend request ignored (not ready yet)\n"); return (status); } if (state == ACPI_STATE_S5) { /* * Shut down cleanly and power off. This will call us back through the * shutdown handlers. */ shutdown_nice(RB_POWEROFF); return_ACPI_STATUS (AE_OK); } EVENTHANDLER_INVOKE(power_suspend_early); stop_all_proc(); EVENTHANDLER_INVOKE(power_suspend); #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); #else if (smp_started) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); } #endif /* * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE * drivers need this. */ mtx_lock(&Giant); slp_state = ACPI_SS_NONE; sc->acpi_sstate = state; /* Enable any GPEs as appropriate and requested by the user. */ acpi_wake_prep_walk(state); slp_state = ACPI_SS_GPE_SET; /* * Inform all devices that we are going to sleep. If at least one * device fails, DEVICE_SUSPEND() automatically resumes the tree. * * XXX Note that a better two-pass approach with a 'veto' pass * followed by a "real thing" pass would be better, but the current * bus interface does not provide for this. */ if (DEVICE_SUSPEND(root_bus) != 0) { device_printf(sc->acpi_dev, "device_suspend failed\n"); goto backout; } slp_state = ACPI_SS_DEV_SUSPEND; status = AcpiEnterSleepStatePrep(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n", AcpiFormatException(status)); goto backout; } slp_state = ACPI_SS_SLP_PREP; if (sc->acpi_sleep_delay > 0) DELAY(sc->acpi_sleep_delay * 1000000); suspendclock(); intr = intr_disable(); if (state != ACPI_STATE_S1) { sleep_result = acpi_sleep_machdep(sc, state); acpi_wakeup_machdep(sc, state, sleep_result, 0); /* * XXX According to ACPI specification SCI_EN bit should be restored * by ACPI platform (BIOS, firmware) to its pre-sleep state. * Unfortunately some BIOSes fail to do that and that leads to * unexpected and serious consequences during wake up like a system * getting stuck in SMI handlers. * This hack is picked up from Linux, which claims that it follows * Windows behavior. */ if (sleep_result == 1 && state != ACPI_STATE_S4) AcpiWriteBitRegister(ACPI_BITREG_SCI_ENABLE, ACPI_ENABLE_EVENT); if (sleep_result == 1 && state == ACPI_STATE_S3) { /* * Prevent mis-interpretation of the wakeup by power button * as a request for power off. * Ideally we should post an appropriate wakeup event, * perhaps using acpi_event_power_button_wake or alike. * * Clearing of power button status after wakeup is mandated * by ACPI specification in section "Fixed Power Button". * * XXX As of ACPICA 20121114 AcpiGetEventStatus provides * status as 0/1 corressponding to inactive/active despite * its type being ACPI_EVENT_STATUS. In other words, * we should not test for ACPI_EVENT_FLAG_SET for time being. */ if (ACPI_SUCCESS(AcpiGetEventStatus(ACPI_EVENT_POWER_BUTTON, &power_button_status)) && power_button_status != 0) { AcpiClearEvent(ACPI_EVENT_POWER_BUTTON); device_printf(sc->acpi_dev, "cleared fixed power button status\n"); } } intr_restore(intr); /* call acpi_wakeup_machdep() again with interrupt enabled */ acpi_wakeup_machdep(sc, state, sleep_result, 1); AcpiLeaveSleepStatePrep(state); if (sleep_result == -1) goto backout; /* Re-enable ACPI hardware on wakeup from sleep state 4. */ if (state == ACPI_STATE_S4) AcpiEnable(); } else { status = AcpiEnterSleepState(state); intr_restore(intr); AcpiLeaveSleepStatePrep(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); goto backout; } } slp_state = ACPI_SS_SLEPT; /* * Back out state according to how far along we got in the suspend * process. This handles both the error and success cases. */ backout: if (slp_state >= ACPI_SS_SLP_PREP) resumeclock(); if (slp_state >= ACPI_SS_GPE_SET) { acpi_wake_prep_walk(state); sc->acpi_sstate = ACPI_STATE_S0; } if (slp_state >= ACPI_SS_DEV_SUSPEND) DEVICE_RESUME(root_bus); if (slp_state >= ACPI_SS_SLP_PREP) AcpiLeaveSleepState(state); if (slp_state >= ACPI_SS_SLEPT) { #if defined(__i386__) || defined(__amd64__) /* NB: we are still using ACPI timecounter at this point. */ resume_TSC(); #endif acpi_resync_clock(sc); acpi_enable_fixed_events(sc); } sc->acpi_next_sstate = 0; mtx_unlock(&Giant); #ifdef EARLY_AP_STARTUP thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #else if (smp_started) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif resume_all_proc(); EVENTHANDLER_INVOKE(power_resume); /* Allow another sleep request after a while. */ callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME); /* Run /etc/rc.resume after we are back. */ if (devctl_process_running()) acpi_UserNotify("Resume", ACPI_ROOT_OBJECT, state); return_ACPI_STATUS (status); } static void acpi_resync_clock(struct acpi_softc *sc) { #ifdef __amd64__ if (!acpi_reset_clock) return; /* * Warm up timecounter again and reset system clock. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); inittodr(time_second + sc->acpi_sleep_delay); #endif } /* Enable or disable the device's wake GPE. */ int acpi_wake_set_enable(device_t dev, int enable) { struct acpi_prw_data prw; ACPI_STATUS status; int flags; /* Make sure the device supports waking the system and get the GPE. */ if (acpi_parse_prw(acpi_get_handle(dev), &prw) != 0) return (ENXIO); flags = acpi_get_flags(dev); if (enable) { status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE); if (ACPI_FAILURE(status)) { device_printf(dev, "enable wake failed\n"); return (ENXIO); } acpi_set_flags(dev, flags | ACPI_FLAG_WAKE_ENABLED); } else { status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE); if (ACPI_FAILURE(status)) { device_printf(dev, "disable wake failed\n"); return (ENXIO); } acpi_set_flags(dev, flags & ~ACPI_FLAG_WAKE_ENABLED); } return (0); } static int acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate) { struct acpi_prw_data prw; device_t dev; /* Check that this is a wake-capable device and get its GPE. */ if (acpi_parse_prw(handle, &prw) != 0) return (ENXIO); dev = acpi_get_device(handle); /* * The destination sleep state must be less than (i.e., higher power) * or equal to the value specified by _PRW. If this GPE cannot be * enabled for the next sleep state, then disable it. If it can and * the user requested it be enabled, turn on any required power resources * and set _PSW. */ if (sstate > prw.lowest_wake) { AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE); if (bootverbose) device_printf(dev, "wake_prep disabled wake for %s (S%d)\n", acpi_name(handle), sstate); } else if (dev && (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) != 0) { acpi_pwr_wake_enable(handle, 1); acpi_SetInteger(handle, "_PSW", 1); if (bootverbose) device_printf(dev, "wake_prep enabled for %s (S%d)\n", acpi_name(handle), sstate); } return (0); } static int acpi_wake_run_prep(ACPI_HANDLE handle, int sstate) { struct acpi_prw_data prw; device_t dev; /* * Check that this is a wake-capable device and get its GPE. Return * now if the user didn't enable this device for wake. */ if (acpi_parse_prw(handle, &prw) != 0) return (ENXIO); dev = acpi_get_device(handle); if (dev == NULL || (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) == 0) return (0); /* * If this GPE couldn't be enabled for the previous sleep state, it was * disabled before going to sleep so re-enable it. If it was enabled, * clear _PSW and turn off any power resources it used. */ if (sstate > prw.lowest_wake) { AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE); if (bootverbose) device_printf(dev, "run_prep re-enabled %s\n", acpi_name(handle)); } else { acpi_SetInteger(handle, "_PSW", 0); acpi_pwr_wake_enable(handle, 0); if (bootverbose) device_printf(dev, "run_prep cleaned up for %s\n", acpi_name(handle)); } return (0); } static ACPI_STATUS acpi_wake_prep(ACPI_HANDLE handle, UINT32 level, void *context, void **status) { int sstate; /* If suspending, run the sleep prep function, otherwise wake. */ sstate = *(int *)context; if (AcpiGbl_SystemAwakeAndRunning) acpi_wake_sleep_prep(handle, sstate); else acpi_wake_run_prep(handle, sstate); return (AE_OK); } /* Walk the tree rooted at acpi0 to prep devices for suspend/resume. */ static int acpi_wake_prep_walk(int sstate) { ACPI_HANDLE sb_handle; if (ACPI_SUCCESS(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle))) AcpiWalkNamespace(ACPI_TYPE_DEVICE, sb_handle, 100, acpi_wake_prep, NULL, &sstate, NULL); return (0); } /* Walk the tree rooted at acpi0 to attach per-device wake sysctls. */ static int acpi_wake_sysctl_walk(device_t dev) { int error, i, numdevs; device_t *devlist; device_t child; ACPI_STATUS status; error = device_get_children(dev, &devlist, &numdevs); if (error != 0 || numdevs == 0) { if (numdevs == 0) free(devlist, M_TEMP); return (error); } for (i = 0; i < numdevs; i++) { child = devlist[i]; acpi_wake_sysctl_walk(child); if (!device_is_attached(child)) continue; status = AcpiEvaluateObject(acpi_get_handle(child), "_PRW", NULL, NULL); if (ACPI_SUCCESS(status)) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(child), SYSCTL_CHILDREN(device_get_sysctl_tree(child)), OID_AUTO, "wake", CTLTYPE_INT | CTLFLAG_RW, child, 0, acpi_wake_set_sysctl, "I", "Device set to wake the system"); } } free(devlist, M_TEMP); return (0); } /* Enable or disable wake from userland. */ static int acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS) { int enable, error; device_t dev; dev = (device_t)arg1; enable = (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) ? 1 : 0; error = sysctl_handle_int(oidp, &enable, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (enable != 0 && enable != 1) return (EINVAL); return (acpi_wake_set_enable(dev, enable)); } /* Parse a device's _PRW into a structure. */ int acpi_parse_prw(ACPI_HANDLE h, struct acpi_prw_data *prw) { ACPI_STATUS status; ACPI_BUFFER prw_buffer; ACPI_OBJECT *res, *res2; int error, i, power_count; if (h == NULL || prw == NULL) return (EINVAL); /* * The _PRW object (7.2.9) is only required for devices that have the * ability to wake the system from a sleeping state. */ error = EINVAL; prw_buffer.Pointer = NULL; prw_buffer.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(h, "_PRW", NULL, &prw_buffer); if (ACPI_FAILURE(status)) return (ENOENT); res = (ACPI_OBJECT *)prw_buffer.Pointer; if (res == NULL) return (ENOENT); if (!ACPI_PKG_VALID(res, 2)) goto out; /* * Element 1 of the _PRW object: * The lowest power system sleeping state that can be entered while still * providing wake functionality. The sleeping state being entered must * be less than (i.e., higher power) or equal to this value. */ if (acpi_PkgInt32(res, 1, &prw->lowest_wake) != 0) goto out; /* * Element 0 of the _PRW object: */ switch (res->Package.Elements[0].Type) { case ACPI_TYPE_INTEGER: /* * If the data type of this package element is numeric, then this * _PRW package element is the bit index in the GPEx_EN, in the * GPE blocks described in the FADT, of the enable bit that is * enabled for the wake event. */ prw->gpe_handle = NULL; prw->gpe_bit = res->Package.Elements[0].Integer.Value; error = 0; break; case ACPI_TYPE_PACKAGE: /* * If the data type of this package element is a package, then this * _PRW package element is itself a package containing two * elements. The first is an object reference to the GPE Block * device that contains the GPE that will be triggered by the wake * event. The second element is numeric and it contains the bit * index in the GPEx_EN, in the GPE Block referenced by the * first element in the package, of the enable bit that is enabled for * the wake event. * * For example, if this field is a package then it is of the form: * Package() {\_SB.PCI0.ISA.GPE, 2} */ res2 = &res->Package.Elements[0]; if (!ACPI_PKG_VALID(res2, 2)) goto out; prw->gpe_handle = acpi_GetReference(NULL, &res2->Package.Elements[0]); if (prw->gpe_handle == NULL) goto out; if (acpi_PkgInt32(res2, 1, &prw->gpe_bit) != 0) goto out; error = 0; break; default: goto out; } /* Elements 2 to N of the _PRW object are power resources. */ power_count = res->Package.Count - 2; if (power_count > ACPI_PRW_MAX_POWERRES) { printf("ACPI device %s has too many power resources\n", acpi_name(h)); power_count = 0; } prw->power_res_count = power_count; for (i = 0; i < power_count; i++) prw->power_res[i] = res->Package.Elements[i]; out: if (prw_buffer.Pointer != NULL) AcpiOsFree(prw_buffer.Pointer); return (error); } /* * ACPI Event Handlers */ /* System Event Handlers (registered by EVENTHANDLER_REGISTER) */ static void acpi_system_eventhandler_sleep(void *arg, int state) { struct acpi_softc *sc = (struct acpi_softc *)arg; int ret; ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); /* Check if button action is disabled or unknown. */ if (state == ACPI_STATE_UNKNOWN) return; /* Request that the system prepare to enter the given suspend state. */ ret = acpi_ReqSleepState(sc, state); if (ret != 0) device_printf(sc->acpi_dev, "request to enter state S%d failed (err %d)\n", state, ret); return_VOID; } static void acpi_system_eventhandler_wakeup(void *arg, int state) { ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); /* Currently, nothing to do for wakeup. */ return_VOID; } /* * ACPICA Event Handlers (FixedEvent, also called from button notify handler) */ static void acpi_invoke_sleep_eventhandler(void *context) { EVENTHANDLER_INVOKE(acpi_sleep_event, *(int *)context); } static void acpi_invoke_wake_eventhandler(void *context) { EVENTHANDLER_INVOKE(acpi_wakeup_event, *(int *)context); } UINT32 acpi_event_power_button_sleep(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_sleep_eventhandler, &sc->acpi_power_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_power_button_wake(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_wake_eventhandler, &sc->acpi_power_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_sleep_button_sleep(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_sleep_eventhandler, &sc->acpi_sleep_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_sleep_button_wake(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_wake_eventhandler, &sc->acpi_sleep_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } /* * XXX This static buffer is suboptimal. There is no locking so only * use this for single-threaded callers. */ char * acpi_name(ACPI_HANDLE handle) { ACPI_BUFFER buf; static char data[256]; buf.Length = sizeof(data); buf.Pointer = data; if (handle && ACPI_SUCCESS(AcpiGetName(handle, ACPI_FULL_PATHNAME, &buf))) return (data); return ("(unknown)"); } /* * Debugging/bug-avoidance. Avoid trying to fetch info on various * parts of the namespace. */ int acpi_avoid(ACPI_HANDLE handle) { char *cp, *env, *np; int len; np = acpi_name(handle); if (*np == '\\') np++; if ((env = kern_getenv("debug.acpi.avoid")) == NULL) return (0); /* Scan the avoid list checking for a match */ cp = env; for (;;) { while (*cp != 0 && isspace(*cp)) cp++; if (*cp == 0) break; len = 0; while (cp[len] != 0 && !isspace(cp[len])) len++; if (!strncmp(cp, np, len)) { freeenv(env); return(1); } cp += len; } freeenv(env); return (0); } /* * Debugging/bug-avoidance. Disable ACPI subsystem components. */ int acpi_disabled(char *subsys) { char *cp, *env; int len; if ((env = kern_getenv("debug.acpi.disabled")) == NULL) return (0); if (strcmp(env, "all") == 0) { freeenv(env); return (1); } /* Scan the disable list, checking for a match. */ cp = env; for (;;) { while (*cp != '\0' && isspace(*cp)) cp++; if (*cp == '\0') break; len = 0; while (cp[len] != '\0' && !isspace(cp[len])) len++; if (strncmp(cp, subsys, len) == 0) { freeenv(env); return (1); } cp += len; } freeenv(env); return (0); } static void acpi_lookup(void *arg, const char *name, device_t *dev) { ACPI_HANDLE handle; if (*dev != NULL) return; /* * Allow any handle name that is specified as an absolute path and * starts with '\'. We could restrict this to \_SB and friends, * but see acpi_probe_children() for notes on why we scan the entire * namespace for devices. * * XXX: The pathname argument to AcpiGetHandle() should be fixed to * be const. */ if (name[0] != '\\') return; if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, __DECONST(char *, name), &handle))) return; *dev = acpi_get_device(handle); } /* * Control interface. * * We multiplex ioctls for all participating ACPI devices here. Individual * drivers wanting to be accessible via /dev/acpi should use the * register/deregister interface to make their handlers visible. */ struct acpi_ioctl_hook { TAILQ_ENTRY(acpi_ioctl_hook) link; u_long cmd; acpi_ioctl_fn fn; void *arg; }; static TAILQ_HEAD(,acpi_ioctl_hook) acpi_ioctl_hooks; static int acpi_ioctl_hooks_initted; int acpi_register_ioctl(u_long cmd, acpi_ioctl_fn fn, void *arg) { struct acpi_ioctl_hook *hp; if ((hp = malloc(sizeof(*hp), M_ACPIDEV, M_NOWAIT)) == NULL) return (ENOMEM); hp->cmd = cmd; hp->fn = fn; hp->arg = arg; ACPI_LOCK(acpi); if (acpi_ioctl_hooks_initted == 0) { TAILQ_INIT(&acpi_ioctl_hooks); acpi_ioctl_hooks_initted = 1; } TAILQ_INSERT_TAIL(&acpi_ioctl_hooks, hp, link); ACPI_UNLOCK(acpi); return (0); } void acpi_deregister_ioctl(u_long cmd, acpi_ioctl_fn fn) { struct acpi_ioctl_hook *hp; ACPI_LOCK(acpi); TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) if (hp->cmd == cmd && hp->fn == fn) break; if (hp != NULL) { TAILQ_REMOVE(&acpi_ioctl_hooks, hp, link); free(hp, M_ACPIDEV); } ACPI_UNLOCK(acpi); } static int acpiopen(struct cdev *dev, int flag, int fmt, struct thread *td) { return (0); } static int acpiclose(struct cdev *dev, int flag, int fmt, struct thread *td) { return (0); } static int acpiioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct acpi_softc *sc; struct acpi_ioctl_hook *hp; int error, state; error = 0; hp = NULL; sc = dev->si_drv1; /* * Scan the list of registered ioctls, looking for handlers. */ ACPI_LOCK(acpi); if (acpi_ioctl_hooks_initted) TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) { if (hp->cmd == cmd) break; } ACPI_UNLOCK(acpi); if (hp) return (hp->fn(cmd, addr, hp->arg)); /* * Core ioctls are not permitted for non-writable user. * Currently, other ioctls just fetch information. * Not changing system behavior. */ if ((flag & FWRITE) == 0) return (EPERM); /* Core system ioctls. */ switch (cmd) { case ACPIIO_REQSLPSTATE: state = *(int *)addr; if (state != ACPI_STATE_S5) return (acpi_ReqSleepState(sc, state)); device_printf(sc->acpi_dev, "power off via acpi ioctl not supported\n"); error = EOPNOTSUPP; break; case ACPIIO_ACKSLPSTATE: error = *(int *)addr; error = acpi_AckSleepState(sc->acpi_clone, error); break; case ACPIIO_SETSLPSTATE: /* DEPRECATED */ state = *(int *)addr; if (state < ACPI_STATE_S0 || state > ACPI_S_STATES_MAX) return (EINVAL); if (!acpi_sleep_states[state]) return (EOPNOTSUPP); if (ACPI_FAILURE(acpi_SetSleepState(sc, state))) error = ENXIO; break; default: error = ENXIO; break; } return (error); } static int acpi_sname2sstate(const char *sname) { int sstate; if (toupper(sname[0]) == 'S') { sstate = sname[1] - '0'; if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5 && sname[2] == '\0') return (sstate); } else if (strcasecmp(sname, "NONE") == 0) return (ACPI_STATE_UNKNOWN); return (-1); } static const char * acpi_sstate2sname(int sstate) { static const char *snames[] = { "S0", "S1", "S2", "S3", "S4", "S5" }; if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5) return (snames[sstate]); else if (sstate == ACPI_STATE_UNKNOWN) return ("NONE"); return (NULL); } static int acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS) { int error; struct sbuf sb; UINT8 state; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++) if (acpi_sleep_states[state]) sbuf_printf(&sb, "%s ", acpi_sstate2sname(state)); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (error); } static int acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS) { char sleep_state[10]; int error, new_state, old_state; old_state = *(int *)oidp->oid_arg1; strlcpy(sleep_state, acpi_sstate2sname(old_state), sizeof(sleep_state)); error = sysctl_handle_string(oidp, sleep_state, sizeof(sleep_state), req); if (error == 0 && req->newptr != NULL) { new_state = acpi_sname2sstate(sleep_state); if (new_state < ACPI_STATE_S1) return (EINVAL); if (new_state < ACPI_S_STATE_COUNT && !acpi_sleep_states[new_state]) return (EOPNOTSUPP); if (new_state != old_state) *(int *)oidp->oid_arg1 = new_state; } return (error); } /* Inform devctl(4) when we receive a Notify. */ void acpi_UserNotify(const char *subsystem, ACPI_HANDLE h, uint8_t notify) { char notify_buf[16]; ACPI_BUFFER handle_buf; ACPI_STATUS status; if (subsystem == NULL) return; handle_buf.Pointer = NULL; handle_buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiNsHandleToPathname(h, &handle_buf, FALSE); if (ACPI_FAILURE(status)) return; snprintf(notify_buf, sizeof(notify_buf), "notify=0x%02x", notify); devctl_notify("ACPI", subsystem, handle_buf.Pointer, notify_buf); AcpiOsFree(handle_buf.Pointer); } #ifdef ACPI_DEBUG /* * Support for parsing debug options from the kernel environment. * * Bits may be set in the AcpiDbgLayer and AcpiDbgLevel debug registers * by specifying the names of the bits in the debug.acpi.layer and * debug.acpi.level environment variables. Bits may be unset by * prefixing the bit name with !. */ struct debugtag { char *name; UINT32 value; }; static struct debugtag dbg_layer[] = { {"ACPI_UTILITIES", ACPI_UTILITIES}, {"ACPI_HARDWARE", ACPI_HARDWARE}, {"ACPI_EVENTS", ACPI_EVENTS}, {"ACPI_TABLES", ACPI_TABLES}, {"ACPI_NAMESPACE", ACPI_NAMESPACE}, {"ACPI_PARSER", ACPI_PARSER}, {"ACPI_DISPATCHER", ACPI_DISPATCHER}, {"ACPI_EXECUTER", ACPI_EXECUTER}, {"ACPI_RESOURCES", ACPI_RESOURCES}, {"ACPI_CA_DEBUGGER", ACPI_CA_DEBUGGER}, {"ACPI_OS_SERVICES", ACPI_OS_SERVICES}, {"ACPI_CA_DISASSEMBLER", ACPI_CA_DISASSEMBLER}, {"ACPI_ALL_COMPONENTS", ACPI_ALL_COMPONENTS}, {"ACPI_AC_ADAPTER", ACPI_AC_ADAPTER}, {"ACPI_BATTERY", ACPI_BATTERY}, {"ACPI_BUS", ACPI_BUS}, {"ACPI_BUTTON", ACPI_BUTTON}, {"ACPI_EC", ACPI_EC}, {"ACPI_FAN", ACPI_FAN}, {"ACPI_POWERRES", ACPI_POWERRES}, {"ACPI_PROCESSOR", ACPI_PROCESSOR}, {"ACPI_THERMAL", ACPI_THERMAL}, {"ACPI_TIMER", ACPI_TIMER}, {"ACPI_ALL_DRIVERS", ACPI_ALL_DRIVERS}, {NULL, 0} }; static struct debugtag dbg_level[] = { {"ACPI_LV_INIT", ACPI_LV_INIT}, {"ACPI_LV_DEBUG_OBJECT", ACPI_LV_DEBUG_OBJECT}, {"ACPI_LV_INFO", ACPI_LV_INFO}, {"ACPI_LV_REPAIR", ACPI_LV_REPAIR}, {"ACPI_LV_ALL_EXCEPTIONS", ACPI_LV_ALL_EXCEPTIONS}, /* Trace verbosity level 1 [Standard Trace Level] */ {"ACPI_LV_INIT_NAMES", ACPI_LV_INIT_NAMES}, {"ACPI_LV_PARSE", ACPI_LV_PARSE}, {"ACPI_LV_LOAD", ACPI_LV_LOAD}, {"ACPI_LV_DISPATCH", ACPI_LV_DISPATCH}, {"ACPI_LV_EXEC", ACPI_LV_EXEC}, {"ACPI_LV_NAMES", ACPI_LV_NAMES}, {"ACPI_LV_OPREGION", ACPI_LV_OPREGION}, {"ACPI_LV_BFIELD", ACPI_LV_BFIELD}, {"ACPI_LV_TABLES", ACPI_LV_TABLES}, {"ACPI_LV_VALUES", ACPI_LV_VALUES}, {"ACPI_LV_OBJECTS", ACPI_LV_OBJECTS}, {"ACPI_LV_RESOURCES", ACPI_LV_RESOURCES}, {"ACPI_LV_USER_REQUESTS", ACPI_LV_USER_REQUESTS}, {"ACPI_LV_PACKAGE", ACPI_LV_PACKAGE}, {"ACPI_LV_VERBOSITY1", ACPI_LV_VERBOSITY1}, /* Trace verbosity level 2 [Function tracing and memory allocation] */ {"ACPI_LV_ALLOCATIONS", ACPI_LV_ALLOCATIONS}, {"ACPI_LV_FUNCTIONS", ACPI_LV_FUNCTIONS}, {"ACPI_LV_OPTIMIZATIONS", ACPI_LV_OPTIMIZATIONS}, {"ACPI_LV_VERBOSITY2", ACPI_LV_VERBOSITY2}, {"ACPI_LV_ALL", ACPI_LV_ALL}, /* Trace verbosity level 3 [Threading, I/O, and Interrupts] */ {"ACPI_LV_MUTEX", ACPI_LV_MUTEX}, {"ACPI_LV_THREADS", ACPI_LV_THREADS}, {"ACPI_LV_IO", ACPI_LV_IO}, {"ACPI_LV_INTERRUPTS", ACPI_LV_INTERRUPTS}, {"ACPI_LV_VERBOSITY3", ACPI_LV_VERBOSITY3}, /* Exceptionally verbose output -- also used in the global "DebugLevel" */ {"ACPI_LV_AML_DISASSEMBLE", ACPI_LV_AML_DISASSEMBLE}, {"ACPI_LV_VERBOSE_INFO", ACPI_LV_VERBOSE_INFO}, {"ACPI_LV_FULL_TABLES", ACPI_LV_FULL_TABLES}, {"ACPI_LV_EVENTS", ACPI_LV_EVENTS}, {"ACPI_LV_VERBOSE", ACPI_LV_VERBOSE}, {NULL, 0} }; static void acpi_parse_debug(char *cp, struct debugtag *tag, UINT32 *flag) { char *ep; int i, l; int set; while (*cp) { if (isspace(*cp)) { cp++; continue; } ep = cp; while (*ep && !isspace(*ep)) ep++; if (*cp == '!') { set = 0; cp++; if (cp == ep) continue; } else { set = 1; } l = ep - cp; for (i = 0; tag[i].name != NULL; i++) { if (!strncmp(cp, tag[i].name, l)) { if (set) *flag |= tag[i].value; else *flag &= ~tag[i].value; } } cp = ep; } } static void acpi_set_debugging(void *junk) { char *layer, *level; if (cold) { AcpiDbgLayer = 0; AcpiDbgLevel = 0; } layer = kern_getenv("debug.acpi.layer"); level = kern_getenv("debug.acpi.level"); if (layer == NULL && level == NULL) return; printf("ACPI set debug"); if (layer != NULL) { if (strcmp("NONE", layer) != 0) printf(" layer '%s'", layer); acpi_parse_debug(layer, &dbg_layer[0], &AcpiDbgLayer); freeenv(layer); } if (level != NULL) { if (strcmp("NONE", level) != 0) printf(" level '%s'", level); acpi_parse_debug(level, &dbg_level[0], &AcpiDbgLevel); freeenv(level); } printf("\n"); } SYSINIT(acpi_debugging, SI_SUB_TUNABLES, SI_ORDER_ANY, acpi_set_debugging, NULL); static int acpi_debug_sysctl(SYSCTL_HANDLER_ARGS) { int error, *dbg; struct debugtag *tag; struct sbuf sb; char temp[128]; if (sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND) == NULL) return (ENOMEM); if (strcmp(oidp->oid_arg1, "debug.acpi.layer") == 0) { tag = &dbg_layer[0]; dbg = &AcpiDbgLayer; } else { tag = &dbg_level[0]; dbg = &AcpiDbgLevel; } /* Get old values if this is a get request. */ ACPI_SERIAL_BEGIN(acpi); if (*dbg == 0) { sbuf_cpy(&sb, "NONE"); } else if (req->newptr == NULL) { for (; tag->name != NULL; tag++) { if ((*dbg & tag->value) == tag->value) sbuf_printf(&sb, "%s ", tag->name); } } sbuf_trim(&sb); sbuf_finish(&sb); strlcpy(temp, sbuf_data(&sb), sizeof(temp)); sbuf_delete(&sb); error = sysctl_handle_string(oidp, temp, sizeof(temp), req); /* Check for error or no change */ if (error == 0 && req->newptr != NULL) { *dbg = 0; kern_setenv((char *)oidp->oid_arg1, temp); acpi_set_debugging(NULL); } ACPI_SERIAL_END(acpi); return (error); } SYSCTL_PROC(_debug_acpi, OID_AUTO, layer, CTLFLAG_RW | CTLTYPE_STRING, "debug.acpi.layer", 0, acpi_debug_sysctl, "A", ""); SYSCTL_PROC(_debug_acpi, OID_AUTO, level, CTLFLAG_RW | CTLTYPE_STRING, "debug.acpi.level", 0, acpi_debug_sysctl, "A", ""); #endif /* ACPI_DEBUG */ static int acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS) { int error; int old; old = acpi_debug_objects; error = sysctl_handle_int(oidp, &acpi_debug_objects, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (old == acpi_debug_objects || (old && acpi_debug_objects)) return (0); ACPI_SERIAL_BEGIN(acpi); AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE; ACPI_SERIAL_END(acpi); return (0); } static int acpi_parse_interfaces(char *str, struct acpi_interface *iface) { char *p; size_t len; int i, j; p = str; while (isspace(*p) || *p == ',') p++; len = strlen(p); if (len == 0) return (0); p = strdup(p, M_TEMP); for (i = 0; i < len; i++) if (p[i] == ',') p[i] = '\0'; i = j = 0; while (i < len) if (isspace(p[i]) || p[i] == '\0') i++; else { i += strlen(p + i) + 1; j++; } if (j == 0) { free(p, M_TEMP); return (0); } iface->data = malloc(sizeof(*iface->data) * j, M_TEMP, M_WAITOK); iface->num = j; i = j = 0; while (i < len) if (isspace(p[i]) || p[i] == '\0') i++; else { iface->data[j] = p + i; i += strlen(p + i) + 1; j++; } return (j); } static void acpi_free_interfaces(struct acpi_interface *iface) { free(iface->data[0], M_TEMP); free(iface->data, M_TEMP); } static void acpi_reset_interfaces(device_t dev) { struct acpi_interface list; ACPI_STATUS status; int i; if (acpi_parse_interfaces(acpi_install_interface, &list) > 0) { for (i = 0; i < list.num; i++) { status = AcpiInstallInterface(list.data[i]); if (ACPI_FAILURE(status)) device_printf(dev, "failed to install _OSI(\"%s\"): %s\n", list.data[i], AcpiFormatException(status)); else if (bootverbose) device_printf(dev, "installed _OSI(\"%s\")\n", list.data[i]); } acpi_free_interfaces(&list); } if (acpi_parse_interfaces(acpi_remove_interface, &list) > 0) { for (i = 0; i < list.num; i++) { status = AcpiRemoveInterface(list.data[i]); if (ACPI_FAILURE(status)) device_printf(dev, "failed to remove _OSI(\"%s\"): %s\n", list.data[i], AcpiFormatException(status)); else if (bootverbose) device_printf(dev, "removed _OSI(\"%s\")\n", list.data[i]); } acpi_free_interfaces(&list); } } static int acpi_pm_func(u_long cmd, void *arg, ...) { int state, acpi_state; int error; struct acpi_softc *sc; va_list ap; error = 0; switch (cmd) { case POWER_CMD_SUSPEND: sc = (struct acpi_softc *)arg; if (sc == NULL) { error = EINVAL; goto out; } va_start(ap, arg); state = va_arg(ap, int); va_end(ap); switch (state) { case POWER_SLEEP_STATE_STANDBY: acpi_state = sc->acpi_standby_sx; break; case POWER_SLEEP_STATE_SUSPEND: acpi_state = sc->acpi_suspend_sx; break; case POWER_SLEEP_STATE_HIBERNATE: acpi_state = ACPI_STATE_S4; break; default: error = EINVAL; goto out; } if (ACPI_FAILURE(acpi_EnterSleepState(sc, acpi_state))) error = ENXIO; break; default: error = EINVAL; goto out; } out: return (error); } static void acpi_pm_register(void *arg) { if (!cold || resource_disabled("acpi", 0)) return; power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, NULL); } -SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, acpi_pm_register, 0); +SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, acpi_pm_register, NULL); Index: stable/11/sys/fs/cuse/cuse.c =================================================================== --- stable/11/sys/fs/cuse/cuse.c (revision 359651) +++ stable/11/sys/fs/cuse/cuse.c (revision 359652) @@ -1,1946 +1,1946 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2010-2020 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(cuse, 1); /* * Prevent cuse4bsd.ko and cuse.ko from loading at the same time by * declaring support for the cuse4bsd interface in cuse.ko: */ MODULE_VERSION(cuse4bsd, 1); #ifdef FEATURE FEATURE(cuse, "Userspace character devices"); #endif struct cuse_command; struct cuse_server; struct cuse_client; struct cuse_client_command { TAILQ_ENTRY(cuse_client_command) entry; struct cuse_command sub; struct sx sx; struct cv cv; struct thread *entered; struct cuse_client *client; struct proc *proc_curr; int proc_refs; int got_signal; int error; int command; }; struct cuse_memory { TAILQ_ENTRY(cuse_memory) entry; vm_object_t object; uint32_t page_count; uint32_t alloc_nr; }; struct cuse_server_dev { TAILQ_ENTRY(cuse_server_dev) entry; struct cuse_server *server; struct cdev *kern_dev; struct cuse_dev *user_dev; }; struct cuse_server { TAILQ_ENTRY(cuse_server) entry; TAILQ_HEAD(, cuse_client_command) head; TAILQ_HEAD(, cuse_server_dev) hdev; TAILQ_HEAD(, cuse_client) hcli; TAILQ_HEAD(, cuse_memory) hmem; struct mtx mtx; struct cv cv; struct selinfo selinfo; pid_t pid; int is_closing; int refs; }; struct cuse_client { TAILQ_ENTRY(cuse_client) entry; TAILQ_ENTRY(cuse_client) entry_ref; struct cuse_client_command cmds[CUSE_CMD_MAX]; struct cuse_server *server; struct cuse_server_dev *server_dev; uint8_t ioctl_buffer[CUSE_BUFFER_MAX] __aligned(4); int fflags; /* file flags */ int cflags; /* client flags */ #define CUSE_CLI_IS_CLOSING 0x01 #define CUSE_CLI_KNOTE_NEED_READ 0x02 #define CUSE_CLI_KNOTE_NEED_WRITE 0x04 #define CUSE_CLI_KNOTE_HAS_READ 0x08 #define CUSE_CLI_KNOTE_HAS_WRITE 0x10 }; #define CUSE_CLIENT_CLOSING(pcc) \ ((pcc)->cflags & CUSE_CLI_IS_CLOSING) static MALLOC_DEFINE(M_CUSE, "cuse", "CUSE memory"); static TAILQ_HEAD(, cuse_server) cuse_server_head; static struct mtx cuse_global_mtx; static struct cdev *cuse_dev; static struct cuse_server *cuse_alloc_unit[CUSE_DEVICES_MAX]; static int cuse_alloc_unit_id[CUSE_DEVICES_MAX]; static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs); static void cuse_client_kqfilter_read_detach(struct knote *kn); static void cuse_client_kqfilter_write_detach(struct knote *kn); static int cuse_client_kqfilter_read_event(struct knote *kn, long hint); static int cuse_client_kqfilter_write_event(struct knote *kn, long hint); static struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, }; static struct filterops cuse_client_kqfilter_write_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, }; static d_open_t cuse_client_open; static d_close_t cuse_client_close; static d_ioctl_t cuse_client_ioctl; static d_read_t cuse_client_read; static d_write_t cuse_client_write; static d_poll_t cuse_client_poll; static d_mmap_single_t cuse_client_mmap_single; static d_kqfilter_t cuse_client_kqfilter; static struct cdevsw cuse_client_devsw = { .d_version = D_VERSION, .d_open = cuse_client_open, .d_close = cuse_client_close, .d_ioctl = cuse_client_ioctl, .d_name = "cuse_client", .d_flags = D_TRACKCLOSE, .d_read = cuse_client_read, .d_write = cuse_client_write, .d_poll = cuse_client_poll, .d_mmap_single = cuse_client_mmap_single, .d_kqfilter = cuse_client_kqfilter, }; static d_open_t cuse_server_open; static d_close_t cuse_server_close; static d_ioctl_t cuse_server_ioctl; static d_read_t cuse_server_read; static d_write_t cuse_server_write; static d_poll_t cuse_server_poll; static d_mmap_single_t cuse_server_mmap_single; static struct cdevsw cuse_server_devsw = { .d_version = D_VERSION, .d_open = cuse_server_open, .d_close = cuse_server_close, .d_ioctl = cuse_server_ioctl, .d_name = "cuse_server", .d_flags = D_TRACKCLOSE, .d_read = cuse_server_read, .d_write = cuse_server_write, .d_poll = cuse_server_poll, .d_mmap_single = cuse_server_mmap_single, }; static void cuse_client_is_closing(struct cuse_client *); static int cuse_free_unit_by_id_locked(struct cuse_server *, int); static void cuse_global_lock(void) { mtx_lock(&cuse_global_mtx); } static void cuse_global_unlock(void) { mtx_unlock(&cuse_global_mtx); } static void cuse_server_lock(struct cuse_server *pcs) { mtx_lock(&pcs->mtx); } static void cuse_server_unlock(struct cuse_server *pcs) { mtx_unlock(&pcs->mtx); } static void cuse_cmd_lock(struct cuse_client_command *pccmd) { sx_xlock(&pccmd->sx); } static void cuse_cmd_unlock(struct cuse_client_command *pccmd) { sx_xunlock(&pccmd->sx); } static void cuse_kern_init(void *arg) { TAILQ_INIT(&cuse_server_head); mtx_init(&cuse_global_mtx, "cuse-global-mtx", NULL, MTX_DEF); cuse_dev = make_dev(&cuse_server_devsw, 0, UID_ROOT, GID_OPERATOR, 0600, "cuse"); printf("Cuse v%d.%d.%d @ /dev/cuse\n", (CUSE_VERSION >> 16) & 0xFF, (CUSE_VERSION >> 8) & 0xFF, (CUSE_VERSION >> 0) & 0xFF); } -SYSINIT(cuse_kern_init, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_init, 0); +SYSINIT(cuse_kern_init, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_init, NULL); static void cuse_kern_uninit(void *arg) { void *ptr; while (1) { printf("Cuse: Please exit all /dev/cuse instances " "and processes which have used this device.\n"); pause("DRAIN", 2 * hz); cuse_global_lock(); ptr = TAILQ_FIRST(&cuse_server_head); cuse_global_unlock(); if (ptr == NULL) break; } if (cuse_dev != NULL) destroy_dev(cuse_dev); mtx_destroy(&cuse_global_mtx); } SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, 0); static int cuse_server_get(struct cuse_server **ppcs) { struct cuse_server *pcs; int error; error = devfs_get_cdevpriv((void **)&pcs); if (error != 0) { *ppcs = NULL; return (error); } if (pcs->is_closing) { *ppcs = NULL; return (EINVAL); } *ppcs = pcs; return (0); } static void cuse_server_is_closing(struct cuse_server *pcs) { struct cuse_client *pcc; if (pcs->is_closing) return; pcs->is_closing = 1; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { cuse_client_is_closing(pcc); } } static struct cuse_client_command * cuse_server_find_command(struct cuse_server *pcs, struct thread *td) { struct cuse_client *pcc; int n; if (pcs->is_closing) goto done; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (CUSE_CLIENT_CLOSING(pcc)) continue; for (n = 0; n != CUSE_CMD_MAX; n++) { if (pcc->cmds[n].entered == td) return (&pcc->cmds[n]); } } done: return (NULL); } static void cuse_str_filter(char *ptr) { int c; while (((c = *ptr) != 0)) { if ((c >= 'a') && (c <= 'z')) { ptr++; continue; } if ((c >= 'A') && (c <= 'Z')) { ptr++; continue; } if ((c >= '0') && (c <= '9')) { ptr++; continue; } if ((c == '.') || (c == '_') || (c == '/')) { ptr++; continue; } *ptr = '_'; ptr++; } } static int cuse_convert_error(int error) { ; /* indent fix */ switch (error) { case CUSE_ERR_NONE: return (0); case CUSE_ERR_BUSY: return (EBUSY); case CUSE_ERR_WOULDBLOCK: return (EWOULDBLOCK); case CUSE_ERR_INVALID: return (EINVAL); case CUSE_ERR_NO_MEMORY: return (ENOMEM); case CUSE_ERR_FAULT: return (EFAULT); case CUSE_ERR_SIGNAL: return (EINTR); case CUSE_ERR_NO_DEVICE: return (ENODEV); default: return (ENXIO); } } static void cuse_vm_memory_free(struct cuse_memory *mem) { /* last user is gone - free */ vm_object_deallocate(mem->object); /* free CUSE memory */ free(mem, M_CUSE); } static int cuse_server_alloc_memory(struct cuse_server *pcs, uint32_t alloc_nr, uint32_t page_count) { struct cuse_memory *temp; struct cuse_memory *mem; vm_object_t object; int error; mem = malloc(sizeof(*mem), M_CUSE, M_WAITOK | M_ZERO); if (mem == NULL) return (ENOMEM); object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * page_count, VM_PROT_DEFAULT, 0, curthread->td_ucred); if (object == NULL) { error = ENOMEM; goto error_0; } cuse_server_lock(pcs); /* check if allocation number already exists */ TAILQ_FOREACH(temp, &pcs->hmem, entry) { if (temp->alloc_nr == alloc_nr) break; } if (temp != NULL) { cuse_server_unlock(pcs); error = EBUSY; goto error_1; } mem->object = object; mem->page_count = page_count; mem->alloc_nr = alloc_nr; TAILQ_INSERT_TAIL(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); return (0); error_1: vm_object_deallocate(object); error_0: free(mem, M_CUSE); return (error); } static int cuse_server_free_memory(struct cuse_server *pcs, uint32_t alloc_nr) { struct cuse_memory *mem; cuse_server_lock(pcs); TAILQ_FOREACH(mem, &pcs->hmem, entry) { if (mem->alloc_nr == alloc_nr) break; } if (mem == NULL) { cuse_server_unlock(pcs); return (EINVAL); } TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); return (0); } static int cuse_client_get(struct cuse_client **ppcc) { struct cuse_client *pcc; int error; /* try to get private data */ error = devfs_get_cdevpriv((void **)&pcc); if (error != 0) { *ppcc = NULL; return (error); } if (CUSE_CLIENT_CLOSING(pcc) || pcc->server->is_closing) { *ppcc = NULL; return (EINVAL); } *ppcc = pcc; return (0); } static void cuse_client_is_closing(struct cuse_client *pcc) { struct cuse_client_command *pccmd; uint32_t n; if (CUSE_CLIENT_CLOSING(pcc)) return; pcc->cflags |= CUSE_CLI_IS_CLOSING; pcc->server_dev = NULL; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; if (pccmd->entry.tqe_prev != NULL) { TAILQ_REMOVE(&pcc->server->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; } cv_broadcast(&pccmd->cv); } } static void cuse_client_send_command_locked(struct cuse_client_command *pccmd, uintptr_t data_ptr, unsigned long arg, int fflags, int ioflag) { unsigned long cuse_fflags = 0; struct cuse_server *pcs; if (fflags & FREAD) cuse_fflags |= CUSE_FFLAG_READ; if (fflags & FWRITE) cuse_fflags |= CUSE_FFLAG_WRITE; if (ioflag & IO_NDELAY) cuse_fflags |= CUSE_FFLAG_NONBLOCK; #if defined(__LP64__) if (SV_CURPROC_FLAG(SV_ILP32)) cuse_fflags |= CUSE_FFLAG_COMPAT32; #endif pccmd->sub.fflags = cuse_fflags; pccmd->sub.data_pointer = data_ptr; pccmd->sub.argument = arg; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static void cuse_client_got_signal(struct cuse_client_command *pccmd) { struct cuse_server *pcs; pccmd->got_signal = 1; pccmd = &pccmd->client->cmds[CUSE_CMD_SIGNAL]; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static int cuse_client_receive_command_locked(struct cuse_client_command *pccmd, uint8_t *arg_ptr, uint32_t arg_len) { struct cuse_server *pcs; int error; pcs = pccmd->client->server; error = 0; pccmd->proc_curr = curthread->td_proc; if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } while (pccmd->command == CUSE_CMD_NONE) { if (error != 0) { cv_wait(&pccmd->cv, &pcs->mtx); } else { error = cv_wait_sig(&pccmd->cv, &pcs->mtx); if (error != 0) cuse_client_got_signal(pccmd); } if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } } error = pccmd->error; pccmd->command = CUSE_CMD_NONE; cv_signal(&pccmd->cv); done: /* wait until all process references are gone */ pccmd->proc_curr = NULL; while (pccmd->proc_refs != 0) cv_wait(&pccmd->cv, &pcs->mtx); return (error); } /*------------------------------------------------------------------------* * CUSE SERVER PART *------------------------------------------------------------------------*/ static void cuse_server_free_dev(struct cuse_server_dev *pcsd) { struct cuse_server *pcs; struct cuse_client *pcc; /* get server pointer */ pcs = pcsd->server; /* prevent creation of more devices */ cuse_server_lock(pcs); if (pcsd->kern_dev != NULL) pcsd->kern_dev->si_drv1 = NULL; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (pcc->server_dev == pcsd) cuse_client_is_closing(pcc); } cuse_server_unlock(pcs); /* destroy device, if any */ if (pcsd->kern_dev != NULL) { /* destroy device synchronously */ destroy_dev(pcsd->kern_dev); } free(pcsd, M_CUSE); } static void cuse_server_unref(struct cuse_server *pcs) { struct cuse_server_dev *pcsd; struct cuse_memory *mem; cuse_server_lock(pcs); if (--(pcs->refs) != 0) { cuse_server_unlock(pcs); return; } cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); cuse_global_lock(); TAILQ_REMOVE(&cuse_server_head, pcs, entry); cuse_global_unlock(); while ((pcsd = TAILQ_FIRST(&pcs->hdev)) != NULL) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); } cuse_free_unit_by_id_locked(pcs, -1); while ((mem = TAILQ_FIRST(&pcs->hmem)) != NULL) { TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); cuse_server_lock(pcs); } knlist_clear(&pcs->selinfo.si_note, 1); knlist_destroy(&pcs->selinfo.si_note); cuse_server_unlock(pcs); seldrain(&pcs->selinfo); cv_destroy(&pcs->cv); mtx_destroy(&pcs->mtx); free(pcs, M_CUSE); } static int cuse_server_do_close(struct cuse_server *pcs) { int retval; cuse_server_lock(pcs); cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); knlist_clear(&pcs->selinfo.si_note, 1); retval = pcs->refs; cuse_server_unlock(pcs); return (retval); } static void cuse_server_free(void *arg) { struct cuse_server *pcs = arg; /* * The final server unref should be done by the server thread * to prevent deadlock in the client cdevpriv destructor, * which cannot destroy itself. */ while (cuse_server_do_close(pcs) != 1) pause("W", hz); /* drop final refcount */ cuse_server_unref(pcs); } static int cuse_server_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct cuse_server *pcs; pcs = malloc(sizeof(*pcs), M_CUSE, M_WAITOK | M_ZERO); if (pcs == NULL) return (ENOMEM); if (devfs_set_cdevpriv(pcs, &cuse_server_free)) { printf("Cuse: Cannot set cdevpriv.\n"); free(pcs, M_CUSE); return (ENOMEM); } /* store current process ID */ pcs->pid = curproc->p_pid; TAILQ_INIT(&pcs->head); TAILQ_INIT(&pcs->hdev); TAILQ_INIT(&pcs->hcli); TAILQ_INIT(&pcs->hmem); cv_init(&pcs->cv, "cuse-server-cv"); mtx_init(&pcs->mtx, "cuse-server-mtx", NULL, MTX_DEF); knlist_init_mtx(&pcs->selinfo.si_note, &pcs->mtx); cuse_global_lock(); pcs->refs++; TAILQ_INSERT_TAIL(&cuse_server_head, pcs, entry); cuse_global_unlock(); return (0); } static int cuse_server_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_server *pcs; if (cuse_server_get(&pcs) == 0) cuse_server_do_close(pcs); return (0); } static int cuse_server_read(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_write(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_ioctl_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, int isread) { struct proc *p_proc; uint32_t offset; int error; offset = pchk->peer_ptr - CUSE_BUF_MIN_PTR; if (pchk->length > CUSE_BUFFER_MAX) return (EFAULT); if (offset >= CUSE_BUFFER_MAX) return (EFAULT); if ((offset + pchk->length) > CUSE_BUFFER_MAX) return (EFAULT); p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (isread == 0) { error = copyin( (void *)pchk->local_ptr, pccmd->client->ioctl_buffer + offset, pchk->length); } else { error = copyout( pccmd->client->ioctl_buffer + offset, (void *)pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_proc2proc_copy(struct proc *proc_s, vm_offset_t data_s, struct proc *proc_d, vm_offset_t data_d, size_t len) { struct thread *td; struct proc *proc_cur; int error; td = curthread; proc_cur = td->td_proc; if (proc_cur == proc_d) { struct iovec iov = { .iov_base = (caddr_t)data_d, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_s, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td, }; PHOLD(proc_s); error = proc_rwmem(proc_s, &uio); PRELE(proc_s); } else if (proc_cur == proc_s) { struct iovec iov = { .iov_base = (caddr_t)data_s, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_d, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_WRITE, .uio_td = td, }; PHOLD(proc_d); error = proc_rwmem(proc_d, &uio); PRELE(proc_d); } else { error = EINVAL; } return (error); } static int cuse_server_data_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, int isread) { struct proc *p_proc; int error; p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (isread == 0) { error = cuse_proc2proc_copy( curthread->td_proc, pchk->local_ptr, p_proc, pchk->peer_ptr, pchk->length); } else { error = cuse_proc2proc_copy( p_proc, pchk->peer_ptr, curthread->td_proc, pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_alloc_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int x = 0; int match; do { for (match = n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] != NULL) { if ((cuse_alloc_unit_id[n] ^ id) & CUSE_ID_MASK) continue; if ((cuse_alloc_unit_id[n] & ~CUSE_ID_MASK) == x) { x++; match = 1; } } } } while (match); if (x < 256) { for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == NULL) { cuse_alloc_unit[n] = pcs; cuse_alloc_unit_id[n] = id | x; return (x); } } } return (-1); } static void cuse_server_wakeup_locked(struct cuse_server *pcs) { selwakeup(&pcs->selinfo); KNOTE_LOCKED(&pcs->selinfo.si_note, 0); } static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs) { struct cuse_client *pcc; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { pcc->cflags |= (CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); } cuse_server_wakeup_locked(pcs); } static int cuse_free_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int found = 0; for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == pcs) { if (cuse_alloc_unit_id[n] == id || id == -1) { cuse_alloc_unit[n] = NULL; cuse_alloc_unit_id[n] = 0; found = 1; } } } return (found ? 0 : EINVAL); } static int cuse_server_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); switch (cmd) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_command *pcmd; struct cuse_alloc_info *pai; struct cuse_create_dev *pcd; struct cuse_server_dev *pcsd; struct cuse_data_chunk *pchk; int n; case CUSE_IOCTL_GET_COMMAND: pcmd = (void *)data; cuse_server_lock(pcs); while ((pccmd = TAILQ_FIRST(&pcs->head)) == NULL) { error = cv_wait_sig(&pcs->cv, &pcs->mtx); if (pcs->is_closing) error = ENXIO; if (error) { cuse_server_unlock(pcs); return (error); } } TAILQ_REMOVE(&pcs->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; pccmd->entered = curthread; *pcmd = pccmd->sub; cuse_server_unlock(pcs); break; case CUSE_IOCTL_SYNC_COMMAND: cuse_server_lock(pcs); while ((pccmd = cuse_server_find_command(pcs, curthread)) != NULL) { /* send sync command */ pccmd->entered = NULL; pccmd->error = *(int *)data; pccmd->command = CUSE_CMD_SYNC; /* signal peer, if any */ cv_signal(&pccmd->cv); } cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_UNIT: cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, CUSE_ID_DEFAULT(0)); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_ALLOC_UNIT_BY_ID: n = *(int *)data; n = (n & CUSE_ID_MASK); cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_FREE_UNIT: n = *(int *)data; n = CUSE_ID_DEFAULT(n); cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_FREE_UNIT_BY_ID: n = *(int *)data; cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } if (pai->page_count >= CUSE_ALLOC_PAGES_MAX) { error = ENOMEM; break; } error = cuse_server_alloc_memory(pcs, pai->alloc_nr, pai->page_count); break; case CUSE_IOCTL_FREE_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } error = cuse_server_free_memory(pcs, pai->alloc_nr); break; case CUSE_IOCTL_GET_SIG: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { n = pccmd->got_signal; pccmd->got_signal = 0; } else { n = 0; } cuse_server_unlock(pcs); *(int *)data = n; break; case CUSE_IOCTL_SET_PFH: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { pcc = pccmd->client; for (n = 0; n != CUSE_CMD_MAX; n++) { pcc->cmds[n].sub.per_file_handle = *(uintptr_t *)data; } } else { error = ENXIO; } cuse_server_unlock(pcs); break; case CUSE_IOCTL_CREATE_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; pcd = (void *)data; /* filter input */ pcd->devname[sizeof(pcd->devname) - 1] = 0; if (pcd->devname[0] == 0) { error = EINVAL; break; } cuse_str_filter(pcd->devname); pcd->permissions &= 0777; /* try to allocate a character device */ pcsd = malloc(sizeof(*pcsd), M_CUSE, M_WAITOK | M_ZERO); if (pcsd == NULL) { error = ENOMEM; break; } pcsd->server = pcs; pcsd->user_dev = pcd->dev; pcsd->kern_dev = make_dev_credf(MAKEDEV_CHECKNAME, &cuse_client_devsw, 0, NULL, pcd->user_id, pcd->group_id, pcd->permissions, "%s", pcd->devname); if (pcsd->kern_dev == NULL) { free(pcsd, M_CUSE); error = ENOMEM; break; } pcsd->kern_dev->si_drv1 = pcsd; cuse_server_lock(pcs); TAILQ_INSERT_TAIL(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); break; case CUSE_IOCTL_DESTROY_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; cuse_server_lock(pcs); error = EINVAL; pcsd = TAILQ_FIRST(&pcs->hdev); while (pcsd != NULL) { if (pcsd->user_dev == *(struct cuse_dev **)data) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); error = 0; pcsd = TAILQ_FIRST(&pcs->hdev); } else { pcsd = TAILQ_NEXT(pcsd, entry); } } cuse_server_unlock(pcs); break; case CUSE_IOCTL_WRITE_DATA: case CUSE_IOCTL_READ_DATA: cuse_server_lock(pcs); pchk = (struct cuse_data_chunk *)data; pccmd = cuse_server_find_command(pcs, curthread); if (pccmd == NULL) { error = ENXIO; /* invalid request */ } else if (pchk->peer_ptr < CUSE_BUF_MIN_PTR) { error = EFAULT; /* NULL pointer */ } else if (pchk->peer_ptr < CUSE_BUF_MAX_PTR) { error = cuse_server_ioctl_copy_locked(pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } else { error = cuse_server_data_copy_locked(pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } cuse_server_unlock(pcs); break; case CUSE_IOCTL_SELWAKEUP: cuse_server_lock(pcs); /* * We don't know which direction caused the event. * Wakeup both! */ cuse_server_wakeup_all_client_locked(pcs); cuse_server_unlock(pcs); break; default: error = ENXIO; break; } return (error); } static int cuse_server_poll(struct cdev *dev, int events, struct thread *td) { return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_server_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { uint32_t page_nr = *offset / PAGE_SIZE; uint32_t alloc_nr = page_nr / CUSE_ALLOC_PAGES_MAX; struct cuse_memory *mem; struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); cuse_server_lock(pcs); /* lookup memory structure */ TAILQ_FOREACH(mem, &pcs->hmem, entry) { if (mem->alloc_nr == alloc_nr) break; } if (mem == NULL) { cuse_server_unlock(pcs); return (ENOMEM); } /* verify page offset */ page_nr %= CUSE_ALLOC_PAGES_MAX; if (page_nr >= mem->page_count) { cuse_server_unlock(pcs); return (ENXIO); } /* verify mmap size */ if ((size % PAGE_SIZE) != 0 || (size < PAGE_SIZE) || (size > ((mem->page_count - page_nr) * PAGE_SIZE))) { cuse_server_unlock(pcs); return (EINVAL); } vm_object_reference(mem->object); *object = mem->object; cuse_server_unlock(pcs); /* set new VM object offset to use */ *offset = page_nr * PAGE_SIZE; /* success */ return (0); } /*------------------------------------------------------------------------* * CUSE CLIENT PART *------------------------------------------------------------------------*/ static void cuse_client_free(void *arg) { struct cuse_client *pcc = arg; struct cuse_client_command *pccmd; struct cuse_server *pcs; int n; pcs = pcc->server; cuse_server_lock(pcs); cuse_client_is_closing(pcc); TAILQ_REMOVE(&pcs->hcli, pcc, entry); cuse_server_unlock(pcs); for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; sx_destroy(&pccmd->sx); cv_destroy(&pccmd->cv); } free(pcc, M_CUSE); /* drop reference on server */ cuse_server_unref(pcs); } static int cuse_client_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_server_dev *pcsd; struct cuse_client *pcc; struct cuse_server *pcs; struct cuse_dev *pcd; int error; int n; pcsd = dev->si_drv1; if (pcsd != NULL) { pcs = pcsd->server; pcd = pcsd->user_dev; cuse_server_lock(pcs); /* * Check that the refcount didn't wrap and that the * same process is not both client and server. This * can easily lead to deadlocks when destroying the * CUSE character device nodes: */ pcs->refs++; if (pcs->refs < 0 || pcs->pid == curproc->p_pid) { /* overflow or wrong PID */ pcs->refs--; cuse_server_unlock(pcs); return (EINVAL); } cuse_server_unlock(pcs); } else { return (EINVAL); } pcc = malloc(sizeof(*pcc), M_CUSE, M_WAITOK | M_ZERO); if (pcc == NULL) { /* drop reference on server */ cuse_server_unref(pcs); return (ENOMEM); } if (devfs_set_cdevpriv(pcc, &cuse_client_free)) { printf("Cuse: Cannot set cdevpriv.\n"); /* drop reference on server */ cuse_server_unref(pcs); free(pcc, M_CUSE); return (ENOMEM); } pcc->fflags = fflags; pcc->server_dev = pcsd; pcc->server = pcs; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; pccmd->sub.dev = pcd; pccmd->sub.command = n; pccmd->client = pcc; sx_init(&pccmd->sx, "cuse-client-sx"); cv_init(&pccmd->cv, "cuse-client-cv"); } cuse_server_lock(pcs); /* cuse_client_free() assumes that the client is listed somewhere! */ /* always enqueue */ TAILQ_INSERT_TAIL(&pcs->hcli, pcc, entry); /* check if server is closing */ if ((pcs->is_closing != 0) || (dev->si_drv1 == NULL)) { error = EINVAL; } else { error = 0; } cuse_server_unlock(pcs); if (error) { devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } pccmd = &pcc->cmds[CUSE_CMD_OPEN]; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } cuse_cmd_unlock(pccmd); if (error) devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } static int cuse_client_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (0); pccmd = &pcc->cmds[CUSE_CMD_CLOSE]; pcs = pcc->server; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_cmd_unlock(pccmd); cuse_client_is_closing(pcc); cuse_server_unlock(pcs); return (0); } static void cuse_client_kqfilter_poll(struct cdev *dev, struct cuse_client *pcc) { struct cuse_server *pcs = pcc->server; int temp; cuse_server_lock(pcs); temp = (pcc->cflags & (CUSE_CLI_KNOTE_HAS_READ | CUSE_CLI_KNOTE_HAS_WRITE)); pcc->cflags &= ~(CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); cuse_server_unlock(pcs); if (temp != 0) { /* get the latest polling state from the server */ temp = cuse_client_poll(dev, POLLIN | POLLOUT, NULL); if (temp & (POLLIN | POLLOUT)) { cuse_server_lock(pcs); if (temp & POLLIN) pcc->cflags |= CUSE_CLI_KNOTE_NEED_READ; if (temp & POLLOUT) pcc->cflags |= CUSE_CLI_KNOTE_NEED_WRITE; /* make sure the "knote" gets woken up */ cuse_server_wakeup_locked(pcc->server); cuse_server_unlock(pcs); } } } static int cuse_client_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = &pcc->cmds[CUSE_CMD_READ]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */ if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = &pcc->cmds[CUSE_CMD_WRITE]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */ if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } int cuse_client_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); len = IOCPARM_LEN(cmd); if (len > CUSE_BUFFER_MAX) return (ENOMEM); pccmd = &pcc->cmds[CUSE_CMD_IOCTL]; pcs = pcc->server; cuse_cmd_lock(pccmd); if (cmd & (IOC_IN | IOC_VOID)) memcpy(pcc->ioctl_buffer, data, len); /* * When the ioctl-length is zero drivers can pass information * through the data pointer of the ioctl. Make sure this information * is forwarded to the driver. */ cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, (len == 0) ? *(long *)data : CUSE_BUF_MIN_PTR, (unsigned long)cmd, pcc->fflags, (fflag & O_NONBLOCK) ? IO_NDELAY : 0); error = cuse_client_receive_command_locked(pccmd, data, len); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } if (cmd & IOC_OUT) memcpy(data, pcc->ioctl_buffer, len); cuse_cmd_unlock(pccmd); if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_poll(struct cdev *dev, int events, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; unsigned long temp; int error; int revents; error = cuse_client_get(&pcc); if (error != 0) goto pollnval; temp = 0; pcs = pcc->server; if (events & (POLLPRI | POLLIN | POLLRDNORM)) temp |= CUSE_POLL_READ; if (events & (POLLOUT | POLLWRNORM)) temp |= CUSE_POLL_WRITE; if (events & POLLHUP) temp |= CUSE_POLL_ERROR; pccmd = &pcc->cmds[CUSE_CMD_POLL]; cuse_cmd_lock(pccmd); /* Need to selrecord() first to not loose any events. */ if (temp != 0 && td != NULL) selrecord(td, &pcs->selinfo); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, temp, pcc->fflags, IO_NDELAY); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); cuse_cmd_unlock(pccmd); if (error < 0) { goto pollnval; } else { revents = 0; if (error & CUSE_POLL_READ) revents |= (events & (POLLPRI | POLLIN | POLLRDNORM)); if (error & CUSE_POLL_WRITE) revents |= (events & (POLLOUT | POLLWRNORM)); if (error & CUSE_POLL_ERROR) revents |= (events & POLLHUP); } return (revents); pollnval: /* XXX many clients don't understand POLLNVAL */ return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_client_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { uint32_t page_nr = *offset / PAGE_SIZE; uint32_t alloc_nr = page_nr / CUSE_ALLOC_PAGES_MAX; struct cuse_memory *mem; struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); pcs = pcc->server; cuse_server_lock(pcs); /* lookup memory structure */ TAILQ_FOREACH(mem, &pcs->hmem, entry) { if (mem->alloc_nr == alloc_nr) break; } if (mem == NULL) { cuse_server_unlock(pcs); return (ENOMEM); } /* verify page offset */ page_nr %= CUSE_ALLOC_PAGES_MAX; if (page_nr >= mem->page_count) { cuse_server_unlock(pcs); return (ENXIO); } /* verify mmap size */ if ((size % PAGE_SIZE) != 0 || (size < PAGE_SIZE) || (size > ((mem->page_count - page_nr) * PAGE_SIZE))) { cuse_server_unlock(pcs); return (EINVAL); } vm_object_reference(mem->object); *object = mem->object; cuse_server_unlock(pcs); /* set new VM object offset to use */ *offset = page_nr * PAGE_SIZE; /* success */ return (0); } static void cuse_client_kqfilter_read_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static void cuse_client_kqfilter_write_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static int cuse_client_kqfilter_read_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_READ) ? 1 : 0); } static int cuse_client_kqfilter_write_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_WRITE) ? 1 : 0); } static int cuse_client_kqfilter(struct cdev *dev, struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); pcs = pcc->server; cuse_server_lock(pcs); switch (kn->kn_filter) { case EVFILT_READ: pcc->cflags |= CUSE_CLI_KNOTE_HAS_READ; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_read_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; case EVFILT_WRITE: pcc->cflags |= CUSE_CLI_KNOTE_HAS_WRITE; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_write_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; default: error = EINVAL; break; } cuse_server_unlock(pcs); if (error == 0) cuse_client_kqfilter_poll(dev, pcc); return (error); } Index: stable/11/sys/i386/bios/apm.c =================================================================== --- stable/11/sys/i386/bios/apm.c (revision 359651) +++ stable/11/sys/i386/bios/apm.c (revision 359652) @@ -1,1592 +1,1592 @@ /*- * APM (Advanced Power Management) BIOS Device Driver * * Copyright (c) 1994 UKAI, Fumitoshi. * Copyright (c) 1994-1995 by HOSOKAWA, Tatsumi * Copyright (c) 1996 Nate Williams * Copyright (c) 1997 Poul-Henning Kamp * * This software may be used, modified, copied, and distributed, in * both source and binary form provided that the above copyright and * these terms are retained. Under no circumstances is the author * responsible for the proper functioning of this software, nor does * the author assume any responsibility for damages incurred with its * use. * * Sep, 1994 Implemented on FreeBSD 1.1.5.1R (Toshiba AVS001WD) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Used by the apm_saver screen saver module */ int apm_display(int newstate); struct apm_softc apm_softc; static void apm_resume(void); static int apm_bioscall(void); static int apm_check_function_supported(u_int version, u_int func); static int apm_pm_func(u_long, void*, ...); static u_long apm_version; int apm_evindex; #define SCFLAG_ONORMAL 0x0000001 #define SCFLAG_OCTL 0x0000002 #define SCFLAG_OPEN (SCFLAG_ONORMAL|SCFLAG_OCTL) #define APMDEV_NORMAL 0 #define APMDEV_CTL 1 #ifdef PC98 extern int bios32_apm98(struct bios_regs *, u_int, u_short); /* PC98's SMM definition */ #define APM_NECSMM_PORT 0x6b8e #define APM_NECSMM_PORTSZ 1 #define APM_NECSMM_EN 0x10 static __inline void apm_enable_smm(struct apm_softc *); static __inline void apm_disable_smm(struct apm_softc *); int apm_necsmm_addr; u_int32_t apm_necsmm_mask; #endif static struct apmhook *hook[NAPM_HOOK]; /* XXX */ #define is_enabled(foo) ((foo) ? "enabled" : "disabled") /* Map version number to integer (keeps ordering of version numbers) */ #define INTVERSION(major, minor) ((major)*100 + (minor)) static d_open_t apmopen; static d_close_t apmclose; static d_write_t apmwrite; static d_ioctl_t apmioctl; static d_poll_t apmpoll; static struct cdevsw apm_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = apmopen, .d_close = apmclose, .d_write = apmwrite, .d_ioctl = apmioctl, .d_poll = apmpoll, .d_name = "apm", }; static int apm_suspend_delay = 1; static int apm_standby_delay = 1; static int apm_swab_batt_minutes = 0; static int apm_debug = 0; #define APM_DPRINT(args...) do { \ if (apm_debug) { \ printf(args); \ } \ } while (0) SYSCTL_INT(_machdep, OID_AUTO, apm_suspend_delay, CTLFLAG_RW, &apm_suspend_delay, 1, ""); SYSCTL_INT(_machdep, OID_AUTO, apm_standby_delay, CTLFLAG_RW, &apm_standby_delay, 1, ""); SYSCTL_INT(_debug, OID_AUTO, apm_debug, CTLFLAG_RW, &apm_debug, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, apm_swab_batt_minutes, CTLFLAG_RWTUN, &apm_swab_batt_minutes, 0, "Byte swap battery time value."); #ifdef PC98 static __inline void apm_enable_smm(sc) struct apm_softc *sc; { bus_space_tag_t iot = sc->sc_iot; bus_space_handle_t ioh = sc->sc_ioh; if (apm_necsmm_addr != 0) bus_space_write_1(iot, ioh, 0, (bus_space_read_1(iot, ioh, 0) | ~apm_necsmm_mask)); } static __inline void apm_disable_smm(sc) struct apm_softc *sc; { bus_space_tag_t iot = sc->sc_iot; bus_space_handle_t ioh = sc->sc_ioh; if (apm_necsmm_addr != 0) bus_space_write_1(iot, ioh, 0, (bus_space_read_1(iot, ioh, 0) & apm_necsmm_mask)); } #endif /* * return 0 if the function successfull, * return 1 if the function unsuccessfull, * return -1 if the function unsupported. */ static int apm_bioscall(void) { struct apm_softc *sc = &apm_softc; int errno = 0; u_int apm_func = sc->bios.r.eax & 0xff; if (!apm_check_function_supported(sc->intversion, apm_func)) { APM_DPRINT("apm_bioscall: function 0x%x is not supported in v%d.%d\n", apm_func, sc->majorversion, sc->minorversion); return (-1); } sc->bios_busy = 1; #ifdef PC98 set_bios_selectors(&sc->bios.seg, BIOSCODE_FLAG | BIOSDATA_FLAG); if (bios32_apm98(&sc->bios.r, sc->bios.entry, GSEL(GBIOSCODE32_SEL, SEL_KPL)) != 0) return 1; #else if (sc->connectmode == APM_PROT32CONNECT) { set_bios_selectors(&sc->bios.seg, BIOSCODE_FLAG | BIOSDATA_FLAG); errno = bios32(&sc->bios.r, sc->bios.entry, GSEL(GBIOSCODE32_SEL, SEL_KPL)); } else { errno = bios16(&sc->bios, NULL); } #endif sc->bios_busy = 0; return (errno); } /* check whether APM function is supported (1) or not (0). */ static int apm_check_function_supported(u_int version, u_int func) { /* except driver version */ if (func == APM_DRVVERSION) { return (1); } #ifdef PC98 if (func == APM_GETPWSTATUS) { return (1); } #endif switch (version) { case INTVERSION(1, 0): if (func > APM_GETPMEVENT) { return (0); /* not supported */ } break; case INTVERSION(1, 1): if (func > APM_ENGAGEDISENGAGEPM && func < APM_OEMFUNC) { return (0); /* not supported */ } break; case INTVERSION(1, 2): break; } return (1); /* supported */ } /* enable/disable power management */ static int apm_enable_disable_pm(int enable) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_ENABLEDISABLEPM; if (sc->intversion >= INTVERSION(1, 1)) sc->bios.r.ebx = PMDV_ALLDEV; else sc->bios.r.ebx = 0xffff; /* APM version 1.0 only */ sc->bios.r.ecx = enable; sc->bios.r.edx = 0; return (apm_bioscall()); } /* register driver version (APM 1.1 or later) */ static int apm_driver_version(int version) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_DRVVERSION; sc->bios.r.ebx = 0x0; sc->bios.r.ecx = version; sc->bios.r.edx = 0; if (apm_bioscall() == 0 && sc->bios.r.eax == version) return (0); /* Some old BIOSes don't return the connection version in %ax. */ if (sc->bios.r.eax == ((APM_BIOS << 8) | APM_DRVVERSION)) return (0); return (1); } /* engage/disengage power management (APM 1.1 or later) */ static int apm_engage_disengage_pm(int engage) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_ENGAGEDISENGAGEPM; sc->bios.r.ebx = PMDV_ALLDEV; sc->bios.r.ecx = engage; sc->bios.r.edx = 0; return (apm_bioscall()); } /* get PM event */ static u_int apm_getevent(void) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_GETPMEVENT; sc->bios.r.ebx = 0; sc->bios.r.ecx = 0; sc->bios.r.edx = 0; if (apm_bioscall()) return (PMEV_NOEVENT); return (sc->bios.r.ebx & 0xffff); } /* suspend entire system */ static int apm_suspend_system(int state) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_ALLDEV; sc->bios.r.ecx = state; sc->bios.r.edx = 0; #ifdef PC98 apm_disable_smm(sc); #endif if (apm_bioscall()) { printf("Entire system suspend failure: errcode = %d\n", 0xff & (sc->bios.r.eax >> 8)); return 1; } #ifdef PC98 apm_enable_smm(sc); #endif return 0; } /* Display control */ /* * Experimental implementation: My laptop machine can't handle this function * If your laptop can control the display via APM, please inform me. * HOSOKAWA, Tatsumi */ int apm_display(int newstate) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_DISP0; sc->bios.r.ecx = newstate ? PMST_APMENABLED:PMST_SUSPEND; sc->bios.r.edx = 0; if (apm_bioscall() == 0) { return 0; } /* If failed, then try to blank all display devices instead. */ sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_DISPALL; /* all display devices */ sc->bios.r.ecx = newstate ? PMST_APMENABLED:PMST_SUSPEND; sc->bios.r.edx = 0; if (apm_bioscall() == 0) { return 0; } printf("Display off failure: errcode = %d\n", 0xff & (sc->bios.r.eax >> 8)); return 1; } /* * Turn off the entire system. */ static void apm_power_off(void *junk, int howto) { struct apm_softc *sc = &apm_softc; /* Not halting powering off, or not active */ if (!(howto & RB_POWEROFF) || !apm_softc.active) return; sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_ALLDEV; sc->bios.r.ecx = PMST_OFF; sc->bios.r.edx = 0; (void) apm_bioscall(); } /* APM Battery low handler */ static void apm_battery_low(void) { printf("\007\007 * * * BATTERY IS LOW * * * \007\007"); } /* APM hook manager */ static struct apmhook * apm_add_hook(struct apmhook **list, struct apmhook *ah) { int s; struct apmhook *p, *prev; APM_DPRINT("Add hook \"%s\"\n", ah->ah_name); s = splhigh(); if (ah == NULL) panic("illegal apm_hook!"); prev = NULL; for (p = *list; p != NULL; prev = p, p = p->ah_next) if (p->ah_order > ah->ah_order) break; if (prev == NULL) { ah->ah_next = *list; *list = ah; } else { ah->ah_next = prev->ah_next; prev->ah_next = ah; } splx(s); return ah; } static void apm_del_hook(struct apmhook **list, struct apmhook *ah) { int s; struct apmhook *p, *prev; s = splhigh(); prev = NULL; for (p = *list; p != NULL; prev = p, p = p->ah_next) if (p == ah) goto deleteit; panic("Tried to delete unregistered apm_hook."); goto nosuchnode; deleteit: if (prev != NULL) prev->ah_next = p->ah_next; else *list = p->ah_next; nosuchnode: splx(s); } /* APM driver calls some functions automatically */ static void apm_execute_hook(struct apmhook *list) { struct apmhook *p; for (p = list; p != NULL; p = p->ah_next) { APM_DPRINT("Execute APM hook \"%s.\"\n", p->ah_name); if ((*(p->ah_fun))(p->ah_arg)) printf("Warning: APM hook \"%s\" failed", p->ah_name); } } /* establish an apm hook */ struct apmhook * apm_hook_establish(int apmh, struct apmhook *ah) { if (apmh < 0 || apmh >= NAPM_HOOK) return NULL; return apm_add_hook(&hook[apmh], ah); } /* disestablish an apm hook */ void apm_hook_disestablish(int apmh, struct apmhook *ah) { if (apmh < 0 || apmh >= NAPM_HOOK) return; apm_del_hook(&hook[apmh], ah); } static int apm_record_event(struct apm_softc *, u_int); static void apm_processevent(void); static u_int apm_op_inprog = 0; static void apm_do_suspend(void) { struct apm_softc *sc = &apm_softc; int error; if (sc == NULL || sc->initialized == 0) return; apm_op_inprog = 0; sc->suspends = sc->suspend_countdown = 0; EVENTHANDLER_INVOKE(power_suspend); /* * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since * non-MPSAFE drivers need this. */ mtx_lock(&Giant); error = DEVICE_SUSPEND(root_bus); if (error) goto backout; apm_execute_hook(hook[APM_HOOK_SUSPEND]); if (apm_suspend_system(PMST_SUSPEND) == 0) { sc->suspending = 1; apm_processevent(); mtx_unlock(&Giant); return; } /* Failure, 'resume' the system again */ apm_execute_hook(hook[APM_HOOK_RESUME]); DEVICE_RESUME(root_bus); backout: mtx_unlock(&Giant); EVENTHANDLER_INVOKE(power_resume); } static void apm_do_standby(void) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0) return; apm_op_inprog = 0; sc->standbys = sc->standby_countdown = 0; /* * As far as standby, we don't need to execute * all of suspend hooks. */ if (apm_suspend_system(PMST_STANDBY) == 0) apm_processevent(); return; } static void apm_lastreq_notify(void) { struct apm_softc *sc = &apm_softc; sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_ALLDEV; sc->bios.r.ecx = PMST_LASTREQNOTIFY; sc->bios.r.edx = 0; apm_bioscall(); } static int apm_lastreq_rejected(void) { struct apm_softc *sc = &apm_softc; if (apm_op_inprog == 0) { return 1; /* no operation in progress */ } sc->bios.r.eax = (APM_BIOS << 8) | APM_SETPWSTATE; sc->bios.r.ebx = PMDV_ALLDEV; sc->bios.r.ecx = PMST_LASTREQREJECT; sc->bios.r.edx = 0; if (apm_bioscall()) { APM_DPRINT("apm_lastreq_rejected: failed\n"); return 1; } apm_op_inprog = 0; return 0; } /* * Public interface to the suspend/resume: * * Execute suspend and resume hook before and after sleep, respectively. * */ void apm_suspend(int state) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0) return; switch (state) { case PMST_SUSPEND: if (sc->suspends) return; sc->suspends++; sc->suspend_countdown = apm_suspend_delay; break; case PMST_STANDBY: if (sc->standbys) return; sc->standbys++; sc->standby_countdown = apm_standby_delay; break; default: printf("apm_suspend: Unknown Suspend state 0x%x\n", state); return; } apm_op_inprog++; apm_lastreq_notify(); } static void apm_resume(void) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0 || sc->suspending == 0) return; sc->suspending = 0; apm_execute_hook(hook[APM_HOOK_RESUME]); mtx_lock(&Giant); DEVICE_RESUME(root_bus); mtx_unlock(&Giant); EVENTHANDLER_INVOKE(power_resume); } /* get power status per battery */ static int apm_get_pwstatus(apm_pwstatus_t app) { struct apm_softc *sc = &apm_softc; if (app->ap_device != PMDV_ALLDEV && (app->ap_device < PMDV_BATT0 || app->ap_device > PMDV_BATT_ALL)) return 1; sc->bios.r.eax = (APM_BIOS << 8) | APM_GETPWSTATUS; sc->bios.r.ebx = app->ap_device; sc->bios.r.ecx = 0; sc->bios.r.edx = 0xffff; /* default to unknown battery time */ if (apm_bioscall()) return 1; app->ap_acline = (sc->bios.r.ebx >> 8) & 0xff; app->ap_batt_stat = sc->bios.r.ebx & 0xff; app->ap_batt_flag = (sc->bios.r.ecx >> 8) & 0xff; app->ap_batt_life = sc->bios.r.ecx & 0xff; sc->bios.r.edx &= 0xffff; if (apm_swab_batt_minutes) sc->bios.r.edx = __bswap16(sc->bios.r.edx) | 0x8000; if (sc->bios.r.edx == 0xffff) /* Time is unknown */ app->ap_batt_time = -1; else if (sc->bios.r.edx & 0x8000) /* Time is in minutes */ app->ap_batt_time = (sc->bios.r.edx & 0x7fff) * 60; else /* Time is in seconds */ app->ap_batt_time = sc->bios.r.edx; return 0; } /* get APM information */ static int apm_get_info(apm_info_t aip) { struct apm_softc *sc = &apm_softc; struct apm_pwstatus aps; bzero(&aps, sizeof(aps)); aps.ap_device = PMDV_ALLDEV; if (apm_get_pwstatus(&aps)) return 1; aip->ai_infoversion = 1; aip->ai_acline = aps.ap_acline; aip->ai_batt_stat = aps.ap_batt_stat; aip->ai_batt_life = aps.ap_batt_life; aip->ai_batt_time = aps.ap_batt_time; aip->ai_major = (u_int)sc->majorversion; aip->ai_minor = (u_int)sc->minorversion; aip->ai_status = (u_int)sc->active; sc->bios.r.eax = (APM_BIOS << 8) | APM_GETCAPABILITIES; sc->bios.r.ebx = 0; sc->bios.r.ecx = 0; sc->bios.r.edx = 0; if (apm_bioscall()) { aip->ai_batteries = 0xffffffff; /* Unknown */ aip->ai_capabilities = 0xff00; /* Unknown, with no bits set */ } else { aip->ai_batteries = sc->bios.r.ebx & 0xff; aip->ai_capabilities = sc->bios.r.ecx & 0xff; } bzero(aip->ai_spare, sizeof aip->ai_spare); return 0; } /* inform APM BIOS that CPU is idle */ void apm_cpu_idle(void) { struct apm_softc *sc = &apm_softc; if (sc->active) { sc->bios.r.eax = (APM_BIOS <<8) | APM_CPUIDLE; sc->bios.r.edx = sc->bios.r.ecx = sc->bios.r.ebx = 0; (void) apm_bioscall(); } /* * Some APM implementation halts CPU in BIOS, whenever * "CPU-idle" function are invoked, but swtch() of * FreeBSD halts CPU, therefore, CPU is halted twice * in the sched loop. It makes the interrupt latency * terribly long and be able to cause a serious problem * in interrupt processing. We prevent it by removing * "hlt" operation from swtch() and managed it under * APM driver. */ if (!sc->active || sc->always_halt_cpu) halt(); /* wait for interrupt */ } /* inform APM BIOS that CPU is busy */ void apm_cpu_busy(void) { struct apm_softc *sc = &apm_softc; /* * The APM specification says this is only necessary if your BIOS * slows down the processor in the idle task, otherwise it's not * necessary. */ if (sc->slow_idle_cpu && sc->active) { sc->bios.r.eax = (APM_BIOS <<8) | APM_CPUBUSY; sc->bios.r.edx = sc->bios.r.ecx = sc->bios.r.ebx = 0; apm_bioscall(); } } /* * APM thread loop. * * This routine wakes up from time to time to deal with delaying the * suspend of the system, or other events. */ static void apm_event_thread(void *arg) { struct apm_softc *sc = &apm_softc; sc->running = 1; while (sc->active) { if (apm_op_inprog) apm_lastreq_notify(); if (sc->standbys && sc->standby_countdown-- <= 0) apm_do_standby(); if (sc->suspends && sc->suspend_countdown-- <= 0) apm_do_suspend(); if (!sc->bios_busy) apm_processevent(); mtx_lock(&sc->mtx); cv_timedwait(&sc->cv, &sc->mtx, 10 * hz / 9); mtx_unlock(&sc->mtx); } sc->running = 0; kproc_exit(0); } /* enable APM BIOS */ static void apm_event_enable(void) { struct apm_softc *sc = &apm_softc; APM_DPRINT("called apm_event_enable()\n"); if (sc == NULL || sc->initialized == 0) return; /* Start the thread */ sc->active = 1; if (kproc_create(apm_event_thread, sc, &sc->event_thread, 0, 0, "apm worker")) panic("Cannot create apm worker thread"); return; } /* disable APM BIOS */ static void apm_event_disable(void) { struct apm_softc *sc = &apm_softc; APM_DPRINT("called apm_event_disable()\n"); if (sc == NULL || sc->initialized == 0) return; mtx_lock(&sc->mtx); sc->active = 0; while (sc->running) { cv_broadcast(&sc->cv); msleep(sc->event_thread, &sc->mtx, PWAIT, "apmdie", 0); } mtx_unlock(&sc->mtx); sc->event_thread = NULL; return; } /* halt CPU in scheduling loop */ static void apm_halt_cpu(void) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0) return; sc->always_halt_cpu = 1; return; } /* don't halt CPU in scheduling loop */ static void apm_not_halt_cpu(void) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0) return; sc->always_halt_cpu = 0; return; } /* device driver definitions */ /* * Module event */ static int apm_modevent(struct module *mod, int event, void *junk) { switch (event) { case MOD_LOAD: if (!cold) return (EPERM); break; case MOD_UNLOAD: if (!cold && power_pm_get_type() == POWER_PM_TYPE_APM) return (EBUSY); break; default: break; } return (0); } /* * Create "connection point" */ static void apm_identify(driver_t *driver, device_t parent) { device_t child; if (!cold) { printf("Don't load this driver from userland!!\n"); return; } if (resource_disabled("apm", 0)) return; child = BUS_ADD_CHILD(parent, 0, "apm", 0); if (child == NULL) panic("apm_identify"); } /* * probe for APM BIOS */ static int apm_probe(device_t dev) { #define APM_KERNBASE KERNBASE struct vm86frame vmf; struct apm_softc *sc = &apm_softc; #ifdef PC98 int rid; #endif device_set_desc(dev, "APM BIOS"); if (device_get_unit(dev) > 0) { printf("apm: Only one APM driver supported.\n"); return ENXIO; } if (power_pm_get_type() != POWER_PM_TYPE_NONE && power_pm_get_type() != POWER_PM_TYPE_APM) { printf("apm: Other PM system enabled.\n"); return ENXIO; } bzero(&vmf, sizeof(struct vm86frame)); /* safety */ bzero(&apm_softc, sizeof(apm_softc)); vmf.vmf_ah = APM_BIOS; vmf.vmf_al = APM_INSTCHECK; vmf.vmf_bx = 0; if (vm86_intcall(APM_INT, &vmf)) return ENXIO; /* APM not found */ if (vmf.vmf_bx != 0x504d) { printf("apm: incorrect signature (0x%x)\n", vmf.vmf_bx); return ENXIO; } if ((vmf.vmf_cx & (APM_32BIT_SUPPORT | APM_16BIT_SUPPORT)) == 0) { printf("apm: protected mode connections are not supported\n"); return ENXIO; } apm_version = vmf.vmf_ax; sc->slow_idle_cpu = ((vmf.vmf_cx & APM_CPUIDLE_SLOW) != 0); sc->disabled = ((vmf.vmf_cx & APM_DISABLED) != 0); sc->disengaged = ((vmf.vmf_cx & APM_DISENGAGED) != 0); vmf.vmf_ah = APM_BIOS; vmf.vmf_al = APM_DISCONNECT; vmf.vmf_bx = 0; vm86_intcall(APM_INT, &vmf); /* disconnect, just in case */ #ifdef PC98 /* PC98 have bogos APM 32bit BIOS */ if ((vmf.vmf_cx & APM_32BIT_SUPPORT) == 0) return ENXIO; rid = 0; bus_set_resource(dev, SYS_RES_IOPORT, rid, APM_NECSMM_PORT, APM_NECSMM_PORTSZ); sc->sc_res = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, APM_NECSMM_PORT, ~0, APM_NECSMM_PORTSZ, RF_ACTIVE); if (sc->sc_res == NULL) { printf("apm: cannot open NEC smm device\n"); return ENXIO; } bus_release_resource(dev, SYS_RES_IOPORT, rid, sc->sc_res); vmf.vmf_ah = APM_BIOS; vmf.vmf_al = APM_PROT32CONNECT; vmf.vmf_bx = 0; if (vm86_intcall(APM_INT, &vmf)) { printf("apm: 32-bit connection error.\n"); return (ENXIO); } sc->bios.seg.code32.base = (vmf.vmf_ax << 4) + APM_KERNBASE; sc->bios.seg.code32.limit = 0xffff; sc->bios.seg.code16.base = (vmf.vmf_cx << 4) + APM_KERNBASE; sc->bios.seg.code16.limit = 0xffff; sc->bios.seg.data.base = (vmf.vmf_dx << 4) + APM_KERNBASE; sc->bios.seg.data.limit = 0xffff; sc->bios.entry = vmf.vmf_ebx; sc->connectmode = APM_PROT32CONNECT; #else if ((vmf.vmf_cx & APM_32BIT_SUPPORT) != 0) { vmf.vmf_ah = APM_BIOS; vmf.vmf_al = APM_PROT32CONNECT; vmf.vmf_bx = 0; if (vm86_intcall(APM_INT, &vmf)) { printf("apm: 32-bit connection error.\n"); return (ENXIO); } sc->bios.seg.code32.base = (vmf.vmf_ax << 4) + APM_KERNBASE; sc->bios.seg.code32.limit = 0xffff; sc->bios.seg.code16.base = (vmf.vmf_cx << 4) + APM_KERNBASE; sc->bios.seg.code16.limit = 0xffff; sc->bios.seg.data.base = (vmf.vmf_dx << 4) + APM_KERNBASE; sc->bios.seg.data.limit = 0xffff; sc->bios.entry = vmf.vmf_ebx; sc->connectmode = APM_PROT32CONNECT; } else { /* use 16-bit connection */ vmf.vmf_ah = APM_BIOS; vmf.vmf_al = APM_PROT16CONNECT; vmf.vmf_bx = 0; if (vm86_intcall(APM_INT, &vmf)) { printf("apm: 16-bit connection error.\n"); return (ENXIO); } sc->bios.seg.code16.base = (vmf.vmf_ax << 4) + APM_KERNBASE; sc->bios.seg.code16.limit = 0xffff; sc->bios.seg.data.base = (vmf.vmf_cx << 4) + APM_KERNBASE; sc->bios.seg.data.limit = 0xffff; sc->bios.entry = vmf.vmf_bx; sc->connectmode = APM_PROT16CONNECT; } #endif return(0); } /* * return 0 if the user will notice and handle the event, * return 1 if the kernel driver should do so. */ static int apm_record_event(struct apm_softc *sc, u_int event_type) { struct apm_event_info *evp; if ((sc->sc_flags & SCFLAG_OPEN) == 0) return 1; /* no user waiting */ if (sc->event_count == APM_NEVENTS) return 1; /* overflow */ if (sc->event_filter[event_type] == 0) return 1; /* not registered */ evp = &sc->event_list[sc->event_ptr]; sc->event_count++; sc->event_ptr++; sc->event_ptr %= APM_NEVENTS; evp->type = event_type; evp->index = ++apm_evindex; selwakeuppri(&sc->sc_rsel, PZERO); return (sc->sc_flags & SCFLAG_OCTL) ? 0 : 1; /* user may handle */ } /* Power profile */ static void apm_power_profile(struct apm_softc *sc) { int state; struct apm_info info; static int apm_acline = 0; if (apm_get_info(&info)) return; if (apm_acline != info.ai_acline) { apm_acline = info.ai_acline; state = apm_acline ? POWER_PROFILE_PERFORMANCE : POWER_PROFILE_ECONOMY; power_profile_set_state(state); } } /* Process APM event */ static void apm_processevent(void) { int apm_event; struct apm_softc *sc = &apm_softc; #define OPMEV_DEBUGMESSAGE(symbol) case symbol: \ APM_DPRINT("Received APM Event: " #symbol "\n"); do { apm_event = apm_getevent(); switch (apm_event) { OPMEV_DEBUGMESSAGE(PMEV_STANDBYREQ); if (apm_op_inprog == 0) { apm_op_inprog++; if (apm_record_event(sc, apm_event)) { apm_suspend(PMST_STANDBY); } } break; OPMEV_DEBUGMESSAGE(PMEV_USERSTANDBYREQ); if (apm_op_inprog == 0) { apm_op_inprog++; if (apm_record_event(sc, apm_event)) { apm_suspend(PMST_STANDBY); } } break; OPMEV_DEBUGMESSAGE(PMEV_SUSPENDREQ); apm_lastreq_notify(); if (apm_op_inprog == 0) { apm_op_inprog++; if (apm_record_event(sc, apm_event)) { apm_do_suspend(); } } return; /* XXX skip the rest */ OPMEV_DEBUGMESSAGE(PMEV_USERSUSPENDREQ); apm_lastreq_notify(); if (apm_op_inprog == 0) { apm_op_inprog++; if (apm_record_event(sc, apm_event)) { apm_do_suspend(); } } return; /* XXX skip the rest */ OPMEV_DEBUGMESSAGE(PMEV_CRITSUSPEND); apm_do_suspend(); break; OPMEV_DEBUGMESSAGE(PMEV_NORMRESUME); apm_record_event(sc, apm_event); apm_resume(); break; OPMEV_DEBUGMESSAGE(PMEV_CRITRESUME); apm_record_event(sc, apm_event); apm_resume(); break; OPMEV_DEBUGMESSAGE(PMEV_STANDBYRESUME); apm_record_event(sc, apm_event); break; OPMEV_DEBUGMESSAGE(PMEV_BATTERYLOW); if (apm_record_event(sc, apm_event)) { apm_battery_low(); apm_suspend(PMST_SUSPEND); } break; OPMEV_DEBUGMESSAGE(PMEV_POWERSTATECHANGE); apm_record_event(sc, apm_event); apm_power_profile(sc); break; OPMEV_DEBUGMESSAGE(PMEV_UPDATETIME); apm_record_event(sc, apm_event); inittodr(0); /* adjust time to RTC */ break; OPMEV_DEBUGMESSAGE(PMEV_CAPABILITIESCHANGE); apm_record_event(sc, apm_event); apm_power_profile(sc); break; case PMEV_NOEVENT: break; default: printf("Unknown Original APM Event 0x%x\n", apm_event); break; } } while (apm_event != PMEV_NOEVENT); #ifdef PC98 apm_disable_smm(sc); #endif } /* * Attach APM: * * Initialize APM driver */ static int apm_attach(device_t dev) { struct apm_softc *sc = &apm_softc; int drv_version; #ifdef PC98 int rid; #endif mtx_init(&sc->mtx, device_get_nameunit(dev), "apm", MTX_DEF); cv_init(&sc->cv, "cbb cv"); #ifndef PC98 if (device_get_flags(dev) & 0x20) atrtcclock_disable = 1; #endif sc->initialized = 0; /* Must be externally enabled */ sc->active = 0; /* Always call HLT in idle loop */ sc->always_halt_cpu = 1; getenv_int("debug.apm_debug", &apm_debug); /* print bootstrap messages */ APM_DPRINT("apm: APM BIOS version %04lx\n", apm_version); APM_DPRINT("apm: Code16 0x%08x, Data 0x%08x\n", sc->bios.seg.code16.base, sc->bios.seg.data.base); APM_DPRINT("apm: Code entry 0x%08x, Idling CPU %s, Management %s\n", sc->bios.entry, is_enabled(sc->slow_idle_cpu), is_enabled(!sc->disabled)); APM_DPRINT("apm: CS_limit=0x%x, DS_limit=0x%x\n", sc->bios.seg.code16.limit, sc->bios.seg.data.limit); #ifdef PC98 rid = 0; sc->sc_res = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, APM_NECSMM_PORT, ~0, APM_NECSMM_PORTSZ, RF_ACTIVE); if (sc->sc_res == NULL) panic("%s: counldn't map I/O ports", device_get_name(dev)); sc->sc_iot = rman_get_bustag(sc->sc_res); sc->sc_ioh = rman_get_bushandle(sc->sc_res); if (apm_version==0x112 || apm_version==0x111 || apm_version==0x110) apm_necsmm_addr = APM_NECSMM_PORT; else apm_necsmm_addr = 0; apm_necsmm_mask = ~APM_NECSMM_EN; #endif /* PC98 */ /* * In one test, apm bios version was 1.02; an attempt to register * a 1.04 driver resulted in a 1.00 connection! Registering a * 1.02 driver resulted in a 1.02 connection. */ drv_version = apm_version > 0x102 ? 0x102 : apm_version; for (; drv_version > 0x100; drv_version--) if (apm_driver_version(drv_version) == 0) break; sc->minorversion = ((drv_version & 0x00f0) >> 4) * 10 + ((drv_version & 0x000f) >> 0); sc->majorversion = ((drv_version & 0xf000) >> 12) * 10 + ((apm_version & 0x0f00) >> 8); sc->intversion = INTVERSION(sc->majorversion, sc->minorversion); if (sc->intversion >= INTVERSION(1, 1)) APM_DPRINT("apm: Engaged control %s\n", is_enabled(!sc->disengaged)); device_printf(dev, "found APM BIOS v%ld.%ld, connected at v%d.%d\n", ((apm_version & 0xf000) >> 12) * 10 + ((apm_version & 0x0f00) >> 8), ((apm_version & 0x00f0) >> 4) * 10 + ((apm_version & 0x000f) >> 0), sc->majorversion, sc->minorversion); APM_DPRINT("apm: Slow Idling CPU %s\n", is_enabled(sc->slow_idle_cpu)); /* enable power management */ if (sc->disabled) { if (apm_enable_disable_pm(1)) { APM_DPRINT("apm: *Warning* enable function failed! [%x]\n", (sc->bios.r.eax >> 8) & 0xff); } } /* engage power managment (APM 1.1 or later) */ if (sc->intversion >= INTVERSION(1, 1) && sc->disengaged) { if (apm_engage_disengage_pm(1)) { APM_DPRINT("apm: *Warning* engage function failed err=[%x]", (sc->bios.r.eax >> 8) & 0xff); APM_DPRINT(" (Docked or using external power?).\n"); } } /* Power the system off using APM */ EVENTHANDLER_REGISTER(shutdown_final, apm_power_off, NULL, SHUTDOWN_PRI_LAST); /* Register APM again to pass the correct argument of pm_func. */ power_pm_register(POWER_PM_TYPE_APM, apm_pm_func, sc); sc->initialized = 1; sc->suspending = 0; sc->running = 0; make_dev(&apm_cdevsw, APMDEV_NORMAL, UID_ROOT, GID_OPERATOR, 0664, "apm"); make_dev(&apm_cdevsw, APMDEV_CTL, UID_ROOT, GID_OPERATOR, 0660, "apmctl"); return 0; } static int apmopen(struct cdev *dev, int flag, int fmt, struct thread *td) { struct apm_softc *sc = &apm_softc; if (sc == NULL || sc->initialized == 0) return (ENXIO); switch (dev2unit(dev)) { case APMDEV_CTL: if (!(flag & FWRITE)) return EINVAL; if (sc->sc_flags & SCFLAG_OCTL) return EBUSY; sc->sc_flags |= SCFLAG_OCTL; bzero(sc->event_filter, sizeof sc->event_filter); break; case APMDEV_NORMAL: sc->sc_flags |= SCFLAG_ONORMAL; break; } return 0; } static int apmclose(struct cdev *dev, int flag, int fmt, struct thread *td) { struct apm_softc *sc = &apm_softc; switch (dev2unit(dev)) { case APMDEV_CTL: apm_lastreq_rejected(); sc->sc_flags &= ~SCFLAG_OCTL; bzero(sc->event_filter, sizeof sc->event_filter); break; case APMDEV_NORMAL: sc->sc_flags &= ~SCFLAG_ONORMAL; break; } if ((sc->sc_flags & SCFLAG_OPEN) == 0) { sc->event_count = 0; sc->event_ptr = 0; } return 0; } static int apmioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct apm_softc *sc = &apm_softc; struct apm_bios_arg *args; int error = 0; int ret; int newstate; if (sc == NULL || sc->initialized == 0) return (ENXIO); APM_DPRINT("APM ioctl: cmd = 0x%lx\n", cmd); switch (cmd) { case APMIO_SUSPEND: if (!(flag & FWRITE)) return (EPERM); if (sc->active) apm_suspend(PMST_SUSPEND); else error = EINVAL; break; case APMIO_STANDBY: if (!(flag & FWRITE)) return (EPERM); if (sc->active) apm_suspend(PMST_STANDBY); else error = EINVAL; break; case APMIO_GETINFO_OLD: { struct apm_info info; apm_info_old_t aiop; if (apm_get_info(&info)) error = ENXIO; aiop = (apm_info_old_t)addr; aiop->ai_major = info.ai_major; aiop->ai_minor = info.ai_minor; aiop->ai_acline = info.ai_acline; aiop->ai_batt_stat = info.ai_batt_stat; aiop->ai_batt_life = info.ai_batt_life; aiop->ai_status = info.ai_status; } break; case APMIO_GETINFO: if (apm_get_info((apm_info_t)addr)) error = ENXIO; break; case APMIO_GETPWSTATUS: if (apm_get_pwstatus((apm_pwstatus_t)addr)) error = ENXIO; break; case APMIO_ENABLE: if (!(flag & FWRITE)) return (EPERM); apm_event_enable(); break; case APMIO_DISABLE: if (!(flag & FWRITE)) return (EPERM); apm_event_disable(); break; case APMIO_HALTCPU: if (!(flag & FWRITE)) return (EPERM); apm_halt_cpu(); break; case APMIO_NOTHALTCPU: if (!(flag & FWRITE)) return (EPERM); apm_not_halt_cpu(); break; case APMIO_DISPLAY: if (!(flag & FWRITE)) return (EPERM); newstate = *(int *)addr; if (apm_display(newstate)) error = ENXIO; break; case APMIO_BIOS: if (!(flag & FWRITE)) return (EPERM); /* XXX compatibility with the old interface */ args = (struct apm_bios_arg *)addr; #ifdef PC98 if (((args->eax >> 8) & 0xff) == 0x53) { sc->bios.r.eax = args->eax & ~0xffff; sc->bios.r.eax |= APM_BIOS << 8; switch (args->eax & 0xff) { case 0x0a: sc->bios.r.eax |= APM_GETPWSTATUS; break; case 0x0e: sc->bios.r.eax |= APM_DRVVERSION; break; default: sc->bios.r.eax |= args->eax & 0xff; break; } } else #endif sc->bios.r.eax = args->eax; sc->bios.r.ebx = args->ebx; sc->bios.r.ecx = args->ecx; sc->bios.r.edx = args->edx; sc->bios.r.esi = args->esi; sc->bios.r.edi = args->edi; if ((ret = apm_bioscall())) { /* * Return code 1 means bios call was unsuccessful. * Error code is stored in %ah. * Return code -1 means bios call was unsupported * in the APM BIOS version. */ if (ret == -1) { error = EINVAL; } } else { /* * Return code 0 means bios call was successful. * We need only %al and can discard %ah. */ sc->bios.r.eax &= 0xff; } args->eax = sc->bios.r.eax; args->ebx = sc->bios.r.ebx; args->ecx = sc->bios.r.ecx; args->edx = sc->bios.r.edx; args->esi = sc->bios.r.esi; args->edi = sc->bios.r.edi; break; default: error = EINVAL; break; } /* for /dev/apmctl */ if (dev2unit(dev) == APMDEV_CTL) { struct apm_event_info *evp; int i; error = 0; switch (cmd) { case APMIO_NEXTEVENT: if (!sc->event_count) { error = EAGAIN; } else { evp = (struct apm_event_info *)addr; i = sc->event_ptr + APM_NEVENTS - sc->event_count; i %= APM_NEVENTS; *evp = sc->event_list[i]; sc->event_count--; } break; case APMIO_REJECTLASTREQ: if (apm_lastreq_rejected()) { error = EINVAL; } break; default: error = EINVAL; break; } } return error; } static int apmwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct apm_softc *sc = &apm_softc; u_int event_type; int error; u_char enabled; if (dev2unit(dev) != APMDEV_CTL) return(ENODEV); if (uio->uio_resid != sizeof(u_int)) return(E2BIG); if ((error = uiomove((caddr_t)&event_type, sizeof(u_int), uio))) return(error); if (event_type < 0 || event_type >= APM_NPMEV) return(EINVAL); if (sc->event_filter[event_type] == 0) { enabled = 1; } else { enabled = 0; } sc->event_filter[event_type] = enabled; APM_DPRINT("apmwrite: event 0x%x %s\n", event_type, is_enabled(enabled)); return uio->uio_resid; } static int apmpoll(struct cdev *dev, int events, struct thread *td) { struct apm_softc *sc = &apm_softc; int revents = 0; if (events & (POLLIN | POLLRDNORM)) { if (sc->event_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &sc->sc_rsel); } } return (revents); } static device_method_t apm_methods[] = { /* Device interface */ DEVMETHOD(device_identify, apm_identify), DEVMETHOD(device_probe, apm_probe), DEVMETHOD(device_attach, apm_attach), { 0, 0 } }; static driver_t apm_driver = { "apm", apm_methods, 1, /* no softc (XXX) */ }; static devclass_t apm_devclass; DRIVER_MODULE(apm, legacy, apm_driver, apm_devclass, apm_modevent, 0); MODULE_VERSION(apm, 1); static int apm_pm_func(u_long cmd, void *arg, ...) { int state, apm_state; int error; va_list ap; error = 0; switch (cmd) { case POWER_CMD_SUSPEND: va_start(ap, arg); state = va_arg(ap, int); va_end(ap); switch (state) { case POWER_SLEEP_STATE_STANDBY: apm_state = PMST_STANDBY; break; case POWER_SLEEP_STATE_SUSPEND: case POWER_SLEEP_STATE_HIBERNATE: apm_state = PMST_SUSPEND; break; default: error = EINVAL; goto out; } apm_suspend(apm_state); break; default: error = EINVAL; goto out; } out: return (error); } static void apm_pm_register(void *arg) { if (!resource_disabled("apm", 0)) power_pm_register(POWER_PM_TYPE_APM, apm_pm_func, NULL); } -SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, apm_pm_register, 0); +SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, apm_pm_register, NULL); Index: stable/11/sys/kern/imgact_binmisc.c =================================================================== --- stable/11/sys/kern/imgact_binmisc.c (revision 359651) +++ stable/11/sys/kern/imgact_binmisc.c (revision 359652) @@ -1,757 +1,759 @@ /*- * Copyright (c) 2013-16, Stacey D. Son * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** * Miscellaneous binary interpreter image activator. * * If the given target executable's header matches 'xbe_magic' field in the * 'interpreter_list' then it will use the user-level interpreter specified in * the 'xbe_interpreter' field to execute the binary. The 'xbe_magic' field may * be adjusted to a given offset using the value in the 'xbe_moffset' field * and bits of the header may be masked using the 'xbe_mask' field. The * 'interpreter_list' entries are managed using sysctl(3) as described in the * file. */ /* * Node of the interpreter list. */ typedef struct imgact_binmisc_entry { char *ibe_name; uint8_t *ibe_magic; uint32_t ibe_moffset; uint32_t ibe_msize; uint8_t *ibe_mask; uint8_t *ibe_interpreter; uint32_t ibe_interp_argcnt; uint32_t ibe_interp_length; uint32_t ibe_flags; SLIST_ENTRY(imgact_binmisc_entry) link; } imgact_binmisc_entry_t; /* * sysctl() commands. */ #define IBC_ADD 1 /* Add given entry. */ #define IBC_REMOVE 2 /* Remove entry for a given name. */ #define IBC_DISABLE 3 /* Disable entry for a given name. */ #define IBC_ENABLE 4 /* Enable entry for a given name. */ #define IBC_LOOKUP 5 /* Lookup and return entry for given name. */ #define IBC_LIST 6 /* Get a snapshot of the interpretor list. */ /* * Interpreter string macros. * * They all start with '#' followed by a single letter: */ #define ISM_POUND '#' /* "##" is the escape sequence for single #. */ #define ISM_OLD_ARGV0 'a' /* "#a" is replaced with the old argv0. */ MALLOC_DEFINE(M_BINMISC, KMOD_NAME, "misc binary image activator"); /* The interpreter list. */ static SLIST_HEAD(, imgact_binmisc_entry) interpreter_list = SLIST_HEAD_INITIALIZER(interpreter_list); static int interp_list_entry_count = 0; static struct sx interp_list_sx; /* * Populate the entry with the information about the interpreter. */ static void imgact_binmisc_populate_interp(char *str, imgact_binmisc_entry_t *ibe) { uint32_t len = 0, argc = 1; char t[IBE_INTERP_LEN_MAX]; char *sp, *tp; memset(t, 0, sizeof(t)); /* * Normalize interpreter string. Replace white space between args with * single space. */ sp = str; tp = t; while (*sp != '\0') { if (*sp == ' ' || *sp == '\t') { if (++len >= IBE_INTERP_LEN_MAX) break; *tp++ = ' '; argc++; while (*sp == ' ' || *sp == '\t') sp++; continue; } else { *tp++ = *sp++; len++; } } *tp = '\0'; len++; ibe->ibe_interpreter = malloc(len, M_BINMISC, M_WAITOK|M_ZERO); /* Populate all the ibe fields for the interpreter. */ memcpy(ibe->ibe_interpreter, t, len); ibe->ibe_interp_argcnt = argc; ibe->ibe_interp_length = len; } /* * Allocate memory and populate a new entry for the interpreter table. */ static imgact_binmisc_entry_t * imgact_binmisc_new_entry(ximgact_binmisc_entry_t *xbe) { imgact_binmisc_entry_t *ibe = NULL; size_t namesz = min(strlen(xbe->xbe_name) + 1, IBE_NAME_MAX); ibe = malloc(sizeof(*ibe), M_BINMISC, M_WAITOK|M_ZERO); ibe->ibe_name = malloc(namesz, M_BINMISC, M_WAITOK|M_ZERO); strlcpy(ibe->ibe_name, xbe->xbe_name, namesz); imgact_binmisc_populate_interp(xbe->xbe_interpreter, ibe); ibe->ibe_magic = malloc(xbe->xbe_msize, M_BINMISC, M_WAITOK|M_ZERO); memcpy(ibe->ibe_magic, xbe->xbe_magic, xbe->xbe_msize); ibe->ibe_mask = malloc(xbe->xbe_msize, M_BINMISC, M_WAITOK|M_ZERO); memcpy(ibe->ibe_mask, xbe->xbe_mask, xbe->xbe_msize); ibe->ibe_moffset = xbe->xbe_moffset; ibe->ibe_msize = xbe->xbe_msize; ibe->ibe_flags = xbe->xbe_flags; return (ibe); } /* * Free the allocated memory for a given list item. */ static void imgact_binmisc_destroy_entry(imgact_binmisc_entry_t *ibe) { if (!ibe) return; if (ibe->ibe_magic) free(ibe->ibe_magic, M_BINMISC); if (ibe->ibe_mask) free(ibe->ibe_mask, M_BINMISC); if (ibe->ibe_interpreter) free(ibe->ibe_interpreter, M_BINMISC); if (ibe->ibe_name) free(ibe->ibe_name, M_BINMISC); if (ibe) free(ibe, M_BINMISC); } /* * Find the interpreter in the list by the given name. Return NULL if not * found. */ static imgact_binmisc_entry_t * imgact_binmisc_find_entry(char *name) { imgact_binmisc_entry_t *ibe; sx_assert(&interp_list_sx, SA_LOCKED); SLIST_FOREACH(ibe, &interpreter_list, link) { if (strncmp(name, ibe->ibe_name, IBE_NAME_MAX) == 0) return (ibe); } return (NULL); } /* * Add the given interpreter if it doesn't already exist. Return EEXIST * if the name already exist in the interpreter list. */ static int imgact_binmisc_add_entry(ximgact_binmisc_entry_t *xbe) { imgact_binmisc_entry_t *ibe; char *p; int cnt; if (xbe->xbe_msize > IBE_MAGIC_MAX) return (EINVAL); for(cnt = 0, p = xbe->xbe_name; *p != 0; cnt++, p++) if (cnt >= IBE_NAME_MAX || !isascii((int)*p)) return (EINVAL); for(cnt = 0, p = xbe->xbe_interpreter; *p != 0; cnt++, p++) if (cnt >= IBE_INTERP_LEN_MAX || !isascii((int)*p)) return (EINVAL); /* Make sure we don't have any invalid #'s. */ p = xbe->xbe_interpreter; while (1) { p = strchr(p, '#'); if (!p) break; p++; switch(*p) { case ISM_POUND: /* "##" */ p++; break; case ISM_OLD_ARGV0: /* "#a" */ p++; break; case 0: default: /* Anything besides the above is invalid. */ return (EINVAL); } } sx_xlock(&interp_list_sx); if (imgact_binmisc_find_entry(xbe->xbe_name) != NULL) { sx_xunlock(&interp_list_sx); return (EEXIST); } /* Preallocate a new entry. */ ibe = imgact_binmisc_new_entry(xbe); SLIST_INSERT_HEAD(&interpreter_list, ibe, link); interp_list_entry_count++; sx_xunlock(&interp_list_sx); return (0); } /* * Remove the interpreter in the list with the given name. Return ENOENT * if not found. */ static int imgact_binmisc_remove_entry(char *name) { imgact_binmisc_entry_t *ibe; sx_xlock(&interp_list_sx); if ((ibe = imgact_binmisc_find_entry(name)) == NULL) { sx_xunlock(&interp_list_sx); return (ENOENT); } SLIST_REMOVE(&interpreter_list, ibe, imgact_binmisc_entry, link); interp_list_entry_count--; sx_xunlock(&interp_list_sx); imgact_binmisc_destroy_entry(ibe); return (0); } /* * Disable the interpreter in the list with the given name. Return ENOENT * if not found. */ static int imgact_binmisc_disable_entry(char *name) { imgact_binmisc_entry_t *ibe; sx_xlock(&interp_list_sx); if ((ibe = imgact_binmisc_find_entry(name)) == NULL) { sx_xunlock(&interp_list_sx); return (ENOENT); } ibe->ibe_flags &= ~IBF_ENABLED; sx_xunlock(&interp_list_sx); return (0); } /* * Enable the interpreter in the list with the given name. Return ENOENT * if not found. */ static int imgact_binmisc_enable_entry(char *name) { imgact_binmisc_entry_t *ibe; sx_xlock(&interp_list_sx); if ((ibe = imgact_binmisc_find_entry(name)) == NULL) { sx_xunlock(&interp_list_sx); return (ENOENT); } ibe->ibe_flags |= IBF_ENABLED; sx_xunlock(&interp_list_sx); return (0); } static int imgact_binmisc_populate_xbe(ximgact_binmisc_entry_t *xbe, imgact_binmisc_entry_t *ibe) { uint32_t i; sx_assert(&interp_list_sx, SA_LOCKED); memset(xbe, 0, sizeof(*xbe)); strlcpy(xbe->xbe_name, ibe->ibe_name, IBE_NAME_MAX); /* Copy interpreter string. Replace NULL breaks with space. */ memcpy(xbe->xbe_interpreter, ibe->ibe_interpreter, ibe->ibe_interp_length); for(i = 0; i < (ibe->ibe_interp_length - 1); i++) if (xbe->xbe_interpreter[i] == '\0') xbe->xbe_interpreter[i] = ' '; memcpy(xbe->xbe_magic, ibe->ibe_magic, ibe->ibe_msize); memcpy(xbe->xbe_mask, ibe->ibe_mask, ibe->ibe_msize); xbe->xbe_version = IBE_VERSION; xbe->xbe_flags = ibe->ibe_flags; xbe->xbe_moffset = ibe->ibe_moffset; xbe->xbe_msize = ibe->ibe_msize; return (0); } /* * Retrieve the interpreter with the give name and populate the * ximgact_binmisc_entry structure. Return ENOENT if not found. */ static int imgact_binmisc_lookup_entry(char *name, ximgact_binmisc_entry_t *xbe) { imgact_binmisc_entry_t *ibe; int error = 0; sx_slock(&interp_list_sx); if ((ibe = imgact_binmisc_find_entry(name)) == NULL) { sx_sunlock(&interp_list_sx); return (ENOENT); } error = imgact_binmisc_populate_xbe(xbe, ibe); sx_sunlock(&interp_list_sx); return (error); } /* * Get a snapshot of all the interpreter entries in the list. */ static int imgact_binmisc_get_all_entries(struct sysctl_req *req) { ximgact_binmisc_entry_t *xbe, *xbep; imgact_binmisc_entry_t *ibe; int error = 0, count; sx_slock(&interp_list_sx); count = interp_list_entry_count; xbe = malloc(sizeof(*xbe) * count, M_BINMISC, M_WAITOK|M_ZERO); xbep = xbe; SLIST_FOREACH(ibe, &interpreter_list, link) { error = imgact_binmisc_populate_xbe(xbep++, ibe); if (error) break; } sx_sunlock(&interp_list_sx); if (!error) error = SYSCTL_OUT(req, xbe, sizeof(*xbe) * count); free(xbe, M_BINMISC); return (error); } /* * sysctl() handler for munipulating interpretor table. * Not MP safe (locked by sysctl). */ static int sysctl_kern_binmisc(SYSCTL_HANDLER_ARGS) { ximgact_binmisc_entry_t xbe; int error = 0; switch(arg2) { case IBC_ADD: /* Add an entry. Limited to IBE_MAX_ENTRIES. */ error = SYSCTL_IN(req, &xbe, sizeof(xbe)); if (error) return (error); if (IBE_VERSION != xbe.xbe_version) return (EINVAL); if (interp_list_entry_count == IBE_MAX_ENTRIES) return (ENOSPC); error = imgact_binmisc_add_entry(&xbe); break; case IBC_REMOVE: /* Remove an entry. */ error = SYSCTL_IN(req, &xbe, sizeof(xbe)); if (error) return (error); if (IBE_VERSION != xbe.xbe_version) return (EINVAL); error = imgact_binmisc_remove_entry(xbe.xbe_name); break; case IBC_DISABLE: /* Disable an entry. */ error = SYSCTL_IN(req, &xbe, sizeof(xbe)); if (error) return (error); if (IBE_VERSION != xbe.xbe_version) return (EINVAL); error = imgact_binmisc_disable_entry(xbe.xbe_name); break; case IBC_ENABLE: /* Enable an entry. */ error = SYSCTL_IN(req, &xbe, sizeof(xbe)); if (error) return (error); if (IBE_VERSION != xbe.xbe_version) return (EINVAL); error = imgact_binmisc_enable_entry(xbe.xbe_name); break; case IBC_LOOKUP: /* Lookup an entry. */ error = SYSCTL_IN(req, &xbe, sizeof(xbe)); if (error) return (error); if (IBE_VERSION != xbe.xbe_version) return (EINVAL); error = imgact_binmisc_lookup_entry(xbe.xbe_name, &xbe); if (!error) error = SYSCTL_OUT(req, &xbe, sizeof(xbe)); break; case IBC_LIST: /* Return a snapshot of the interpretor list. */ if (!req->oldptr) { /* No pointer then just return the list size. */ error = SYSCTL_OUT(req, 0, interp_list_entry_count * sizeof(ximgact_binmisc_entry_t)); return (error); } else if (!req->oldlen) return (EINVAL); error = imgact_binmisc_get_all_entries(req); break; default: return (EINVAL); } return (error); } SYSCTL_NODE(_kern, OID_AUTO, binmisc, CTLFLAG_RW, 0, "Image activator for miscellaneous binaries"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, add, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_WR, NULL, IBC_ADD, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Add an activator entry"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, remove, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_WR, NULL, IBC_REMOVE, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Remove an activator entry"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, disable, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_WR, NULL, IBC_DISABLE, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Disable an activator entry"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, enable, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_WR, NULL, IBC_ENABLE, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Enable an activator entry"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, lookup, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_RW|CTLFLAG_ANYBODY, NULL, IBC_LOOKUP, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Lookup an activator entry"); SYSCTL_PROC(_kern_binmisc, OID_AUTO, list, CTLFLAG_MPSAFE|CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_ANYBODY, NULL, IBC_LIST, sysctl_kern_binmisc, "S,ximgact_binmisc_entry", "Get snapshot of all the activator entries"); static imgact_binmisc_entry_t * imgact_binmisc_find_interpreter(const char *image_header) { imgact_binmisc_entry_t *ibe; const char *p; int i; size_t sz; sx_assert(&interp_list_sx, SA_LOCKED); SLIST_FOREACH(ibe, &interpreter_list, link) { if (!(IBF_ENABLED & ibe->ibe_flags)) continue; p = image_header + ibe->ibe_moffset; sz = ibe->ibe_msize; if (IBF_USE_MASK & ibe->ibe_flags) { /* Compare using mask. */ for (i = 0; i < sz; i++) if ((*p++ ^ ibe->ibe_magic[i]) & ibe->ibe_mask[i]) break; } else { for (i = 0; i < sz; i++) if (*p++ ^ ibe->ibe_magic[i]) break; } if (i == ibe->ibe_msize) return (ibe); } return (NULL); } static int imgact_binmisc_exec(struct image_params *imgp) { const char *image_header = imgp->image_header; const char *fname = NULL; int error = 0; size_t offset, l; imgact_binmisc_entry_t *ibe; struct sbuf *sname; char *s, *d; /* Do we have an interpreter for the given image header? */ sx_slock(&interp_list_sx); if ((ibe = imgact_binmisc_find_interpreter(image_header)) == NULL) { sx_sunlock(&interp_list_sx); return (-1); } /* No interpreter nesting allowed. */ if (imgp->interpreted & IMGACT_BINMISC) { sx_sunlock(&interp_list_sx); return (ENOEXEC); } imgp->interpreted |= IMGACT_BINMISC; if (imgp->args->fname != NULL) { fname = imgp->args->fname; sname = NULL; } else { /* Use the fdescfs(5) path for fexecve(2). */ sname = sbuf_new_auto(); sbuf_printf(sname, "/dev/fd/%d", imgp->args->fd); sbuf_finish(sname); fname = sbuf_data(sname); } /* * We need to "push" the interpreter in the arg[] list. To do this, * we first shift all the other values in the `begin_argv' area to * provide the exact amount of room for the values added. Set up * `offset' as the number of bytes to be added to the `begin_argv' * area. */ offset = ibe->ibe_interp_length; /* Adjust the offset for #'s. */ s = ibe->ibe_interpreter; while (1) { s = strchr(s, '#'); if (!s) break; s++; switch(*s) { case ISM_POUND: /* "##" -> "#": reduce offset by one. */ offset--; break; case ISM_OLD_ARGV0: /* "#a" -> (old argv0): increase offset to fit fname */ offset += strlen(fname) - 2; break; default: /* Hmm... This shouldn't happen. */ sx_sunlock(&interp_list_sx); printf("%s: Unknown macro #%c sequence in " "interpreter string\n", KMOD_NAME, *(s + 1)); error = EINVAL; goto done; } s++; } /* Check to make sure we won't overrun the stringspace. */ if (offset > imgp->args->stringspace) { sx_sunlock(&interp_list_sx); error = E2BIG; goto done; } /* Make room for the interpreter */ bcopy(imgp->args->begin_argv, imgp->args->begin_argv + offset, imgp->args->endp - imgp->args->begin_argv); /* Adjust everything by the offset. */ imgp->args->begin_envv += offset; imgp->args->endp += offset; imgp->args->stringspace -= offset; /* Add the new argument(s) in the count. */ imgp->args->argc += ibe->ibe_interp_argcnt; /* * The original arg[] list has been shifted appropriately. Copy in * the interpreter path. */ s = ibe->ibe_interpreter; d = imgp->args->begin_argv; while(*s != '\0') { switch (*s) { case '#': /* Handle "#" in interpreter string. */ s++; switch(*s) { case ISM_POUND: /* "##": Replace with a single '#' */ *d++ = '#'; break; case ISM_OLD_ARGV0: /* "#a": Replace with old arg0 (fname). */ if ((l = strlen(fname)) != 0) { memcpy(d, fname, l); d += l; } break; default: /* Shouldn't happen but skip it if it does. */ break; } break; case ' ': /* Replace space with NUL to separate arguments. */ *d++ = '\0'; break; default: *d++ = *s; break; } s++; } *d = '\0'; sx_sunlock(&interp_list_sx); if (!error) imgp->interpreter_name = imgp->args->begin_argv; done: if (sname) sbuf_delete(sname); return (error); } static void imgact_binmisc_init(void *arg) { sx_init(&interp_list_sx, KMOD_NAME); } static void imgact_binmisc_fini(void *arg) { imgact_binmisc_entry_t *ibe, *ibe_tmp; /* Free all the interpreters. */ sx_xlock(&interp_list_sx); SLIST_FOREACH_SAFE(ibe, &interpreter_list, link, ibe_tmp) { SLIST_REMOVE(&interpreter_list, ibe, imgact_binmisc_entry, link); imgact_binmisc_destroy_entry(ibe); } sx_xunlock(&interp_list_sx); sx_destroy(&interp_list_sx); } -SYSINIT(imgact_binmisc, SI_SUB_EXEC, SI_ORDER_MIDDLE, imgact_binmisc_init, 0); -SYSUNINIT(imgact_binmisc, SI_SUB_EXEC, SI_ORDER_MIDDLE, imgact_binmisc_fini, 0); +SYSINIT(imgact_binmisc, SI_SUB_EXEC, SI_ORDER_MIDDLE, imgact_binmisc_init, + NULL); +SYSUNINIT(imgact_binmisc, SI_SUB_EXEC, SI_ORDER_MIDDLE, imgact_binmisc_fini, + NULL); /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw imgact_binmisc_execsw = { imgact_binmisc_exec, KMOD_NAME }; EXEC_SET(imgact_binmisc, imgact_binmisc_execsw); Index: stable/11/sys/kern/kern_linker.c =================================================================== --- stable/11/sys/kern/kern_linker.c (revision 359651) +++ stable/11/sys/kern/kern_linker.c (revision 359652) @@ -1,2202 +1,2202 @@ /*- * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. */ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } -SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); +SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, NULL); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf, bool enable) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) { if (enable) sysctl_register_oid(*oidp); else sysctl_register_disabled_oid(*oidp); } sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_enable_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_enable_sysctls: enable SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_enable_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "modmetadata_set", &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, - 0); + NULL); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf, false); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } linker_file_enable_sysctls(lf); EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type has not been recognized by the last try * printout a message before to fail. */ if (error == ENOSYS) printf("linker_load_file: Unsupported file type\n"); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. */ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. */ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_unregister_sysctls(file); linker_file_sysuninit(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. */ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. */ #ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environemnts. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { int error; sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * It is possible that kldloaded module will attach a new ifnet, * so vnet context must be set when this ocurs. */ CURVNET_SET(TD_TO_VNET(td)); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. */ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } sx_xlock(&kld_sx); error = linker_load_module(kldname, modname, NULL, NULL, &lf); if (error) { sx_xunlock(&kld_sx); goto done; } lf->userrefs++; if (fileid != NULL) *fileid = lf->id; sx_xunlock(&kld_sx); done: CURVNET_RESTORE(); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); CURVNET_SET(TD_TO_VNET(td)); sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; sx_xunlock(&kld_sx); CURVNET_RESTORE(); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat *stat; int error, version; /* * Check the version of the user's structure. */ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO); error = kern_kldstat(td, uap->fileid, stat); if (error == 0) error = copyout(stat, uap->stat, version); free(stat, M_TEMP); return (error); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > MAXPATHLEN) namelen = MAXPATHLEN; bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > MAXPATHLEN) namelen = MAXPATHLEN; bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } #ifdef DDB DB_COMMAND(kldstat, db_kldstat) { linker_file_t lf; #define POINTER_WIDTH ((int)(sizeof(void *) * 2 + 2)) db_printf("Id Refs Address%*c Size Name\n", POINTER_WIDTH - 7, ' '); #undef POINTER_WIDTH TAILQ_FOREACH(lf, &linker_files, link) { if (db_pager_quit) return; db_printf("%2d %4d %p %-8zx %s\n", lf->id, lf->refs, lf->address, lf->size, lf->filename); } } #endif /* DDB */ int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. */ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } lf->userrefs++; /* so we can (try to) kldunload it */ error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf, true); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } -SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); +SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, NULL); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) the we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static char *linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ char *result, **cpp, *sep; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + roundup2((ptr) - (base), sizeof(int)) /* * Lookup KLD which contains requested module in the "linker.hints" file. If * version specification is available, then try to find the best KLD. * Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally check if KLD is in the place */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * KLD is newer than hints file. What we should do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing found or hints is absent - fallback to the old * way by using "kldname[.ko]" as module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Lookup KLD which contains requested module in the all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. */ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?")); /* The last entry of the malloced area comprises of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* * Find a file which contains given module and load it, if "parent" is not * NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); /* check if root file system is not mounted */ if (rootvnode == NULL || curproc->p_fd->fd_rdir == NULL) return (ENXIO); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); /* check if root file system is not mounted */ if (rootvnode == NULL || curproc->p_fd->fd_rdir == NULL) return (ENXIO); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else /* * Need to find a KLD with required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland initiated * kldload(2)'s of files. */ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0; /* * All files are dependent on /kernel. */ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); Index: stable/11/sys/kern/kern_module.c =================================================================== --- stable/11/sys/kern/kern_module.c (revision 359651) +++ stable/11/sys/kern/kern_module.c (revision 359652) @@ -1,519 +1,519 @@ /*- * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); struct module { TAILQ_ENTRY(module) link; /* chain together all modules */ TAILQ_ENTRY(module) flink; /* all modules in a file */ struct linker_file *file; /* file which contains this module */ int refs; /* reference count */ int id; /* unique id number */ char *name; /* module name */ modeventhand_t handler; /* event handler */ void *arg; /* argument for handler */ modspecific_t data; /* module specific data */ }; #define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) static TAILQ_HEAD(modulelist, module) modules; struct sx modules_sx; static int nextid = 1; static void module_shutdown(void *, int); static int modevent_nop(module_t mod, int what, void *arg) { switch(what) { case MOD_LOAD: return (0); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } static void module_init(void *arg) { sx_init(&modules_sx, "module subsystem sx lock"); TAILQ_INIT(&modules); EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL, SHUTDOWN_PRI_DEFAULT); } -SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0); +SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, NULL); static void module_shutdown(void *arg1, int arg2) { module_t mod; if (arg2 & RB_NOSYNC) return; mtx_lock(&Giant); MOD_SLOCK; TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link) MOD_EVENT(mod, MOD_SHUTDOWN); MOD_SUNLOCK; mtx_unlock(&Giant); } void module_register_init(const void *arg) { const moduledata_t *data = (const moduledata_t *)arg; int error; module_t mod; mtx_lock(&Giant); MOD_SLOCK; mod = module_lookupbyname(data->name); if (mod == NULL) panic("module_register_init: module named %s not found\n", data->name); MOD_SUNLOCK; error = MOD_EVENT(mod, MOD_LOAD); if (error) { MOD_EVENT(mod, MOD_UNLOAD); MOD_XLOCK; module_release(mod); MOD_XUNLOCK; printf("module_register_init: MOD_LOAD (%s, %p, %p) error" " %d\n", data->name, (void *)data->evhand, data->priv, error); } else { MOD_XLOCK; if (mod->file) { /* * Once a module is successfully loaded, move * it to the head of the module list for this * linker file. This resorts the list so that * when the kernel linker iterates over the * modules to unload them, it will unload them * in the reverse order they were loaded. */ TAILQ_REMOVE(&mod->file->modules, mod, flink); TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink); } MOD_XUNLOCK; } mtx_unlock(&Giant); } int module_register(const moduledata_t *data, linker_file_t container) { size_t namelen; module_t newmod; MOD_XLOCK; newmod = module_lookupbyname(data->name); if (newmod != NULL) { MOD_XUNLOCK; printf("%s: cannot register %s from %s; already loaded from %s\n", __func__, data->name, container->filename, newmod->file->filename); return (EEXIST); } namelen = strlen(data->name) + 1; newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); newmod->refs = 1; newmod->id = nextid++; newmod->name = (char *)(newmod + 1); strcpy(newmod->name, data->name); newmod->handler = data->evhand ? data->evhand : modevent_nop; newmod->arg = data->priv; bzero(&newmod->data, sizeof(newmod->data)); TAILQ_INSERT_TAIL(&modules, newmod, link); if (container) TAILQ_INSERT_TAIL(&container->modules, newmod, flink); newmod->file = container; MOD_XUNLOCK; return (0); } void module_reference(module_t mod) { MOD_XLOCK_ASSERT; MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); mod->refs++; } void module_release(module_t mod) { MOD_XLOCK_ASSERT; if (mod->refs <= 0) panic("module_release: bad reference count"); MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); mod->refs--; if (mod->refs == 0) { TAILQ_REMOVE(&modules, mod, link); if (mod->file) TAILQ_REMOVE(&mod->file->modules, mod, flink); free(mod, M_MODULE); } } module_t module_lookupbyname(const char *name) { module_t mod; int err; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) { err = strcmp(mod->name, name); if (err == 0) return (mod); } return (NULL); } module_t module_lookupbyid(int modid) { module_t mod; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) if (mod->id == modid) return(mod); return (NULL); } int module_quiesce(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_QUIESCE); mtx_unlock(&Giant); if (error == EOPNOTSUPP || error == EINVAL) error = 0; return (error); } int module_unload(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_UNLOAD); mtx_unlock(&Giant); return (error); } int module_getid(module_t mod) { MOD_LOCK_ASSERT; return (mod->id); } module_t module_getfnext(module_t mod) { MOD_LOCK_ASSERT; return (TAILQ_NEXT(mod, flink)); } const char * module_getname(module_t mod) { MOD_LOCK_ASSERT; return (mod->name); } void module_setspecific(module_t mod, modspecific_t *datap) { MOD_XLOCK_ASSERT; mod->data = *datap; } linker_file_t module_file(module_t mod) { return (mod->file); } /* * Syscalls. */ int sys_modnext(struct thread *td, struct modnext_args *uap) { module_t mod; int error = 0; td->td_retval[0] = -1; MOD_SLOCK; if (uap->modid == 0) { mod = TAILQ_FIRST(&modules); if (mod) td->td_retval[0] = mod->id; else error = ENOENT; goto done2; } mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; goto done2; } if (TAILQ_NEXT(mod, link)) td->td_retval[0] = TAILQ_NEXT(mod, link)->id; else td->td_retval[0] = 0; done2: MOD_SUNLOCK; return (error); } int sys_modfnext(struct thread *td, struct modfnext_args *uap) { module_t mod; int error; td->td_retval[0] = -1; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; } else { error = 0; if (TAILQ_NEXT(mod, flink)) td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; else td->td_retval[0] = 0; } MOD_SUNLOCK; return (error); } struct module_stat_v1 { int version; /* set to sizeof(struct module_stat) */ char name[MAXMODNAME]; int refs; int id; }; int sys_modstat(struct thread *td, struct modstat_args *uap) { module_t mod; modspecific_t data; int error = 0; int id, namelen, refs, version; struct module_stat *stat; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; data = mod->data; MOD_SUNLOCK; stat = uap->stat; /* * Check the version of the user's structure. */ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat)) if ((error = copyout(&data, &stat->data, sizeof(data))) != 0) return (error); td->td_retval[0] = 0; return (error); } int sys_modfind(struct thread *td, struct modfind_args *uap) { int error = 0; char name[MAXMODNAME]; module_t mod; if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0) return (error); MOD_SLOCK; mod = module_lookupbyname(name); if (mod == NULL) error = ENOENT; else td->td_retval[0] = module_getid(mod); MOD_SUNLOCK; return (error); } MODULE_VERSION(kernel, __FreeBSD_version); #ifdef COMPAT_FREEBSD32 #include #include #include #include #include typedef union modspecific32 { int intval; uint32_t uintval; int longval; uint32_t ulongval; } modspecific32_t; struct module_stat32 { int version; char name[MAXMODNAME]; int refs; int id; modspecific32_t data; }; int freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap) { module_t mod; modspecific32_t data32; int error = 0; int id, namelen, refs, version; struct module_stat32 *stat32; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; CP(mod->data, data32, intval); CP(mod->data, data32, uintval); CP(mod->data, data32, longval); CP(mod->data, data32, ulongval); MOD_SUNLOCK; stat32 = uap->stat; if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat32)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat32->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat32)) if ((error = copyout(&data32, &stat32->data, sizeof(data32))) != 0) return (error); td->td_retval[0] = 0; return (error); } #endif Index: stable/11/sys/kern/kern_synch.c =================================================================== --- stable/11/sys/kern/kern_synch.c (revision 359651) +++ stable/11/sys/kern/kern_synch.c (revision 359652) @@ -1,588 +1,588 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static uint8_t pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, ""); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ -SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0); +SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, NULL); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. */ int _sleep(void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); td = curthread; p = td->td_proc; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (SCHEDULER_STOPPED_TD(td)) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep")); if ((uint8_t *)ident >= &pause_wchan[0] && (uint8_t *)ident <= &pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause_sbt() delays the calling thread by the given signed binary * time. During cold bootup, pause_sbt() uses the DELAY() function * instead of the _sleep() function to do the waiting. The "sbt" * argument must be greater than or equal to zero. A "sbt" value of * zero is equivalent to a "sbt" value of one tick. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause_sbt: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if ((cold && curthread == &thread0) || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (EWOULDBLOCK); } return (_sleep(&pause_wchan[curcpu], NULL, (flags & C_CATCH) ? PCATCH : 0, wmesg, sbt, pr, flags)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } void wakeup_any(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * The machine independent parts of context switching. */ void mi_switch(int flags, struct thread *newtd) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. */ if (kdb_active) kdb_switch(); if (SCHEDULER_STOPPED_TD(td)) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ PCPU_INC(cnt.v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #ifdef KDTRACE_HOOKS if ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 && (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED)) SDT_PROBE0(sched, , , preempt); #endif sched_switch(td, newtd, flags); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. */ int setrunnable(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return (0); /* FALLTHROUGH */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((td->td_flags & TDF_INMEM) == 0) { if ((td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; return (1); } } else sched_wakeup(td); return (0); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } /* ARGSUSED */ static void synch_setup(void *dummy) { callout_init(&loadav_callout, 1); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } int should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); PICKUP_GIANT(); } /* * General purpose yield system call. */ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); td->td_retval[0] = 0; return (0); } Index: stable/11/sys/kern/kern_sysctl.c =================================================================== --- stable/11/sys/kern/kern_sysctl.c (revision 359651) +++ stable/11/sys/kern/kern_sysctl.c (revision 359652) @@ -1,2086 +1,2086 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. */ static struct rmlock sysctllock; static struct sx __exclusive_cache_line sysctlmemlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_wlock(void) { SYSCTL_WLOCK(); } void sysctl_wunlock(void) { SYSCTL_WUNLOCK(); } static int sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2, struct sysctl_req *req, struct rm_priotracker *tracker) { int error; if (oid->oid_kind & CTLFLAG_DYN) atomic_add_int(&oid->oid_running, 1); if (tracker != NULL) SYSCTL_RUNLOCK(tracker); else SYSCTL_WUNLOCK(); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[96]; ssize_t rem = sizeof(path); ssize_t len; uint8_t data[512] __aligned(sizeof(uint64_t)); int size; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_UINT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_LONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_ULONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int8_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int16_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int32_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int64_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint8_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint16_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint32_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint64_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { printf("can't re-use a leaf (%s)!\n", p->oid_name); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. */ retry: q = NULL; SLIST_FOREACH(p, parent, oid_link) { /* check if the current OID number is in use */ if (oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); goto retry; } else { oid_number++; } } else if (oid_number < p->oid_number) break; q = p; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; if (q != NULL) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* only fetch value once */ oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. * e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; error = ENOENT; SYSCTL_WLOCK(); SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { if (strcmp(p->oid_name, name) == 0) { error = sysctl_remove_oid_locked(p, del, recurse); break; } } SYSCTL_WUNLOCK(); return (error); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(oidp), oid_link, tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { SYSCTL_WUNLOCK(); printf("can't re-use a leaf (%s)!\n", name); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = strdup(name, M_SYSCTLOID); oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } -SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, 0); +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {0,0} printf the entire MIB-tree. * {0,1,...} return the name of the "..." OID. * {0,2,...} return the next OID. * {0,3} return the OID of the name in "new" * {0,4,...} return the kind & format info for the "..." OID. * {0,5,...} return the description the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, l, oid_link) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = SYSCTL_CHILDREN(oid); break; } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if ((oidp->oid_kind & (CTLFLAG_SKIP | CTLFLAG_DORMANT)) != 0) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int newoid[CTL_MAXNAME]; SYSCTL_RLOCK(&tracker); i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); SYSCTL_RUNLOCK(&tracker); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; char *p; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { p = strsep(&name, "."); oidp = SLIST_FIRST(lsp); for (;; oidp = SLIST_NEXT(oidp, oid_link)) { if (oidp == NULL) return (ENOENT); if (strcmp(p, oidp->oid_name) == 0) break; } *oid++ = oidp->oid_number; (*len)++; if (name == NULL || *name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ SYSCTL_PROC(_sysctl, 3, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. */ /* * Handle a bool. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_bool(SYSCTL_HANDLER_ARGS) { uint8_t temp; int error; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) temp = *(bool *)arg1 ? 1 : 0; else temp = arg2 ? 1 : 0; error = SYSCTL_OUT(req, &temp, sizeof(temp)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { error = SYSCTL_IN(req, &temp, sizeof(temp)); if (!error) *(bool *)arg1 = temp ? 1 : 0; } return (error); } /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { size_t outlen; int error = 0, ro_string = 0; /* * A zero-length buffer indicates a fixed size read-only * string: */ if (arg2 == 0) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { char *tmparg; if (ro_string) { tmparg = arg1; } else { /* try to make a coherent snapshot of the string */ tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); memcpy(tmparg, arg1, arg2); } outlen = strnlen(tmparg, arg2 - 1) + 1; error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { outlen = strnlen((char *)arg1, arg2 - 1) + 1; error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Convert seconds to a struct timeval. Intended for use with * intervals and thus does not permit negative seconds. */ int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) { struct timeval *tv; int error, secs; tv = arg1; secs = tv->tv_sec; error = sysctl_handle_int(oidp, &secs, 0, req); if (error || req->newptr == NULL) return (error); if (secs < 0) return (EINVAL); tv->tv_sec = secs; return (0); } /* * Transfer functions to/from kernel space. * XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = 0; /* sysctl internal magic */ oid[1] = 3; /* name2oid */ oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number == name[indx]) break; } if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif if (req.oldptr && req.oldlen > PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } else memlocked = 0; CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. */ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } Index: stable/11/sys/kern/link_elf.c =================================================================== --- stable/11/sys/kern/link_elf.c (revision 359651) +++ stable/11/sys/kern/link_elf.c (revision 359652) @@ -1,1714 +1,1714 @@ /*- * Copyright (c) 1998-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_gdb.h" #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SPARSE_MAPPING #include #include #include #endif #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" #define MAXSEGS 4 typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; /* Was file pre-loaded */ caddr_t address; /* Relocation address */ #ifdef SPARSE_MAPPING vm_object_t object; /* VM object to hold file pages */ #endif Elf_Dyn *dynamic; /* Symbol table etc. */ Elf_Hashelt nbuckets; /* DT_HASH info */ Elf_Hashelt nchains; const Elf_Hashelt *buckets; const Elf_Hashelt *chains; caddr_t hash; caddr_t strtab; /* DT_STRTAB */ int strsz; /* DT_STRSZ */ const Elf_Sym *symtab; /* DT_SYMTAB */ Elf_Addr *got; /* DT_PLTGOT */ const Elf_Rel *pltrel; /* DT_JMPREL */ int pltrelsize; /* DT_PLTRELSZ */ const Elf_Rela *pltrela; /* DT_JMPREL */ int pltrelasize; /* DT_PLTRELSZ */ const Elf_Rel *rel; /* DT_REL */ int relsize; /* DT_RELSZ */ const Elf_Rela *rela; /* DT_RELA */ int relasize; /* DT_RELASZ */ caddr_t modptr; const Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t symbase; /* malloc'ed symbold base */ caddr_t strbase; /* malloc'ed string base */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ #ifdef VIMAGE Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ Elf_Addr vnet_base; /* Relocated vnet set address. */ #endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif } *elf_file_t; struct elf_set { Elf_Addr es_start; Elf_Addr es_stop; Elf_Addr es_base; TAILQ_ENTRY(elf_set) es_link; }; TAILQ_HEAD(elf_set_head, elf_set); #include static int link_elf_link_common_finish(linker_file_t); static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t, c_linker_sym_t *, long *); static void link_elf_unload_file(linker_file_t); static void link_elf_unload_preload(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static void link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32", #else "elf64", #endif link_elf_methods, sizeof(struct elf_file) }; typedef int (*elf_reloc_fn)(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup); static int parse_dynamic(elf_file_t); static int relocate_file(elf_file_t); static int relocate_file1(elf_file_t ef, elf_lookup_fn lookup, elf_reloc_fn reloc, bool ifuncs); static int link_elf_preload_parse_symbols(elf_file_t); static struct elf_set_head set_pcpu_list; #ifdef VIMAGE static struct elf_set_head set_vnet_list; #endif static void elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base) { struct elf_set *set, *iter; set = malloc(sizeof(*set), M_LINKER, M_WAITOK); set->es_start = start; set->es_stop = stop; set->es_base = base; TAILQ_FOREACH(iter, list, es_link) { KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) || (set->es_start > iter->es_start && set->es_stop > iter->es_stop), ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx", (uintmax_t)set->es_start, (uintmax_t)set->es_stop, (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop)); if (iter->es_start > set->es_start) { TAILQ_INSERT_BEFORE(iter, set, es_link); break; } } if (iter == NULL) TAILQ_INSERT_TAIL(list, set, es_link); } static int elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (addr < set->es_start) return (0); if (addr < set->es_stop) { *start = set->es_start; *base = set->es_base; return (1); } } return (0); } static void elf_set_delete(struct elf_set_head *list, Elf_Addr start) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (start < set->es_start) break; if (start == set->es_start) { TAILQ_REMOVE(list, set, es_link); free(set, M_LINKER); return; } } KASSERT(0, ("deleting unknown linker set (start = 0x%jx)", (uintmax_t)start)); } #ifdef GDB static void r_debug_state(struct r_debug *, struct link_map *); /* * A list of loaded modules for GDB to use for loading symbols. */ struct r_debug r_debug; #define GDB_STATE(s) do { \ r_debug.r_state = s; r_debug_state(NULL, NULL); \ } while (0) /* * Function for the debugger to set a breakpoint on to gain control. */ static void r_debug_state(struct r_debug *dummy_one __unused, struct link_map *dummy_two __unused) { } static void link_elf_add_gdb(struct link_map *l) { struct link_map *prev; l->l_next = NULL; if (r_debug.r_map == NULL) { /* Add first. */ l->l_prev = NULL; r_debug.r_map = l; } else { /* Append to list. */ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) ; l->l_prev = prev; prev->l_next = l; } } static void link_elf_delete_gdb(struct link_map *l) { if (l->l_prev == NULL) { /* Remove first. */ if ((r_debug.r_map = l->l_next) != NULL) l->l_next->l_prev = NULL; } else { /* Remove any but first. */ if ((l->l_prev->l_next = l->l_next) != NULL) l->l_next->l_prev = l->l_prev; } } #endif /* GDB */ /* * The kernel symbol table starts here. */ extern struct _dynamic _DYNAMIC; static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } /* * Actions performed after linking/loading both the preloaded kernel and any * modules; whether preloaded or dynamicly loaded. */ static int link_elf_link_common_finish(linker_file_t lf) { #ifdef GDB elf_file_t ef = (elf_file_t)lf; char *newfilename; #endif int error; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error != 0) return (error); #ifdef GDB GDB_STATE(RT_ADD); ef->gdb.l_addr = lf->address; newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); strcpy(newfilename, lf->filename); ef->gdb.l_name = newfilename; ef->gdb.l_ld = ef->dynamic; link_elf_add_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } extern vm_offset_t __startkernel; static void link_elf_init(void* arg) { Elf_Dyn *dp; Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr; elf_file_t ef; char *modname; linker_add_class(&link_elf_class); dp = (Elf_Dyn *)&_DYNAMIC; modname = NULL; modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel"); if (modptr == NULL) modptr = preload_search_by_type("elf kernel"); modname = (char *)preload_search_info(modptr, MODINFO_NAME); if (modname == NULL) modname = "kernel"; linker_kernel_file = linker_make_file(modname, &link_elf_class); if (linker_kernel_file == NULL) panic("%s: Can't create linker structures for kernel", __func__); ef = (elf_file_t) linker_kernel_file; ef->preloaded = 1; #ifdef __powerpc__ ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif #ifdef SPARSE_MAPPING ef->object = 0; #endif ef->dynamic = dp; if (dp != NULL) parse_dynamic(ef); linker_kernel_file->address += KERNBASE; linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; if (modptr != NULL) { ef->modptr = modptr; baseptr = preload_search_info(modptr, MODINFO_ADDR); if (baseptr != NULL) linker_kernel_file->address = *(caddr_t *)baseptr; sizeptr = preload_search_info(modptr, MODINFO_SIZE); if (sizeptr != NULL) linker_kernel_file->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { linker_kernel_file->ctors_addr = ef->address + *ctors_addrp; linker_kernel_file->ctors_size = *ctors_sizep; } } (void)link_elf_preload_parse_symbols(ef); #ifdef GDB r_debug.r_map = NULL; r_debug.r_brk = r_debug_state; r_debug.r_state = RT_CONSISTENT; #endif (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); #endif } -SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0); +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, NULL); static int link_elf_preload_parse_symbols(elf_file_t ef) { caddr_t pointer; caddr_t ssym, esym, base; caddr_t strtab; int strcnt; Elf_Sym *symtab; int symcnt; if (ef->modptr == NULL) return (0); pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_SSYM); if (pointer == NULL) return (0); ssym = *(caddr_t *)pointer; pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_ESYM); if (pointer == NULL) return (0); esym = *(caddr_t *)pointer; base = ssym; symcnt = *(long *)base; base += sizeof(long); symtab = (Elf_Sym *)base; base += roundup(symcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } strcnt = *(long *)base; base += sizeof(long); strtab = base; base += roundup(strcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } ef->ddbsymtab = symtab; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbstrtab = strtab; ef->ddbstrcnt = strcnt; return (0); } static int parse_dynamic(elf_file_t ef) { Elf_Dyn *dp; int plttype = DT_REL; for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { switch (dp->d_tag) { case DT_HASH: { /* From src/libexec/rtld-elf/rtld.c */ const Elf_Hashelt *hashtab = (const Elf_Hashelt *) (ef->address + dp->d_un.d_ptr); ef->nbuckets = hashtab[0]; ef->nchains = hashtab[1]; ef->buckets = hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; break; } case DT_STRTAB: ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); break; case DT_STRSZ: ef->strsz = dp->d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); break; case DT_SYMENT: if (dp->d_un.d_val != sizeof(Elf_Sym)) return (ENOEXEC); break; case DT_PLTGOT: ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); break; case DT_REL: ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_RELSZ: ef->relsize = dp->d_un.d_val; break; case DT_RELENT: if (dp->d_un.d_val != sizeof(Elf_Rel)) return (ENOEXEC); break; case DT_JMPREL: ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_PLTRELSZ: ef->pltrelsize = dp->d_un.d_val; break; case DT_RELA: ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); break; case DT_RELASZ: ef->relasize = dp->d_un.d_val; break; case DT_RELAENT: if (dp->d_un.d_val != sizeof(Elf_Rela)) return (ENOEXEC); break; case DT_PLTREL: plttype = dp->d_un.d_val; if (plttype != DT_REL && plttype != DT_RELA) return (ENOEXEC); break; #ifdef GDB case DT_DEBUG: dp->d_un.d_ptr = (Elf_Addr)&r_debug; break; #endif } } if (plttype == DT_RELA) { ef->pltrela = (const Elf_Rela *)ef->pltrel; ef->pltrel = NULL; ef->pltrelasize = ef->pltrelsize; ef->pltrelsize = 0; } ef->ddbsymtab = ef->symtab; ef->ddbsymcnt = ef->nchains; ef->ddbstrtab = ef->strtab; ef->ddbstrcnt = ef->strsz; return (0); } static int parse_dpcpu(elf_file_t ef) { int error, size; ef->pcpu_start = 0; ef->pcpu_stop = 0; error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start, (void ***)&ef->pcpu_stop, NULL); /* Error just means there is no pcpu set to relocate. */ if (error != 0) return (0); size = (uintptr_t)ef->pcpu_stop - (uintptr_t)ef->pcpu_start; /* Empty set? */ if (size < 1) return (0); /* * Allocate space in the primary pcpu area. Copy in our * initialization from the data section and then initialize * all per-cpu storage from that. */ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(size); if (ef->pcpu_base == 0) { printf("%s: pcpu module space is out of space; " "cannot allocate %d for %s\n", __func__, size, ef->lf.pathname); return (ENOSPC); } memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, size); dpcpu_copy((void *)ef->pcpu_base, size); elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop, ef->pcpu_base); return (0); } #ifdef VIMAGE static int parse_vnet(elf_file_t ef) { int error, size; ef->vnet_start = 0; ef->vnet_stop = 0; error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, (void ***)&ef->vnet_stop, NULL); /* Error just means there is no vnet data set to relocate. */ if (error != 0) return (0); size = (uintptr_t)ef->vnet_stop - (uintptr_t)ef->vnet_start; /* Empty set? */ if (size < 1) return (0); /* * Allocate space in the primary vnet area. Copy in our * initialization from the data section and then initialize * all per-vnet storage from that. */ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(size); if (ef->vnet_base == 0) { printf("%s: vnet module space is out of space; " "cannot allocate %d for %s\n", __func__, size, ef->lf.pathname); return (ENOSPC); } memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, size); vnet_data_copy((void *)ef->vnet_base, size); elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop, ef->vnet_base); return (0); } #endif static int link_elf_link_preload(linker_class_t cls, const char* filename, linker_file_t *result) { Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr, dynptr; char *type; elf_file_t ef; linker_file_t lf; int error; vm_offset_t dp; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return (ENOENT); type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); dynptr = preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_DYNAMIC); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 && strcmp(type, "elf module") != 0)) return (EFTYPE); if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t) lf; ef->preloaded = 1; ef->modptr = modptr; ef->address = *(caddr_t *)baseptr; #ifdef SPARSE_MAPPING ef->object = 0; #endif dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; ef->dynamic = (Elf_Dyn *)dp; lf->address = ef->address; lf->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { lf->ctors_addr = ef->address + *ctors_addrp; lf->ctors_size = *ctors_sizep; } error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); #ifdef VIMAGE if (error == 0) error = parse_vnet(ef); #endif if (error != 0) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } link_elf_reloc_local(lf); *result = lf; return (0); } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t) lf; error = relocate_file(ef); if (error != 0) return (error); (void)link_elf_preload_parse_symbols(ef); return (link_elf_link_common_finish(lf)); } static int link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) { struct nameidata nd; struct thread* td = curthread; /* XXX */ Elf_Ehdr *hdr; caddr_t firstpage; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[MAXSEGS]; int nsegs; Elf_Phdr *phdyn; Elf_Phdr *phphdr; caddr_t mapbase; size_t mapsize; Elf_Off base_offset; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; Elf_Shdr *shdr; int symtabindex; int symstrindex; int shstrindex; int symcnt; int strcnt; char *shstrs; shdr = NULL; lf = NULL; shstrs = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; firstpage = NULL; goto out; } #ifdef MAC error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp); if (error != 0) { firstpage = NULL; goto out; } #endif /* * Read the elf header from the file. */ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); nbytes = PAGE_SIZE - resid; if (error != 0) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. * This is not strictly required by the ABI specification, but * it seems to always true in practice. And, it simplifies * things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error(filename, "Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; phphdr = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == MAXSEGS) { link_elf_error(filename, "Too many sections"); error = ENOEXEC; goto out; } /* * XXX: We just trust they come in right order ?? */ segs[nsegs] = phdr; ++nsegs; break; case PT_PHDR: phphdr = phdr; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error(filename, "Object is not dynamically-linked"); error = ENOEXEC; goto out; } if (nsegs == 0) { link_elf_error(filename, "No sections"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake * out our contiguous region, and to establish the base * address for relocation. */ base_offset = trunc_page(segs[0]->p_offset); base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + segs[nsegs - 1]->p_memsz); mapsize = base_vlimit - base_vaddr; lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; #ifdef SPARSE_MAPPING ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); error = vm_map_find(kernel_map, ef->object, 0, (vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != 0) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } #else ef->address = malloc(mapsize, M_LINKER, M_WAITOK); #endif mapbase = ef->address; /* * Read the text and data sections and zero the bss. */ for (i = 0; i < nsegs; i++) { caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vn_rdwr(UIO_READ, nd.ni_vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); #ifdef SPARSE_MAPPING /* * Wire down the pages */ error = vm_map_wire(kernel_map, (vm_offset_t) segbase, (vm_offset_t) segbase + segs[i]->p_memsz, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } #endif } #ifdef GPROF /* Update profiling information with the new text segment. */ mtx_lock(&Giant); kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + segs[0]->p_memsz)); mtx_unlock(&Giant); #endif ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(ef); if (error != 0) goto out; error = parse_dpcpu(ef); if (error != 0) goto out; #ifdef VIMAGE error = parse_vnet(ef); if (error != 0) goto out; #endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; error = relocate_file(ef); if (error != 0) goto out; /* * Try and load the symbol table if it's present. (you can * strip it!) */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; /* Read section string table */ shstrindex = hdr->e_shstrndx; if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB && shdr[shstrindex].sh_size != 0) { nbytes = shdr[shstrindex].sh_size; shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; } symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } else if (shstrs != NULL && shdr[i].sh_name != 0 && strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) { /* Record relocated address and size of .ctors. */ lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr; lf->ctors_size = shdr[i].sh_size; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; error = vn_rdwr(UIO_READ, nd.ni_vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error != 0 && lf != NULL) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(shdr, M_LINKER); free(firstpage, M_LINKER); free(shstrs, M_LINKER); return (error); } Elf_Addr elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); #ifdef VIMAGE if (x >= ef->vnet_start && x < ef->vnet_stop) return ((x - ef->vnet_start) + ef->vnet_base); #endif return (x); } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; if (ef->pcpu_base != 0) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); elf_set_delete(&set_pcpu_list, ef->pcpu_start); } #ifdef VIMAGE if (ef->vnet_base != 0) { vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); elf_set_delete(&set_vnet_list, ef->vnet_start); } #endif #ifdef GDB if (ef->gdb.l_ld != NULL) { GDB_STATE(RT_DELETE); free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); link_elf_delete_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); } #endif /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->preloaded) { link_elf_unload_preload(file); return; } #ifdef SPARSE_MAPPING if (ef->object != NULL) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } #else free(ef->address, M_LINKER); #endif free(ef->symbase, M_LINKER); free(ef->strbase, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static void link_elf_unload_preload(linker_file_t file) { if (file->pathname != NULL) preload_delete_name(file->pathname); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ef->strtab + ref->st_name); } return (NULL); } static int symbol_type(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ELF_ST_TYPE(ref->st_info)); } return (STT_NOTYPE); } static int relocate_file1(elf_file_t ef, elf_lookup_fn lookup, elf_reloc_fn reloc, bool ifuncs) { const Elf_Rel *rel; const Elf_Rela *rela; const char *symname; #define APPLY_RELOCS(iter, tbl, tblsize, type) do { \ for ((iter) = (tbl); (iter) != NULL && \ (iter) < (tbl) + (tblsize) / sizeof(*(iter)); (iter)++) { \ if ((symbol_type(ef, (iter)->r_info) == \ STT_GNU_IFUNC || \ elf_is_ifunc_reloc((iter)->r_info)) != ifuncs) \ continue; \ if (reloc(&ef->lf, (Elf_Addr)ef->address, \ (iter), (type), lookup)) { \ symname = symbol_name(ef, (iter)->r_info); \ printf("link_elf: symbol %s undefined\n", \ symname); \ return (ENOENT); \ } \ } \ } while (0) APPLY_RELOCS(rel, ef->rel, ef->relsize, ELF_RELOC_REL); APPLY_RELOCS(rela, ef->rela, ef->relasize, ELF_RELOC_RELA); APPLY_RELOCS(rel, ef->pltrel, ef->pltrelsize, ELF_RELOC_REL); APPLY_RELOCS(rela, ef->pltrela, ef->pltrelasize, ELF_RELOC_RELA); #undef APPLY_RELOCS return (0); } static int relocate_file(elf_file_t ef) { int error; error = relocate_file1(ef, elf_lookup, elf_reloc, false); if (error == 0) error = relocate_file1(ef, elf_lookup, elf_reloc, true); return (error); } /* * Hash function for symbol table lookup. Don't even think about changing * this. It is specified by the System V ABI. */ static unsigned long elf_hash(const char *name) { const unsigned char *p = (const unsigned char *) name; unsigned long h = 0; unsigned long g; while (*p != '\0') { h = (h << 4) + *p++; if ((g = h & 0xf0000000) != 0) h ^= g >> 24; h &= ~g; } return (h); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t) lf; unsigned long symnum; const Elf_Sym* symp; const char *strp; unsigned long hash; int i; /* If we don't have a hash, bail. */ if (ef->buckets == NULL || ef->nbuckets == 0) { printf("link_elf_lookup_symbol: missing symbol hash table\n"); return (ENOENT); } /* First, search hashed global symbols */ hash = elf_hash(name); symnum = ef->buckets[hash % ef->nbuckets]; while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } symp = ef->symtab + symnum; if (symp->st_name == 0) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } strp = ef->strtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC))) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } symnum = ef->chains[symnum]; } /* If we have not found it, look at the full table (if loaded) */ if (ef->symtab == ef->ddbsymtab) return (ENOENT); /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC))) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef; const Elf_Sym *es; caddr_t val; ef = (elf_file_t)lf; es = (const Elf_Sym *)sym; if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) { symval->name = ef->strtab + es->st_name; val = (caddr_t)ef->address + es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return (0); } if (ef->symtab == ef->ddbsymtab) return (ENOENT); if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; val = (caddr_t)ef->address + es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym* es; const Elf_Sym* best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value + (uintptr_t) (void *) ef->address; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { c_linker_sym_t sym; linker_symval_t symval; char *setsym; void **start, **stop; int len, error = 0, count; len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ setsym = malloc(len, M_LINKER, M_WAITOK); /* get address of first entry */ snprintf(setsym, len, "%s%s", "__start_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } start = (void **)symval.value; /* get address of last entry */ snprintf(setsym, len, "%s%s", "__stop_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } stop = (void **)symval.value; /* and the number of entries */ count = stop - start; /* and copy out */ if (startp != NULL) *startp = start; if (stopp != NULL) *stopp = stop; if (countp != NULL) *countp = count; out: free(setsym, M_LINKER); return (error); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error != 0) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error != 0) return (error); error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } const Elf_Sym * elf_get_sym(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; if (symidx >= ef->nchains) return (NULL); return (ef->symtab + symidx); } const char * elf_get_symname(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; if (symidx >= ef->nchains) return (NULL); sym = ef->symtab + symidx; return (ef->strtab + sym->st_name); } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; const char *symbol; Elf_Addr addr, start, base; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->nchains) { *res = 0; return (EINVAL); } sym = ef->symtab + symidx; /* * Don't do a full lookup when the symbol is local. It may even * fail because it may not be found through the hash table. */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { /* Force lookup failure when we have an insanity. */ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) { *res = 0; return (EINVAL); } *res = ((Elf_Addr)ef->address + sym->st_value); return (0); } /* * XXX we can avoid doing a hash table based lookup for global * symbols as well. This however is not always valid, so we'll * just do it the hard way for now. Performance tweaks can * always be added. */ symbol = ef->strtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) { *res = 0; return (EINVAL); } if (elf_set_find(&set_pcpu_list, addr, &start, &base)) addr = addr - start + base; #ifdef VIMAGE else if (elf_set_find(&set_vnet_list, addr, &start, &base)) addr = addr - start + base; #endif *res = addr; return (0); } static void link_elf_reloc_local(linker_file_t lf) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; elf_file_t ef = (elf_file_t)lf; /* Perform relocations without addend if there are any: */ if ((rel = ef->rel) != NULL) { rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); while (rel < rellim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup); rel++; } } /* Perform relocations with addend if there are any: */ if ((rela = ef->rela) != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup); rela++; } } } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } #if defined(__i386__) || defined(__amd64__) /* * Use this lookup routine when performing relocations early during boot. * The generic lookup routine depends on kobj, which is not initialized * at that point. */ static int elf_lookup_ifunc(linker_file_t lf, Elf_Size symidx, int deps __unused, Elf_Addr *res) { elf_file_t ef; const Elf_Sym *symp; caddr_t val; ef = (elf_file_t)lf; symp = ef->symtab + symidx; if (ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC) { val = (caddr_t)ef->address + symp->st_value; *res = ((Elf_Addr (*)(void))val)(); return (0); } return (ENOENT); } void link_elf_ireloc(caddr_t kmdp) { struct elf_file eff; elf_file_t ef; volatile char *c; size_t i; ef = &eff; /* Do not use bzero/memset before ireloc is done. */ for (c = (char *)ef, i = 0; i < sizeof(*ef); i++) c[i] = 0; ef->modptr = kmdp; ef->dynamic = (Elf_Dyn *)&_DYNAMIC; parse_dynamic(ef); ef->address = 0; link_elf_preload_parse_symbols(ef); relocate_file1(ef, elf_lookup_ifunc, elf_reloc, true); } #endif Index: stable/11/sys/kern/link_elf_obj.c =================================================================== --- stable/11/sys/kern/link_elf_obj.c (revision 359651) +++ stable/11/sys/kern/link_elf_obj.c (revision 359652) @@ -1,1598 +1,1598 @@ /*- * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" typedef struct { void *addr; Elf_Off size; int flags; int sec; /* Original section */ char *name; } Elf_progent; typedef struct { Elf_Rel *rel; int nrel; int sec; } Elf_relent; typedef struct { Elf_Rela *rela; int nrela; int sec; } Elf_relaent; typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; caddr_t address; /* Relocation address */ vm_object_t object; /* VM object to hold file pages */ Elf_Shdr *e_shdr; Elf_progent *progtab; int nprogtab; Elf_relaent *relatab; int nrelatab; Elf_relent *reltab; int nreltab; Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t shstrtab; /* Section name string table */ long shstrcnt; /* number of bytes in string table */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ } *elf_file_t; #include static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t value, c_linker_sym_t *sym, long *diffp); static void link_elf_unload_file(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static int link_elf_reloc_local(linker_file_t, bool); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32_obj", #else "elf64_obj", #endif link_elf_methods, sizeof(struct elf_file) }; static int relocate_file(elf_file_t ef); static void elf_obj_cleanup_globals_cache(elf_file_t); static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_init(void *arg) { linker_add_class(&link_elf_class); } -SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); +SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, NULL); static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; void *modptr, *baseptr, *sizeptr; char *type; elf_file_t ef; linker_file_t lf; Elf_Addr off; int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return ENOENT; type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_ELFHDR); shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " obj module") != 0 && strcmp(type, "elf obj module") != 0)) { return (EFTYPE); } if (baseptr == NULL || sizeptr == NULL || hdr == NULL || shdr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t)lf; ef->preloaded = 1; ef->address = *(caddr_t *)baseptr; lf->address = *(caddr_t *)baseptr; lf->size = *(size_t *)sizeptr; if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT || hdr->e_type != ET_REL || hdr->e_machine != ELF_TARG_MACH) { error = EFTYPE; goto out; } ef->e_shdr = shdr; /* Scan the section header for information and table sizing. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif /* Ignore sections not loaded by the loader. */ if (shdr[i].sh_addr == 0) break; ef->nprogtab++; break; case SHT_SYMTAB: symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: /* * Ignore relocation tables for sections not * loaded by the loader. */ if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->nreltab++; break; case SHT_RELA: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->nrelatab++; break; } } shstrindex = hdr->e_shstrndx; if (ef->nprogtab == 0 || symstrindex < 0 || symstrindex >= hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 || shstrindex >= hdr->e_shnum || shdr[shstrindex].sh_type != SHT_STRTAB) { printf("%s: bad/missing section headers\n", filename); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } /* XXX, relocate the sh_addr fields saved by the loader. */ off = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off)) off = shdr[i].sh_addr; } for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0) shdr[i].sh_addr = shdr[i].sh_addr - off + (Elf_Addr)ef->address; } ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr; ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = (char *)shdr[shstrindex].sh_addr; /* Now fill out progtab and the relocation tables. */ pb = 0; rl = 0; ra = 0; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif if (shdr[i].sh_addr == 0) break; ef->progtab[pb].addr = (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { void *dpcpu; dpcpu = dpcpu_alloc(shdr[i].sh_size); if (dpcpu == NULL) { printf("%s: pcpu module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); error = ENOSPC; goto out; } memcpy(dpcpu, ef->progtab[pb].addr, ef->progtab[pb].size); dpcpu_copy(dpcpu, shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; #ifdef VIMAGE } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { void *vnet_data; vnet_data = vnet_data_alloc(shdr[i].sh_size); if (vnet_data == NULL) { printf("%s: vnet module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); error = ENOSPC; goto out; } memcpy(vnet_data, ef->progtab[pb].addr, ef->progtab[pb].size); vnet_data_copy(vnet_data, shdr[i].sh_size); ef->progtab[pb].addr = vnet_data; #endif } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = ef->progtab[pb].addr; lf->ctors_size = shdr[i].sh_size; } /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } pb++; break; case SHT_REL: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr; ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; rl++; break; case SHT_RELA: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr; ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; ra++; break; } } if (pb != ef->nprogtab) { printf("%s: lost progbits\n", filename); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { printf("%s: lost reltab\n", filename); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { printf("%s: lost relatab\n", filename); error = ENOEXEC; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf, false); if (error != 0) goto out; *result = lf; return (0); out: /* preload not done this way */ linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t)lf; error = relocate_file(ef); if (error) return (error); /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) return (error); #if defined(__i386__) || defined(__amd64__) /* Now ifuncs. */ error = link_elf_reloc_local(lf, true); if (error != 0) return (error); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return error; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd.ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. */ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: /* * Ignore relocation tables for unallocated * sections. */ if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->nreltab++; break; case SHT_RELA: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) { link_elf_error(filename, "lost symbol table index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. * This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. */ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab != NULL && shdr[i].sh_name != 0) { ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (!strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = (caddr_t)mapbase; lf->ctors_size = shdr[i].sh_size; } } else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { printf("%s: pcpu module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); } } #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { printf("%s: vnet module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); } } #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS #ifdef __amd64__ || shdr[i].sh_type == SHT_X86_64_UNWIND #endif ) { error = vn_rdwr(UIO_READ, nd.ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) { link_elf_error(filename, "lost progbits"); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { link_elf_error(filename, "lost reltab"); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { link_elf_error(filename, "lost relatab"); error = ENOEXEC; goto out; } if (mapbase != (vm_offset_t)ef->address + mapsize) { printf( "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", filename != NULL ? filename : "", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); error = ENOMEM; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf, false); if (error != 0) goto out; /* Pull in dependencies */ VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; #if defined(__i386__) || defined(__amd64__) /* Now ifuncs. */ error = link_elf_reloc_local(lf, true); if (error != 0) goto out; #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(hdr, M_LINKER); return error; } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; int i; /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME)) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif } } if (ef->preloaded) { free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); if (file->pathname != NULL) preload_delete_name(file->pathname); return; } for (i = 0; i < ef->nreltab; i++) free(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) free(ef->relatab[i].rela, M_LINKER); free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); if (ef->object) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } free(ef->e_shdr, M_LINKER); free(ef->ddbsymtab, M_LINKER); free(ef->ddbstrtab, M_LINKER); free(ef->shstrtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->ddbsymtab + ELF_R_SYM(r_info); return ef->ddbstrtab + ref->st_name; } else return NULL; } static Elf_Addr findbase(elf_file_t ef, int sec) { int i; Elf_Addr base = 0; for (i = 0; i < ef->nprogtab; i++) { if (sec == ef->progtab[i].sec) { base = (Elf_Addr)ef->progtab[i].addr; break; } } return base; } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; const Elf_Sym *sym; int i; Elf_Size symidx; Elf_Addr base; /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab!"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL, elf_obj_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for relatab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* * Only clean SHN_FBSD_CACHED for successful return. If we * modified symbol table for the object but found an * unresolved symbol, there is no reason to roll back. */ elf_obj_cleanup_globals_cache(ef); return (0); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *symp; const char *strp; int i; for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) { *sym = (c_linker_sym_t) symp; return 0; } } return ENOENT; } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef; const Elf_Sym *es; caddr_t val; ef = (elf_file_t) lf; es = (const Elf_Sym*) sym; val = (caddr_t)es->st_value; if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; val = (caddr_t)es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return 0; } return ENOENT; } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym *es; const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return 0; } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { elf_file_t ef = (elf_file_t)lf; void **start, **stop; int i, count; /* Relative to section number */ for (i = 0; i < ef->nprogtab; i++) { if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) && strcmp(ef->progtab[i].name + 4, name) == 0) { start = (void **)ef->progtab[i].addr; stop = (void **)((char *)ef->progtab[i].addr + ef->progtab[i].size); count = stop - start; if (startp) *startp = start; if (stopp) *stopp = stop; if (countp) *countp = count; return (0); } } return (ESRCH); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = link_elf_symbol_values(file, (c_linker_sym_t)symp, &symval); if (error) return (error); error = callback(file, i, &symval, opaque); if (error) return (error); } } return (0); } static void elf_obj_cleanup_globals_cache(elf_file_t ef) { Elf_Sym *sym; Elf_Size i; for (i = 0; i < ef->ddbsymcnt; i++) { sym = ef->ddbsymtab + i; if (sym->st_shndx == SHN_FBSD_CACHED) { sym->st_shndx = SHN_UNDEF; sym->st_value = 0; } } } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; Elf_Sym *sym; const char *symbol; Elf_Addr res1; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->ddbsymcnt) { *res = 0; return (EINVAL); } sym = ef->ddbsymtab + symidx; /* Quick answer if there is a definition included. */ if (sym->st_shndx != SHN_UNDEF) { res1 = (Elf_Addr)sym->st_value; if (ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC) res1 = ((Elf_Addr (*)(void))res1)(); *res = res1; return (0); } /* If we get here, then it is undefined and needs a lookup. */ switch (ELF_ST_BIND(sym->st_info)) { case STB_LOCAL: /* Local, but undefined? huh? */ *res = 0; return (EINVAL); case STB_GLOBAL: case STB_WEAK: /* Relative to Data or Function name */ symbol = ef->ddbstrtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps); /* * Cache global lookups during module relocation. The failure * case is particularly expensive for callers, who must scan * through the entire globals table doing strcmp(). Cache to * avoid doing such work repeatedly. * * After relocation is complete, undefined globals will be * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(), * above. */ if (res1 != 0) { sym->st_shndx = SHN_FBSD_CACHED; sym->st_value = res1; *res = res1; return (0); } else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { sym->st_value = 0; *res = 0; return (0); } return (EINVAL); default: return (EINVAL); } } static void link_elf_fix_link_set(elf_file_t ef) { static const char startn[] = "__start_"; static const char stopn[] = "__stop_"; Elf_Sym *sym; const char *sym_name, *linkset_name; Elf_Addr startp, stopp; Elf_Size symidx; int start, i; startp = stopp = 0; for (symidx = 1 /* zero entry is special */; symidx < ef->ddbsymcnt; symidx++) { sym = ef->ddbsymtab + symidx; if (sym->st_shndx != SHN_UNDEF) continue; sym_name = ef->ddbstrtab + sym->st_name; if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) { start = 1; linkset_name = sym_name + sizeof(startn) - 1; } else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) { start = 0; linkset_name = sym_name + sizeof(stopn) - 1; } else continue; for (i = 0; i < ef->nprogtab; i++) { if (strcmp(ef->progtab[i].name, linkset_name) == 0) { startp = (Elf_Addr)ef->progtab[i].addr; stopp = (Elf_Addr)(startp + ef->progtab[i].size); break; } } if (i == ef->nprogtab) continue; sym->st_value = start ? startp : stopp; sym->st_shndx = i; } } static int link_elf_reloc_local(linker_file_t lf, bool ifuncs) { elf_file_t ef = (elf_file_t)lf; const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const Elf_Sym *sym; Elf_Addr base; int i; Elf_Size symidx; link_elf_fix_link_set(ef); /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rel->r_info)) == ifuncs) elf_reloc_local(lf, base, rel, ELF_RELOC_REL, elf_obj_lookup); } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rela->r_info)) == ifuncs) elf_reloc_local(lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup); } } return (0); } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: stable/11/sys/kern/posix4_mib.c =================================================================== --- stable/11/sys/kern/posix4_mib.c (revision 359651) +++ stable/11/sys/kern/posix4_mib.c (revision 359652) @@ -1,175 +1,175 @@ /*- * Copyright (c) 1998 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static int facility[CTL_P1003_1B_MAXID - 1]; static int facility_initialized[CTL_P1003_1B_MAXID - 1]; static int p31b_sysctl_proc(SYSCTL_HANDLER_ARGS); /* OID_AUTO isn't working with sysconf(3). I guess I'd have to * modify it to do a lookup by name from the index. * For now I've left it a top-level sysctl. */ #if 1 SYSCTL_DECL(_p1003_1b); #define P1B_SYSCTL(num, name) \ SYSCTL_INT(_p1003_1b, num, name, CTLFLAG_RD | CTLFLAG_CAPRD, \ facility + num - 1, 0, ""); #define P1B_SYSCTL_RW(num, name) \ SYSCTL_PROC(_p1003_1b, num, name, CTLTYPE_INT | CTLFLAG_RW, NULL, num, \ p31b_sysctl_proc, "I", ""); #else SYSCTL_DECL(_kern_p1003_1b); #define P1B_SYSCTL(num, name) \ SYSCTL_INT(_kern_p1003_1b, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_CAPRD, \ facility + num - 1, 0, ""); #define P1B_SYSCTL_RW(num, name) \ SYSCTL_PROC(_p1003_1b, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW, NULL, \ num, p31b_sysctl_proc, "I", ""); SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B"); #endif P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io); P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files); P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock); P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range); P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection); P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing); P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io); P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling); P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals); P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores); P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync); P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects); P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io); P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers); P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max); P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max); P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max); P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max); P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize); P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max); P1B_SYSCTL_RW(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max); P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max); P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max); P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max); #define P31B_VALID(num) ((num) >= 1 && (num) < CTL_P1003_1B_MAXID) static int p31b_sysctl_proc(SYSCTL_HANDLER_ARGS) { int error, num, val; num = arg2; if (!P31B_VALID(num)) return (EINVAL); val = facility_initialized[num - 1] ? facility[num - 1] : 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error == 0 && req->newptr != NULL && facility_initialized[num - 1]) facility[num - 1] = val; return (error); } /* p31b_setcfg: Set the configuration */ void p31b_setcfg(int num, int value) { if (P31B_VALID(num)) { facility[num - 1] = value; facility_initialized[num - 1] = 1; } } void p31b_unsetcfg(int num) { facility[num - 1] = 0; facility_initialized[num - 1] = 0; } int p31b_getcfg(int num) { if (P31B_VALID(num)) return (facility[num - 1]); return (0); } int p31b_iscfg(int num) { if (P31B_VALID(num)) return (facility_initialized[num - 1]); return (0); } /* * Turn on indications for standard (non-configurable) kernel features. */ static void p31b_set_standard(void *dummy) { p31b_setcfg(CTL_P1003_1B_FSYNC, 200112L); p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 200112L); p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 200112L); p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); } SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard, - 0); + NULL); Index: stable/11/sys/kern/subr_pcpu.c =================================================================== --- stable/11/sys/kern/subr_pcpu.c (revision 359651) +++ stable/11/sys/kern/subr_pcpu.c (revision 359652) @@ -1,419 +1,419 @@ /*- * Copyright (c) 2001 Wind River Systems, Inc. * All rights reserved. * Written by: John Baldwin * * Copyright (c) 2009 Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module provides MI support for per-cpu data. * * Each architecture determines the mapping of logical CPU IDs to physical * CPUs. The requirements of this mapping are as follows: * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. * - The mapping is not required to be dense. That is, there may be * gaps in the mappings. * - The platform sets the value of MAXCPU in . * - It is suggested, but not required, that in the non-SMP case, the * platform define MAXCPU to be 1 and define the logical ID of the * sole CPU as 0. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accouting."); struct dpcpu_free { uintptr_t df_start; int df_len; TAILQ_ENTRY(dpcpu_free) df_link; }; static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]); static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head); static struct sx dpcpu_lock; uintptr_t dpcpu_off[MAXCPU]; struct pcpu *cpuid_to_pcpu[MAXCPU]; struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead); /* * Initialize the MI portions of a struct pcpu. */ void pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { bzero(pcpu, size); KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue; pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue; } void dpcpu_init(void *dpcpu, int cpuid) { struct pcpu *pcpu; pcpu = pcpu_find(cpuid); pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START; /* * Initialize defaults from our linker section. */ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES); /* * Place it in the global pcpu offset array. */ dpcpu_off[cpuid] = pcpu->pc_dynamic; } static void dpcpu_startup(void *dummy __unused) { struct dpcpu_free *df; df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); df->df_start = (uintptr_t)&DPCPU_NAME(modspace); df->df_len = DPCPU_MODMIN; TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link); sx_init(&dpcpu_lock, "dpcpu alloc lock"); } -SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, 0); +SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, NULL); /* * UMA_PCPU_ZONE zones, that are available for all kernel * consumers. Right now 64 bit zone is used for counter(9) * and pointer zone is used by flowtable. */ uma_zone_t pcpu_zone_64; uma_zone_t pcpu_zone_ptr; static void pcpu_zones_startup(void) { pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); if (sizeof(uint64_t) == sizeof(void *)) pcpu_zone_ptr = pcpu_zone_64; else pcpu_zone_ptr = uma_zcreate("ptr pcpu", sizeof(void *), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_KMEM, SI_ORDER_ANY, pcpu_zones_startup, NULL); /* * First-fit extent based allocator for allocating space in the per-cpu * region reserved for modules. This is only intended for use by the * kernel linkers to place module linker sets. */ void * dpcpu_alloc(int size) { struct dpcpu_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_len < size) continue; if (df->df_len == size) { s = (void *)df->df_start; TAILQ_REMOVE(&dpcpu_head, df, df_link); free(df, M_PCPU); break; } s = (void *)df->df_start; df->df_len -= size; df->df_start = df->df_start + size; break; } sx_xunlock(&dpcpu_lock); return (s); } /* * Free dynamic per-cpu space at module unload time. */ void dpcpu_free(void *s, int size) { struct dpcpu_free *df; struct dpcpu_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)s; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_start > end) break; /* * If we expand at the end of an entry we may have to * merge it with the one following it as well. */ if (df->df_start + df->df_len == start) { df->df_len += size; dn = TAILQ_NEXT(df, df_link); if (df->df_start + df->df_len == dn->df_start) { df->df_len += dn->df_len; TAILQ_REMOVE(&dpcpu_head, dn, df_link); free(dn, M_PCPU); } sx_xunlock(&dpcpu_lock); return; } if (df->df_start == end) { df->df_start = start; df->df_len += size; sx_xunlock(&dpcpu_lock); return; } } dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); dn->df_start = start; dn->df_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, df_link); else TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link); sx_xunlock(&dpcpu_lock); } /* * Initialize the per-cpu storage from an updated linker-set region. */ void dpcpu_copy(void *s, int size) { #ifdef SMP uintptr_t dpcpu; int i; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; memcpy((void *)(dpcpu + (uintptr_t)s), s, size); } #else memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size); #endif } /* * Destroy a struct pcpu. */ void pcpu_destroy(struct pcpu *pcpu) { STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; dpcpu_off[pcpu->pc_cpuid] = 0; } /* * Locate a struct pcpu by cpu id. */ struct pcpu * pcpu_find(u_int cpuid) { return (cpuid_to_pcpu[cpuid]); } int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int64_t count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int64_t *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; long count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(long *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } #ifdef DDB DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off) { int id; CPU_FOREACH(id) { db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n", id, (uintmax_t)dpcpu_off[id], (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START)); } } static void show_pcpu(struct pcpu *pc) { struct thread *td; db_printf("cpuid = %d\n", pc->pc_cpuid); db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic); db_printf("curthread = "); td = pc->pc_curthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("curpcb = %p\n", pc->pc_curpcb); db_printf("fpcurthread = "); td = pc->pc_fpcurthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("idlethread = "); td = pc->pc_idlethread; if (td != NULL) db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); else db_printf("none\n"); db_show_mdpcpu(pc); #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); #endif #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks, db_printf); #endif } DB_SHOW_COMMAND(pcpu, db_show_pcpu) { struct pcpu *pc; int id; if (have_addr) id = ((addr >> 4) % 16) * 10 + (addr % 16); else id = PCPU_GET(cpuid); pc = pcpu_find(id); if (pc == NULL) { db_printf("CPU %d not found\n", id); return; } show_pcpu(pc); } DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all) { struct pcpu *pc; int id; db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid)); for (id = 0; id <= mp_maxid; id++) { pc = pcpu_find(id); if (pc != NULL) { show_pcpu(pc); db_printf("\n"); } } } DB_SHOW_ALIAS(allpcpu, db_show_cpu_all); #endif Index: stable/11/sys/net/route.c =================================================================== --- stable/11/sys/net/route.c (revision 359651) +++ stable/11/sys/net/route.c (revision 359652) @@ -1,2328 +1,2328 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } u_int rt_tables_get_gen(int table, int fam) { struct rib_head *rnh; rnh = *rt_tables_get_rnh_ptr(table, fam); KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", __func__, table, fam)); return (rnh->rnh_gen); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } -SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ rw_init(&rh->rib_lock, "rib head lock"); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ rw_destroy(&rh->rib_lock); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); return (newrt); } else if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); /* * Either we hit the root or could not find any match, * which basically means: "cannot get there from here". */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt; rt = rtalloc1_fib(gateway, 0, flags, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; else ifa_free(oifa); } return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * if @flags contains NHR_REF, refcouting is performed on rt_ifp. * All references can be released later by calling rib_free_info() * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes_af(struct ifnet *ifp, int af) { KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", __func__, af, AF_MAX)); rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); } void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, info->rti_filterdata) == 0) { /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif #ifdef FLOWTABLE static struct rtentry * rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info) { #if defined(INET6) || defined(INET) struct radix_node *rn; #endif struct rtentry *rt0; rt0 = NULL; /* "flow-table" only supports IPv6 and IPv4 at the moment. */ switch (dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET6) || defined(INET) rn = rnh->rnh_matchaddr(dst, &rnh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; u_char *m, *n; int len; /* * compare mask to see if the new route is * more specific than the existing one */ rt0 = RNTORT(rn); RT_LOCK(rt0); RT_ADDREF(rt0); RT_UNLOCK(rt0); /* * A host route is already present, so * leave the flow-table entries as is. */ if (rt0->rt_flags & RTF_HOST) { RTFREE(rt0); rt0 = NULL; } else if (!(flags & RTF_HOST) && netmask) { mask = rt_mask(rt0); len = mask->sa_len; m = (u_char *)mask; n = (u_char *)netmask; while (len-- > 0) { if (*n != *m) break; n++; m++; } if (len == 0 || (*n < *m)) { RTFREE(rt0); rt0 = NULL; } } } #endif/* INET6 || INET */ } return (rt0); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; #ifdef FLOWTABLE struct rtentry *rt0; #endif struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { if (dst->sa_len > sizeof(mdst)) return (EINVAL); rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(ifa); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(ifa); uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif #ifdef FLOWTABLE rt0 = rt_flowtable_check_route(rnh, info); #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif return (EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { /* * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags * to avoid rlock in the ifa_ifwithroute(). */ info->rti_flags |= RTF_RNH_LOCKED; error = rt_getifa_fib(info, fibnum); info->rti_flags &= ~RTF_RNH_LOCKED; if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: stable/11/sys/net/vnet.c =================================================================== --- stable/11/sys/net/vnet.c (revision 359651) +++ stable/11/sys/net/vnet.c (revision 359652) @@ -1,804 +1,804 @@ /*- * Copyright (c) 2004-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include /*- * This file implements core functions for virtual network stacks: * * - Virtual network stack management functions. * * - Virtual network stack memory allocator, which virtualizes global * variables in the network stack * * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems * to register startup/shutdown events to be run for each virtual network * stack instance. */ FEATURE(vimage, "VIMAGE kernel virtualization"); static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* * The virtual network stack list has two read-write locks, one sleepable and * the other not, so that the list can be stablized and walked in a variety * of network stack contexts. Both must be acquired exclusively to modify * the list, but a read lock of either lock is sufficient to walk the list. */ struct rwlock vnet_rwlock; struct sx vnet_sxlock; #define VNET_LIST_WLOCK() do { \ sx_xlock(&vnet_sxlock); \ rw_wlock(&vnet_rwlock); \ } while (0) #define VNET_LIST_WUNLOCK() do { \ rw_wunlock(&vnet_rwlock); \ sx_xunlock(&vnet_sxlock); \ } while (0) struct vnet_list_head vnet_head; struct vnet *vnet0; /* * The virtual network stack allocator provides storage for virtualized * global variables. These variables are defined/declared using the * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet' * linker set. The details of the implementation are somewhat subtle, but * allow the majority of most network subsystems to maintain * virtualization-agnostic. * * The virtual network stack allocator handles variables in the base kernel * vs. modules in similar but different ways. In both cases, virtualized * global variables are marked as such by being declared to be part of the * vnet linker set. These "master" copies of global variables serve two * functions: * * (1) They contain static initialization or "default" values for global * variables which will be propagated to each virtual network stack * instance when created. As with normal global variables, they default * to zero-filled. * * (2) They act as unique global names by which the variable can be referred * to, regardless of network stack instance. The single global symbol * will be used to calculate the location of a per-virtual instance * variable at run-time. * * Each virtual network stack instance has a complete copy of each * virtualized global variable, stored in a malloc'd block of memory * referred to by vnet->vnet_data_mem. Critical to the design is that each * per-instance memory block is laid out identically to the master block so * that the offset of each global variable is the same across all blocks. To * optimize run-time access, a precalculated 'base' address, * vnet->vnet_data_base, is stored in each vnet, and is the amount that can * be added to the address of a 'master' instance of a variable to get to the * per-vnet instance. * * Virtualized global variables are handled in a similar manner, but as each * module has its own 'set_vnet' linker set, and we want to keep all * virtualized globals togther, we reserve space in the kernel's linker set * for potential module variables using a per-vnet character array, * 'modspace'. The virtual network stack allocator maintains a free list to * track what space in the array is free (all, initially) and as modules are * linked, allocates portions of the space to specific globals. The kernel * module linker queries the virtual network stack allocator and will * bind references of the global to the location during linking. It also * calls into the virtual network stack allocator, once the memory is * initialized, in order to propagate the new static initializations to all * existing virtual network stack instances so that the soon-to-be executing * module will find every network stack instance with proper default values. */ /* * Number of bytes of data in the 'set_vnet' linker set, and hence the total * size of all kernel virtualized global variables, and the malloc(9) type * that will be used to allocate it. */ #define VNET_BYTES (VNET_STOP - VNET_START) static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data"); /* * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of * global variables across all loaded modules. As this actually sizes an * array declared as a virtualized global variable in the kernel itself, and * we want the virtualized global variable space to be page-sized, we may * have more space than that in practice. */ #define VNET_MODMIN (8 * PAGE_SIZE) #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) /* * Space to store virtualized global variables from loadable kernel modules, * and the free list to manage it. */ static VNET_DEFINE(char, modspace[VNET_MODMIN]); /* * Global lists of subsystem constructor and destructors for vnets. They are * registered via VNET_SYSINIT() and VNET_SYSUNINIT(). Both lists are * protected by the vnet_sysinit_sxlock global lock. */ static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors = TAILQ_HEAD_INITIALIZER(vnet_constructors); static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors = TAILQ_HEAD_INITIALIZER(vnet_destructors); struct sx vnet_sysinit_sxlock; #define VNET_SYSINIT_WLOCK() sx_xlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_WUNLOCK() sx_xunlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RLOCK() sx_slock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RUNLOCK() sx_sunlock(&vnet_sysinit_sxlock); struct vnet_data_free { uintptr_t vnd_start; int vnd_len; TAILQ_ENTRY(vnet_data_free) vnd_link; }; static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting"); static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head = TAILQ_HEAD_INITIALIZER(vnet_data_free_head); static struct sx vnet_data_free_lock; SDT_PROVIDER_DEFINE(vnet); SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, "int", "struct vnet *"); SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, "int"); #ifdef DDB static void db_show_vnet_print_vs(struct vnet_sysinit *, int); #endif /* * Allocate a virtual network stack. */ struct vnet * vnet_alloc(void) { struct vnet *vnet; SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__); vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; vnet->vnet_state = 0; SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet); /* * Allocate storage for virtualized global variables and copy in * initial values form our 'master' copy. */ vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK); memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES); /* * All use of vnet-specific data will immediately subtract VNET_START * from the base memory pointer, so pre-calculate that now to avoid * it on each use. */ vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START; /* Initialize / attach vnet module instances. */ CURVNET_SET_QUIET(vnet); vnet_sysinit(); CURVNET_RESTORE(); VNET_LIST_WLOCK(); LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); VNET_LIST_WUNLOCK(); SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet); return (vnet); } /* * Destroy a virtual network stack. */ void vnet_destroy(struct vnet *vnet) { SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet); KASSERT(vnet->vnet_sockcnt == 0, ("%s: vnet still has sockets", __func__)); VNET_LIST_WLOCK(); LIST_REMOVE(vnet, vnet_le); VNET_LIST_WUNLOCK(); CURVNET_SET_QUIET(vnet); vnet_sysuninit(); CURVNET_RESTORE(); /* * Release storage for the virtual network stack instance. */ free(vnet->vnet_data_mem, M_VNET_DATA); vnet->vnet_data_mem = NULL; vnet->vnet_data_base = 0; vnet->vnet_magic_n = 0xdeadbeef; free(vnet, M_VNET); SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__); } /* * Boot time initialization and allocation of virtual network stacks. */ static void vnet_init_prelink(void *arg __unused) { rw_init(&vnet_rwlock, "vnet_rwlock"); sx_init(&vnet_sxlock, "vnet_sxlock"); sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void vnet0_init(void *arg __unused) { /* Warn people before take off - in case we crash early. */ printf("WARNING: VIMAGE (virtualized network stack) is a highly " "experimental feature.\n"); /* * We MUST clear curvnet in vi_init_done() before going SMP, * otherwise CURVNET_SET() macros would scream about unnecessary * curvnet recursions. */ curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); } SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); static void vnet_init_done(void *unused __unused) { curvnet = NULL; } SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_init_done, NULL); /* * Once on boot, initialize the modspace freelist to entirely cover modspace. */ static void vnet_data_startup(void *dummy __unused) { struct vnet_data_free *df; df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); df->vnd_start = (uintptr_t)&VNET_NAME(modspace); df->vnd_len = VNET_MODMIN; TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link); sx_init(&vnet_data_free_lock, "vnet_data alloc lock"); } -SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0); +SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, NULL); /* Dummy VNET_SYSINIT to make sure we always reach the final end state. */ static void vnet_sysinit_done(void *unused __unused) { return; } VNET_SYSINIT(vnet_sysinit_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_sysinit_done, NULL); /* * When a module is loaded and requires storage for a virtualized global * variable, allocate space from the modspace free list. This interface * should be used only by the kernel linker. */ void * vnet_data_alloc(int size) { struct vnet_data_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_len < size) continue; if (df->vnd_len == size) { s = (void *)df->vnd_start; TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link); free(df, M_VNET_DATA_FREE); break; } s = (void *)df->vnd_start; df->vnd_len -= size; df->vnd_start = df->vnd_start + size; break; } sx_xunlock(&vnet_data_free_lock); return (s); } /* * Free space for a virtualized global variable on module unload. */ void vnet_data_free(void *start_arg, int size) { struct vnet_data_free *df; struct vnet_data_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)start_arg; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_start > end) break; /* * If we expand at the end of an entry we may have to merge * it with the one following it as well. */ if (df->vnd_start + df->vnd_len == start) { df->vnd_len += size; dn = TAILQ_NEXT(df, vnd_link); if (df->vnd_start + df->vnd_len == dn->vnd_start) { df->vnd_len += dn->vnd_len; TAILQ_REMOVE(&vnet_data_free_head, dn, vnd_link); free(dn, M_VNET_DATA_FREE); } sx_xunlock(&vnet_data_free_lock); return; } if (df->vnd_start == end) { df->vnd_start = start; df->vnd_len += size; sx_xunlock(&vnet_data_free_lock); return; } } dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); dn->vnd_start = start; dn->vnd_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, vnd_link); else TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link); sx_xunlock(&vnet_data_free_lock); } /* * When a new virtualized global variable has been allocated, propagate its * initial value to each already-allocated virtual network stack instance. */ void vnet_data_copy(void *start, int size) { struct vnet *vnet; VNET_LIST_RLOCK(); LIST_FOREACH(vnet, &vnet_head, vnet_le) memcpy((void *)((uintptr_t)vnet->vnet_data_base + (uintptr_t)start), start, size); VNET_LIST_RUNLOCK(); } /* * Support for special SYSINIT handlers registered via VNET_SYSINIT() * and VNET_SYSUNINIT(). */ void vnet_register_sysinit(void *arg) { struct vnet_sysinit *vs, *vs2; struct vnet *vnet; vs = arg; KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early")); /* Add the constructor to the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_constructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_constructors, vs, link); /* * Invoke the constructor on all the existing vnets when it is * registered. */ VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysinit(void *arg) { struct vnet_sysinit *vs; vs = arg; /* Remove the constructor from the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_REMOVE(&vnet_constructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_register_sysuninit(void *arg) { struct vnet_sysinit *vs, *vs2; vs = arg; /* Add the destructor to the global list of vnet destructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_destructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysuninit(void *arg) { struct vnet_sysinit *vs; struct vnet *vnet; vs = arg; /* * Invoke the destructor on all the existing vnets when it is * deregistered. */ VNET_SYSINIT_WLOCK(); VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } /* Remove the destructor from the global list of vnet destructors. */ TAILQ_REMOVE(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } /* * Invoke all registered vnet constructors on the current vnet. Used during * vnet construction. The caller is responsible for ensuring the new vnet is * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysinit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH(vs, &vnet_constructors, link) { curvnet->vnet_state = vs->subsystem; vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * Invoke all registered vnet destructors on the current vnet. Used during * vnet destruction. The caller is responsible for ensuring the dying vnet * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysuninit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { curvnet->vnet_state = vs->subsystem; vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * EVENTHANDLER(9) extensions. */ /* * Invoke the eventhandler function originally registered with the possibly * registered argument for all virtual network stack instances. * * This iterator can only be used for eventhandlers that do not take any * additional arguments, as we do ignore the variadic arguments from the * EVENTHANDLER_INVOKE() call. */ void vnet_global_eventhandler_iterator_func(void *arg, ...) { VNET_ITERATOR_DECL(vnet_iter); struct eventhandler_entry_vimage *v_ee; /* * There is a bug here in that we should actually cast things to * (struct eventhandler_entry_ ## name *) but that's not easily * possible in here so just re-using the variadic version we * defined for the generic vimage case. */ v_ee = arg; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } #ifdef VNET_DEBUG struct vnet_recursion { SLIST_ENTRY(vnet_recursion) vnr_le; const char *prev_fn; const char *where_fn; int where_line; struct vnet *old_vnet; struct vnet *new_vnet; }; static SLIST_HEAD(, vnet_recursion) vnet_recursions = SLIST_HEAD_INITIALIZER(vnet_recursions); static void vnet_print_recursion(struct vnet_recursion *vnr, int brief) { if (!brief) printf("CURVNET_SET() recursion in "); printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, vnr->prev_fn); if (brief) printf(", "); else printf("\n "); printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet); } void vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line) { struct vnet_recursion *vnr; /* Skip already logged recursion events. */ SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) if (vnr->prev_fn == old_fn && vnr->where_fn == curthread->td_vnet_lpush && vnr->where_line == line && (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet)) return; vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO); if (vnr == NULL) panic("%s: malloc failed", __func__); vnr->prev_fn = old_fn; vnr->where_fn = curthread->td_vnet_lpush; vnr->where_line = line; vnr->old_vnet = old_vnet; vnr->new_vnet = curvnet; SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le); vnet_print_recursion(vnr, 0); #ifdef KDB kdb_backtrace(); #endif } #endif /* VNET_DEBUG */ /* * DDB(4). */ #ifdef DDB static void db_vnet_print(struct vnet *vnet) { db_printf("vnet = %p\n", vnet); db_printf(" vnet_magic_n = %#08x (%s, orig %#08x)\n", vnet->vnet_magic_n, (vnet->vnet_magic_n == VNET_MAGIC_N) ? "ok" : "mismatch", VNET_MAGIC_N); db_printf(" vnet_ifcnt = %u\n", vnet->vnet_ifcnt); db_printf(" vnet_sockcnt = %u\n", vnet->vnet_sockcnt); db_printf(" vnet_data_mem = %p\n", vnet->vnet_data_mem); db_printf(" vnet_data_base = %#jx\n", (uintmax_t)vnet->vnet_data_base); db_printf(" vnet_state = %#08x\n", vnet->vnet_state); db_printf("\n"); } DB_SHOW_ALL_COMMAND(vnets, db_show_all_vnets) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { db_vnet_print(vnet_iter); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet, db_show_vnet) { if (!have_addr) { db_printf("usage: show vnet \n"); return; } db_vnet_print((struct vnet *)addr); } static void db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb) { const char *vsname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (vs == NULL) { xprint("%s: no vnet_sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); db_symbol_values(sym, &vsname, NULL); sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); xprint(" %#08x %#08x\n", vs->subsystem, vs->order); xprint(" %p(%s)(%p)\n", vs->func, (funcname != NULL) ? funcname : "", vs->arg); #undef xprint } DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit) { struct vnet_sysinit *vs; db_printf("VNET_SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH(vs, &vnet_constructors, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit) { struct vnet_sysinit *vs; db_printf("VNET_SYSUNINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } #ifdef VNET_DEBUG DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs) { struct vnet_recursion *vnr; SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) vnet_print_recursion(vnr, 1); } #endif #endif /* DDB */ Index: stable/11 =================================================================== --- stable/11 (revision 359651) +++ stable/11 (revision 359652) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r333806